From 105923088e189ee868c57f46c7e21173c5d74738 Mon Sep 17 00:00:00 2001
From: "ZhengYu, Xu"
Date: Wed, 21 Dec 2022 14:17:46 +0800
Subject: [PATCH 001/231] Initial commit

---
 LICENSE | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000000..f3a65abd261
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+BSD 2-Clause License
+
+Copyright (c) 2022, ZhengYu, Xu
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

From 516b35fd91ca65821b869124942c89d0ed50ba0d Mon Sep 17 00:00:00 2001
From: "ZhengYu, Xu"
Date: Wed, 21 Dec 2022 14:49:35 +0800
Subject: [PATCH 002/231] init project

---
 .gitignore | 137 ++++++++++++++++++++++++++++++++++++++++
 .pre-commit-config.yaml | 36 +++++++++++
 README.md | 0
 pyproject.toml | 52 +++++++++++++++
 4 files changed, 225 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 README.md
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000000..f47c62c6efb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,137 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# PyCharm project settings
+.idea/
+
+# VSCode project settings
+.vscode/
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+/poetry.lock
+.idea/**/*
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000000..dd6ee48335e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,36 @@
+minimum_pre_commit_version: 2.15.0
+ci:
+  autofix_prs: false
+repos:
+  - repo: https://github.com/python/black
+    rev: 22.12.0
+    hooks:
+      - id: black
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.11.2
+    hooks:
+      - id: isort
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.3.1
+    hooks:
+      - id: pyupgrade
+        types_or: [python, pyi]
+        types: [text] # overwrite types: [python]
+        args: [--py38-plus]
+  - repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+        name: flake8 (py)
+        types: [python]
+        args: [--ignore=E501 F841]
+      - id: flake8
+        name: flake8 (pyi)
+        additional_dependencies:
+          - flake8-pyi==22.11.0
+        types: [pyi]
+        args: [
+          --ignore=E301 E302 E305 E402 E501 E701 E704 F401 F811 W503 Y019 Y027 Y034 Y037 Y041 Y042,
+          # TypeVars in private files are already private
+          --per-file-ignores=_*.pyi:Y001,
+        ]
diff --git a/README.md b/README.md
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000000..47eb1ecca26
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,52 @@
+[tool.poetry]
+name = "pyarrow-stubs"
+version = "10.0.1.0a1"
+description = "Type annotations for pyarrow"
+authors = ["ZhengYu, Xu "]
+license = "BSD-2-Clause"
+readme = "README.md"
+homepage = "https://github.com/zen-xu/pyarrow-stubs"
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "License :: OSI Approved :: BSD License",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 
3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering", +] +packages = [{ include = "pyarrow-stubs" }] + +[tool.poetry.dependencies] +python = "^3.7,<3.11" + +[tool.poetry.dev-dependencies] +black = ">=22.12.0" +isort = ">=5.10.1" +numpy = "1.21.4" +mypy = "^0.991" +pre-commit = ">=2.19.0" +pyarrow = "10.0.1" +pyright = ">=1.1.284" +pytest = ">=7.1.2" +typing-extensions = ">=4.2.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.black] +target-version = ['py37'] + +[tool.isort] +profile = "black" +combine_as_imports = true +force_grid_wrap = 2 +force_sort_within_sections = true From 5738e6eeb08c0bb3b3155220c63997440d7c606e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 22 Dec 2022 12:19:23 +0800 Subject: [PATCH 003/231] complete most of the annotations --- pyarrow-stubs/__init__.pyi | 265 +++ pyarrow-stubs/_compute.pyi | 774 +++++++++ pyarrow-stubs/_compute_docstrings.pyi | 7 + pyarrow-stubs/_csv.pyi | 198 +++ pyarrow-stubs/_dataset.pyi | 398 +++++ pyarrow-stubs/_dataset_orc.pyi | 13 + pyarrow-stubs/_dataset_parquet.pyi | 110 ++ pyarrow-stubs/_exec_plan.pyi | 27 + pyarrow-stubs/_feather.pyi | 20 + pyarrow-stubs/_flight.pyi | 726 ++++++++ pyarrow-stubs/_fs.pyi | 262 +++ pyarrow-stubs/_gcsfs.pyi | 155 ++ pyarrow-stubs/_generated_version.pyi | 5 + pyarrow-stubs/_hdfs.pyi | 19 + pyarrow-stubs/_hdfsio.pyi | 70 + pyarrow-stubs/_json.pyi | 25 + pyarrow-stubs/_orc.pyi | 44 + pyarrow-stubs/_parquet.pyi | 295 ++++ pyarrow-stubs/_parquet_encryption.pyi | 111 ++ pyarrow-stubs/_plasma.pyi | 113 ++ pyarrow-stubs/_s3fs.pyi | 64 + pyarrow-stubs/_substrait.pyi | 16 + pyarrow-stubs/benchmark.pyi | 1 + pyarrow-stubs/cffi.pyi | 4 + pyarrow-stubs/compute.pyi | 130 ++ pyarrow-stubs/csv.pyi | 13 + pyarrow-stubs/cuda.pyi | 12 + pyarrow-stubs/dataset.pyi | 113 ++ pyarrow-stubs/feather.pyi | 67 + pyarrow-stubs/filesystem.pyi | 56 + pyarrow-stubs/flight.pyi | 47 + pyarrow-stubs/fs.pyi | 58 + pyarrow-stubs/hdfs.pyi | 33 + pyarrow-stubs/ipc.pyi | 102 ++ pyarrow-stubs/json.pyi | 5 + pyarrow-stubs/jvm.pyi | 19 + pyarrow-stubs/lib.pyi | 2229 +++++++++++++++++++++++++ pyarrow-stubs/orc.pyi | 99 ++ pyarrow-stubs/pandas_compat.pyi | 65 + pyarrow-stubs/parquet/__init__.pyi | 1 + pyarrow-stubs/parquet/core.pyi | 403 +++++ pyarrow-stubs/parquet/encryption.pyi | 7 + pyarrow-stubs/plasma.pyi | 28 + pyarrow-stubs/py.typed | 0 pyarrow-stubs/serialization.pyi | 18 + pyarrow-stubs/substrait.pyi | 4 + pyarrow-stubs/types.pyi | 53 + pyarrow-stubs/util.pyi | 12 + pyproject.toml | 3 +- 49 files changed, 7297 insertions(+), 2 deletions(-) create mode 100644 pyarrow-stubs/__init__.pyi create mode 100644 pyarrow-stubs/_compute.pyi create mode 100644 pyarrow-stubs/_compute_docstrings.pyi create mode 100644 pyarrow-stubs/_csv.pyi create mode 100644 pyarrow-stubs/_dataset.pyi create mode 100644 pyarrow-stubs/_dataset_orc.pyi create mode 100644 pyarrow-stubs/_dataset_parquet.pyi create mode 100644 pyarrow-stubs/_exec_plan.pyi create mode 100644 pyarrow-stubs/_feather.pyi create mode 100644 pyarrow-stubs/_flight.pyi create mode 100644 pyarrow-stubs/_fs.pyi create mode 100644 pyarrow-stubs/_gcsfs.pyi create mode 100644 pyarrow-stubs/_generated_version.pyi create mode 100644 pyarrow-stubs/_hdfs.pyi create mode 100644 pyarrow-stubs/_hdfsio.pyi create mode 100644 pyarrow-stubs/_json.pyi create mode 100644 pyarrow-stubs/_orc.pyi create mode 100644 pyarrow-stubs/_parquet.pyi create mode 100644 pyarrow-stubs/_parquet_encryption.pyi create mode 100644 
pyarrow-stubs/_plasma.pyi create mode 100644 pyarrow-stubs/_s3fs.pyi create mode 100644 pyarrow-stubs/_substrait.pyi create mode 100644 pyarrow-stubs/benchmark.pyi create mode 100644 pyarrow-stubs/cffi.pyi create mode 100644 pyarrow-stubs/compute.pyi create mode 100644 pyarrow-stubs/csv.pyi create mode 100644 pyarrow-stubs/cuda.pyi create mode 100644 pyarrow-stubs/dataset.pyi create mode 100644 pyarrow-stubs/feather.pyi create mode 100644 pyarrow-stubs/filesystem.pyi create mode 100644 pyarrow-stubs/flight.pyi create mode 100644 pyarrow-stubs/fs.pyi create mode 100644 pyarrow-stubs/hdfs.pyi create mode 100644 pyarrow-stubs/ipc.pyi create mode 100644 pyarrow-stubs/json.pyi create mode 100644 pyarrow-stubs/jvm.pyi create mode 100644 pyarrow-stubs/lib.pyi create mode 100644 pyarrow-stubs/orc.pyi create mode 100644 pyarrow-stubs/pandas_compat.pyi create mode 100644 pyarrow-stubs/parquet/__init__.pyi create mode 100644 pyarrow-stubs/parquet/core.pyi create mode 100644 pyarrow-stubs/parquet/encryption.pyi create mode 100644 pyarrow-stubs/plasma.pyi create mode 100644 pyarrow-stubs/py.typed create mode 100644 pyarrow-stubs/serialization.pyi create mode 100644 pyarrow-stubs/substrait.pyi create mode 100644 pyarrow-stubs/types.pyi create mode 100644 pyarrow-stubs/util.pyi diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi new file mode 100644 index 00000000000..07069877eec --- /dev/null +++ b/pyarrow-stubs/__init__.pyi @@ -0,0 +1,265 @@ +from typing import Any + +from pyarrow._hdfsio import ( + HdfsFile as HdfsFile, + have_libhdfs as have_libhdfs, +) +from pyarrow.ipc import ( + Message as Message, + MessageReader as MessageReader, + MetadataVersion as MetadataVersion, + RecordBatchFileReader as RecordBatchFileReader, + RecordBatchFileWriter as RecordBatchFileWriter, + RecordBatchStreamReader as RecordBatchStreamReader, + RecordBatchStreamWriter as RecordBatchStreamWriter, + deserialize_pandas as deserialize_pandas, + serialize_pandas as serialize_pandas, +) +from pyarrow.lib import ( + NA as NA, + Array as Array, + ArrowCancelled as ArrowCancelled, + ArrowCapacityError as ArrowCapacityError, + ArrowException as ArrowException, + ArrowIndexError as ArrowIndexError, + ArrowInvalid as ArrowInvalid, + ArrowIOError as ArrowIOError, + ArrowKeyError as ArrowKeyError, + ArrowMemoryError as ArrowMemoryError, + ArrowNotImplementedError as ArrowNotImplementedError, + ArrowSerializationError as ArrowSerializationError, + ArrowTypeError as ArrowTypeError, + BaseExtensionType as BaseExtensionType, + BinaryArray as BinaryArray, + BinaryScalar as BinaryScalar, + BooleanArray as BooleanArray, + BooleanScalar as BooleanScalar, + Buffer as Buffer, + BufferedInputStream as BufferedInputStream, + BufferedOutputStream as BufferedOutputStream, + BufferOutputStream as BufferOutputStream, + BufferReader as BufferReader, + BuildInfo as BuildInfo, + ChunkedArray as ChunkedArray, + Codec as Codec, + CompressedInputStream as CompressedInputStream, + CompressedOutputStream as CompressedOutputStream, + DataType as DataType, + Date32Array as Date32Array, + Date32Scalar as Date32Scalar, + Date64Array as Date64Array, + Date64Scalar as Date64Scalar, + Decimal128Array as Decimal128Array, + Decimal128Scalar as Decimal128Scalar, + Decimal128Type as Decimal128Type, + Decimal256Array as Decimal256Array, + Decimal256Scalar as Decimal256Scalar, + Decimal256Type as Decimal256Type, + DenseUnionType as DenseUnionType, + DeserializationCallbackError as DeserializationCallbackError, + DictionaryArray as DictionaryArray, + 
DictionaryMemo as DictionaryMemo, + DictionaryScalar as DictionaryScalar, + DictionaryType as DictionaryType, + DoubleScalar as DoubleScalar, + DurationArray as DurationArray, + DurationScalar as DurationScalar, + DurationType as DurationType, + ExtensionArray as ExtensionArray, + ExtensionScalar as ExtensionScalar, + ExtensionType as ExtensionType, + Field as Field, + FixedSizeBinaryArray as FixedSizeBinaryArray, + FixedSizeBinaryScalar as FixedSizeBinaryScalar, + FixedSizeBinaryType as FixedSizeBinaryType, + FixedSizeBufferWriter as FixedSizeBufferWriter, + FixedSizeListArray as FixedSizeListArray, + FixedSizeListScalar as FixedSizeListScalar, + FixedSizeListType as FixedSizeListType, + FloatingPointArray as FloatingPointArray, + FloatScalar as FloatScalar, + HalfFloatScalar as HalfFloatScalar, + Int8Array as Int8Array, + Int8Scalar as Int8Scalar, + Int16Array as Int16Array, + Int16Scalar as Int16Scalar, + Int32Array as Int32Array, + Int32Scalar as Int32Scalar, + Int64Array as Int64Array, + Int64Scalar as Int64Scalar, + IntegerArray as IntegerArray, + KeyValueMetadata as KeyValueMetadata, + LargeBinaryArray as LargeBinaryArray, + LargeBinaryScalar as LargeBinaryScalar, + LargeListArray as LargeListArray, + LargeListScalar as LargeListScalar, + LargeListType as LargeListType, + LargeStringArray as LargeStringArray, + LargeStringScalar as LargeStringScalar, + ListArray as ListArray, + ListScalar as ListScalar, + ListType as ListType, + LoggingMemoryPool as LoggingMemoryPool, + MapArray as MapArray, + MapScalar as MapScalar, + MapType as MapType, + MemoryMappedFile as MemoryMappedFile, + MemoryPool as MemoryPool, + MockOutputStream as MockOutputStream, + MonthDayNano as MonthDayNano, + MonthDayNanoIntervalArray as MonthDayNanoIntervalArray, + MonthDayNanoIntervalScalar as MonthDayNanoIntervalScalar, + NativeFile as NativeFile, + NullArray as NullArray, + NullScalar as NullScalar, + NumericArray as NumericArray, + OSFile as OSFile, + ProxyMemoryPool as ProxyMemoryPool, + PyExtensionType as PyExtensionType, + PythonFile as PythonFile, + RecordBatch as RecordBatch, + RecordBatchReader as RecordBatchReader, + ResizableBuffer as ResizableBuffer, + RuntimeInfo as RuntimeInfo, + Scalar as Scalar, + Schema as Schema, + SerializationCallbackError as SerializationCallbackError, + SparseCOOTensor as SparseCOOTensor, + SparseCSCMatrix as SparseCSCMatrix, + SparseCSFTensor as SparseCSFTensor, + SparseCSRMatrix as SparseCSRMatrix, + SparseUnionType as SparseUnionType, + StringArray as StringArray, + StringScalar as StringScalar, + StructArray as StructArray, + StructScalar as StructScalar, + StructType as StructType, + Table as Table, + TableGroupBy as TableGroupBy, + Tensor as Tensor, + Time32Array as Time32Array, + Time32Scalar as Time32Scalar, + Time32Type as Time32Type, + Time64Array as Time64Array, + Time64Scalar as Time64Scalar, + Time64Type as Time64Type, + TimestampArray as TimestampArray, + TimestampScalar as TimestampScalar, + TimestampType as TimestampType, + TransformInputStream as TransformInputStream, + UInt8Array as UInt8Array, + UInt8Scalar as UInt8Scalar, + UInt16Array as UInt16Array, + UInt16Scalar as UInt16Scalar, + UInt32Array as UInt32Array, + UInt32Scalar as UInt32Scalar, + UInt64Array as UInt64Array, + UInt64Scalar as UInt64Scalar, + UnionArray as UnionArray, + UnionScalar as UnionScalar, + UnionType as UnionType, + UnknownExtensionType as UnknownExtensionType, + VersionInfo as VersionInfo, + allocate_buffer as allocate_buffer, + array as array, + binary as binary, + bool_ as 
bool_, + chunked_array as chunked_array, + compress as compress, + concat_arrays as concat_arrays, + concat_tables as concat_tables, + cpp_build_info as cpp_build_info, + cpp_version as cpp_version, + cpp_version_info as cpp_version_info, + cpu_count as cpu_count, + create_memory_map as create_memory_map, + date32 as date32, + date64 as date64, + decimal128 as decimal128, + decimal256 as decimal256, + decompress as decompress, + default_memory_pool as default_memory_pool, + dense_union as dense_union, + deserialize as deserialize, + deserialize_components as deserialize_components, + deserialize_from as deserialize_from, + dictionary as dictionary, + duration as duration, + enable_signal_handlers as enable_signal_handlers, + field as field, + float16 as float16, + float32 as float32, + float64 as float64, + foreign_buffer as foreign_buffer, + from_numpy_dtype as from_numpy_dtype, + infer_type as infer_type, + input_stream as input_stream, + int8 as int8, + int16 as int16, + int32 as int32, + int64 as int64, + io_thread_count as io_thread_count, + jemalloc_memory_pool as jemalloc_memory_pool, + jemalloc_set_decay_ms as jemalloc_set_decay_ms, + large_binary as large_binary, + large_list as large_list, + large_string as large_string, + large_utf8 as large_utf8, + list_ as list_, + log_memory_allocations as log_memory_allocations, + logging_memory_pool as logging_memory_pool, + map_ as map_, + memory_map as memory_map, + mimalloc_memory_pool as mimalloc_memory_pool, + month_day_nano_interval as month_day_nano_interval, + null as null, + nulls as nulls, + output_stream as output_stream, + proxy_memory_pool as proxy_memory_pool, + py_buffer as py_buffer, + read_serialized as read_serialized, + record_batch as record_batch, + register_extension_type as register_extension_type, + repeat as repeat, + runtime_info as runtime_info, + scalar as scalar, + schema as schema, + serialize as serialize, + serialize_to as serialize_to, + set_cpu_count as set_cpu_count, + set_io_thread_count as set_io_thread_count, + set_memory_pool as set_memory_pool, + sparse_union as sparse_union, + string as string, + struct as struct, + supported_memory_backends as supported_memory_backends, + system_memory_pool as system_memory_pool, + table as table, + time32 as time32, + time64 as time64, + timestamp as timestamp, + total_allocated_bytes as total_allocated_bytes, + transcoding_input_stream as transcoding_input_stream, + type_for_alias as type_for_alias, + uint8 as uint8, + uint16 as uint16, + uint32 as uint32, + uint64 as uint64, + unify_schemas as unify_schemas, + union as union, + unregister_extension_type as unregister_extension_type, + utf8 as utf8, +) +from pyarrow.serialization import ( + default_serialization_context as default_serialization_context, + register_default_serialization_handlers as register_default_serialization_handlers, + register_torch_serialization_handlers as register_torch_serialization_handlers, +) + +def show_versions() -> None: ... +def show_info() -> None: ... +def __getattr__(name: str) -> Any: ... +def get_include() -> str: ... +def get_libraries() -> tuple[str, str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi new file mode 100644 index 00000000000..7f5f9847d05 --- /dev/null +++ b/pyarrow-stubs/_compute.pyi @@ -0,0 +1,774 @@ +from typing import ( + Any, + ClassVar, + Literal, +) + +import pyarrow.lib + +namedtuple: function + +class ArraySortOptions(_ArraySortOptions): + def __init__( + self, + order: Literal["ascending", "descending"] = ..., + *, + null_placement: Literal["at_start", "at_end"] = ..., + ) -> None: ... + +class ArrowInvalid(ValueError, pyarrow.lib.ArrowException): ... + +class AssumeTimezoneOptions(_AssumeTimezoneOptions): + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = ..., + nonexistent: Literal["raise", "earliest", "latest"] = ..., + ) -> None: ... + +class CastOptions(_CastOptions): + def __init__( + self, + target_type: pyarrow.lib.DataType | None = ..., + *, + allow_int_overflow: bool = ..., + allow_time_truncate: bool = ..., + allow_time_overflow: bool = ..., + allow_decimal_truncate: bool = ..., + allow_float_truncate: bool = ..., + allow_invalid_utf8: bool = ..., + ) -> None: ... + @staticmethod + def safe(target_type: pyarrow.lib.DataType | None = ...) -> CastOptions: ... + @staticmethod + def unsafe(target_type: pyarrow.lib.DataType | None = ...) -> CastOptions: ... + +class CountOptions(_CountOptions): + def __init__( + self, mode: Literal["only_valid", "only_null", "all"] = ... + ) -> None: ... + +class CumulativeSumOptions(_CumulativeSumOptions): + def __init__(self, start: float, *, skip_nulls: bool = ...) -> None: ... + +class DayOfWeekOptions(_DayOfWeekOptions): + def __init__( + self, + *, + count_from_zero: bool = ..., + week_start: Literal[1, 2, 3, 4, 5, 6, 7] = ..., + ) -> None: ... + +class DictionaryEncodeOptions(_DictionaryEncodeOptions): + def __init__(self, null_encoding: Literal["mask", "encode"] = ...) -> None: ... + +class ElementWiseAggregateOptions(_ElementWiseAggregateOptions): + def __init__(self, *, skip_nulls: bool = ...) -> None: ... + +class Expression(pyarrow.lib._Weakrefable): + def __init__(self) -> None: ... + def _call( + self, unicodefunction_name, listarguments, FunctionOptionsoptions=... + ) -> Any: ... + @staticmethod + def _deserialize(buffer: pyarrow.lib.Buffer) -> Expression: ... + @staticmethod + def _field(name_or_idx: str | int) -> Expression: ... + @staticmethod + def _nested_field(self, names: list[str]) -> Expression: ... + def _scalar(self, value: pyarrow.lib.Scalar) -> Any: ... + def cast(self, type=..., safe=..., options=...) -> Any: ... + def equals(self, Expressionother) -> Any: ... + def is_null(self, boolnan_is_null=...) -> Any: ... + def is_valid(self) -> Any: ... + def isin(self, values) -> Any: ... + def __add__(self, other) -> Any: ... + def __and__(self, other) -> Any: ... + def __bool__(self) -> bool: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __invert__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __mul__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __or__(self, other) -> Any: ... + def __radd__(self, other) -> Any: ... + def __rand__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __rmul__(self, other) -> Any: ... + def __ror__(self, other) -> Any: ... + def __rsub__(self, other) -> Any: ... + def __rtruediv__(self, other) -> Any: ... + def __sub__(self, other) -> Any: ... + def __truediv__(self, other) -> Any: ... 
+ +class ExtractRegexOptions(_ExtractRegexOptions): + def __init__(self, pattern) -> Any: ... + +class FilterOptions(_FilterOptions): + def __init__(self, null_selection_behavior=...) -> Any: ... + +class Function(pyarrow.lib._Weakrefable): + _kind_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + _doc: Any + arity: Any + kind: Any + name: Any + num_kernels: Any + def __init__(self, *args, **kwargs) -> None: ... + def call( + self, args, FunctionOptionsoptions=..., MemoryPoolmemory_pool=..., length=... + ) -> Any: ... + def __reduce__(self) -> Any: ... + +class FunctionDoc(tuple): + _asdict: ClassVar[function] = ... + _field_defaults: ClassVar[dict] = ... + _fields: ClassVar[tuple] = ... + _replace: ClassVar[function] = ... + __getnewargs__: ClassVar[function] = ... + __match_args__: ClassVar[tuple] = ... + __slots__: ClassVar[tuple] = ... + arg_names: Any + description: Any + options_class: Any + options_required: Any + summary: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _make(cls, *args, **kwargs) -> Any: ... + +class FunctionOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def deserialize(self, buf) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FunctionRegistry(pyarrow.lib._Weakrefable): + def __init__(self, *args, **kwargs) -> None: ... + def get_function(self, name) -> Any: ... + def list_functions(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class HashAggregateFunction(Function): + __pyx_vtable__: ClassVar[PyCapsule] = ... + kernels: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class HashAggregateKernel(Kernel): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class IndexOptions(_IndexOptions): + def __init__(self, value) -> Any: ... + +class JoinOptions(_JoinOptions): + def __init__(self, null_handling=..., null_replacement=...) -> Any: ... + +class Kernel(pyarrow.lib._Weakrefable): + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class MakeStructOptions(_MakeStructOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class MapLookupOptions(_MapLookupOptions): + def __init__(self, query_key, occurrence) -> Any: ... + +class MatchSubstringOptions(_MatchSubstringOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class MetaFunction(Function): + __pyx_vtable__: ClassVar[PyCapsule] = ... + kernels: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class ModeOptions(_ModeOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class NullOptions(_NullOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class PadOptions(_PadOptions): + def __init__(self, width, padding=...) -> Any: ... + +class PartitionNthOptions(_PartitionNthOptions): + def __init__(self, *args, **kwargs) -> None: ... 
+ +class QuantileOptions(_QuantileOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class RandomOptions(_RandomOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class RankOptions(_RankOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class ReplaceSliceOptions(_ReplaceSliceOptions): + def __init__(self, start, stop, replacement) -> Any: ... + +class ReplaceSubstringOptions(_ReplaceSubstringOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class RoundOptions(_RoundOptions): + def __init__(self, ndigits=..., round_mode=...) -> Any: ... + +class RoundTemporalOptions(_RoundTemporalOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class RoundToMultipleOptions(_RoundToMultipleOptions): + def __init__(self, multiple=..., round_mode=...) -> Any: ... + +class ScalarAggregateFunction(Function): + __pyx_vtable__: ClassVar[PyCapsule] = ... + kernels: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class ScalarAggregateKernel(Kernel): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ScalarAggregateOptions(_ScalarAggregateOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class ScalarFunction(Function): + __pyx_vtable__: ClassVar[PyCapsule] = ... + kernels: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class ScalarKernel(Kernel): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ScalarUdfContext(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + batch_length: Any + memory_pool: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class SelectKOptions(_SelectKOptions): + def __init__(self, k, sort_keys) -> Any: ... + +class SetLookupOptions(_SetLookupOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class SliceOptions(_SliceOptions): + def __init__(self, start, stop=..., step=...) -> Any: ... + +class SortOptions(_SortOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class SplitOptions(_SplitOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class SplitPatternOptions(_SplitPatternOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class StrftimeOptions(_StrftimeOptions): + def __init__(self, format=..., locale=...) -> Any: ... + +class StrptimeOptions(_StrptimeOptions): + def __init__(self, format, unit, error_is_null=...) -> Any: ... + +class StructFieldOptions(_StructFieldOptions): + def __init__(self, indices) -> Any: ... + +class TDigestOptions(_TDigestOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class TakeOptions(_TakeOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class TrimOptions(_TrimOptions): + def __init__(self, characters) -> Any: ... + +class Utf8NormalizeOptions(_Utf8NormalizeOptions): + def __init__(self, form) -> Any: ... + +class VarianceOptions(_VarianceOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class VectorFunction(Function): + __pyx_vtable__: ClassVar[PyCapsule] = ... + kernels: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class VectorKernel(Kernel): + __pyx_vtable__: ClassVar[PyCapsule] = ... 
+ @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class WeekOptions(_WeekOptions): + def __init__(self, *args, **kwargs) -> None: ... + +class _ArraySortOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, order, null_placement) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _AssumeTimezoneOptions(FunctionOptions): + _ambiguous_map: ClassVar[dict] = ... + _nonexistent_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, timezone, ambiguous, nonexistent) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _CastOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + allow_decimal_truncate: Any + allow_float_truncate: Any + allow_int_overflow: Any + allow_invalid_utf8: Any + allow_time_overflow: Any + allow_time_truncate: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options( + self, + DataTypetarget_type, + allow_int_overflow, + allow_time_truncate, + allow_time_overflow, + allow_decimal_truncate, + allow_float_truncate, + allow_invalid_utf8, + ) -> Any: ... + def _set_safe(self) -> Any: ... + def _set_type(self, target_type=...) -> Any: ... + def _set_unsafe(self) -> Any: ... + def is_safe(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _CountOptions(FunctionOptions): + _mode_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, mode) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _CumulativeSumOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, start, skip_nulls) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _DayOfWeekOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, count_from_zero, week_start) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _DictionaryEncodeOptions(FunctionOptions): + _null_encoding_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, null_encoding) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ElementWiseAggregateOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, skip_nulls) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ExtractRegexOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, pattern) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... 
+ +class _FilterOptions(FunctionOptions): + _null_selection_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, null_selection_behavior) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _IndexOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, scalar) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _JoinOptions(FunctionOptions): + _null_handling_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, null_handling, null_replacement) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _MakeStructOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, field_names, field_nullability, field_metadata) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _MapLookupOptions(FunctionOptions): + _occurrence_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, query_key, occurrence) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _MatchSubstringOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, pattern, ignore_case) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ModeOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, n, skip_nulls, min_count) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _NullOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, nan_is_null) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _PadOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, width, padding) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _PartitionNthOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, pivot, null_placement) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _QuantileOptions(FunctionOptions): + _interp_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, quantiles, interp, skip_nulls, min_count) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _RandomOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... 
+ def _set_options(self, initializer) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _RankOptions(FunctionOptions): + _tiebreaker_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, sort_keys, null_placement, tiebreaker) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ReplaceSliceOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, start, stop, replacement) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ReplaceSubstringOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, pattern, replacement, max_replacements) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _RoundOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, ndigits, round_mode) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _RoundTemporalOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options( + self, + multiple, + unit, + week_starts_monday, + ceil_is_strictly_greater, + calendar_based_origin, + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _RoundToMultipleOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, multiple, round_mode) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ScalarAggregateOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, skip_nulls, min_count) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _SelectKOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, k, sort_keys) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _SetLookupOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, value_set, boolskip_nulls) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _SliceOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, start, stop, step) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _SortOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, sort_keys, null_placement) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... 
+ +class _SplitOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, max_splits, reverse) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _SplitPatternOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, pattern, max_splits, reverse) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _StrftimeOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, format, locale) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _StrptimeOptions(FunctionOptions): + _unit_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, format, unit, error_is_null) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _StructFieldOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, indices) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _TDigestOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options( + self, quantiles, delta, buffer_size, skip_nulls, min_count + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _TakeOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, boundscheck) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _TrimOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, characters) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _Utf8NormalizeOptions(FunctionOptions): + _form_map: ClassVar[dict] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, form) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _VarianceOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options(self, ddof, skip_nulls, min_count) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _WeekOptions(FunctionOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_options( + self, week_starts_monday, count_from_zero, first_week_is_fully_in_year + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ordered_dict: + __hash__: ClassVar[None] = ... + def __init__(self, *args, **kwargs) -> None: ... + def clear(self, *args, **kwargs) -> Any: ... + def copy(self) -> ashallowcopyofD: ... 
+ @classmethod + def fromkeys(cls, *args, **kwargs) -> Any: ... + def get(self, *args, **kwargs) -> Any: ... + def items(self, *args, **kwargs) -> Any: ... + def keys(self, *args, **kwargs) -> Any: ... + def pop(self, *args, **kwargs) -> Any: ... + def popitem(self, *args, **kwargs) -> Any: ... + def setdefault(self, *args, **kwargs) -> Any: ... + def update(self, *args, **kwargs) -> Any: ... + def values(self, *args, **kwargs) -> Any: ... + @classmethod + def __class_getitem__(cls, *args, **kwargs) -> Any: ... + def __contains__(self, other) -> Any: ... + def __delitem__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getitem__(self, y) -> Any: ... + def __gt__(self, other) -> Any: ... + def __ior__(self, other) -> Any: ... + def __iter__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __len__(self) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __or__(self, other) -> Any: ... + def __reversed__(self) -> Any: ... + def __ror__(self, other) -> Any: ... + def __setitem__(self, index, object) -> Any: ... + def __sizeof__(self) -> Any: ... + +def __pyx_unpickle_Kernel(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def _deserialize(Bufferbuffer) -> Any: ... +def _get_scalar_udf_context(memory_pool, batch_length) -> Any: ... +def _group_by(args, keys, aggregations) -> Any: ... +def _min_count_doc(*args, **kwargs) -> Any: ... +def _raise_invalid_function_option(*args, **kwargs) -> Any: ... +def _skip_nulls_doc() -> Any: ... +def call_function(name, args, options=..., memory_pool=..., length=...) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def function_registry() -> Any: ... +def get_function(name) -> Any: ... +def list_functions() -> Any: ... +def register_scalar_function( + func, function_name, function_doc, in_types, out_type +) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_compute_docstrings.pyi b/pyarrow-stubs/_compute_docstrings.pyi new file mode 100644 index 00000000000..d7c52c46cb1 --- /dev/null +++ b/pyarrow-stubs/_compute_docstrings.pyi @@ -0,0 +1,7 @@ +from typing import TypedDict + +class _FunctionDocAdditions(TypedDict): + filter: str + mode: str + +function_doc_additions: _FunctionDocAdditions diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi new file mode 100644 index 00000000000..4f05ae2f86a --- /dev/null +++ b/pyarrow-stubs/_csv.pyi @@ -0,0 +1,198 @@ +import collections.abc +from typing import ( + Any, + ClassVar, + overload, +) + +import _abc +import pyarrow.lib + +ISO8601: _ISO8601 +_stringify_path: function +namedtuple: function + +class CSVStreamingReader(pyarrow.lib.RecordBatchReader): + __pyx_vtable__: ClassVar[PyCapsule] = ... + schema: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class CSVWriter(pyarrow.lib._CRecordBatchWriter): + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ConvertOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... 
+ auto_dict_encode: Any + auto_dict_max_cardinality: Any + check_utf8: Any + column_types: Any + decimal_point: Any + false_values: Any + include_columns: Any + include_missing_columns: Any + null_values: Any + quoted_strings_can_be_null: Any + strings_can_be_null: Any + timestamp_parsers: Any + true_values: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, ConvertOptionsother) -> Any: ... + def validate(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getstate__(self) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce_cython__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class InvalidRow(_InvalidRow): + __slots__: ClassVar[tuple] = ... + +class Mapping(collections.abc.Collection): + _abc_impl: ClassVar[_abc._abc_data] = ... + get: ClassVar[function] = ... + items: ClassVar[function] = ... + keys: ClassVar[function] = ... + values: ClassVar[function] = ... + __abstractmethods__: ClassVar[frozenset] = ... + __contains__: ClassVar[function] = ... + __eq__: ClassVar[function] = ... + __getitem__: ClassVar[function] = ... + __hash__: ClassVar[None] = ... + __reversed__: ClassVar[None] = ... + __slots__: ClassVar[tuple] = ... + +class ParseOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + delimiter: Any + double_quote: Any + escape_char: Any + ignore_empty_lines: Any + invalid_row_handler: Any + newlines_in_values: Any + quote_char: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, ParseOptionsother) -> Any: ... + def validate(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getstate__(self) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce_cython__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class ReadOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + autogenerate_column_names: Any + block_size: Any + column_names: Any + encoding: encoding + skip_rows: Any + skip_rows_after_names: Any + use_threads: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, ReadOptionsother) -> Any: ... + def validate(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getstate__(self) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce_cython__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class SignalStopHandler: + stop_token: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _init_signals(self) -> Any: ... + def __enter__(self) -> Any: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class WriteOptions(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... 
+ __slots__: ClassVar[tuple] = ... + batch_size: Any + delimiter: Any + include_header: Any + def __init__(self, *args, **kwargs) -> None: ... + def validate(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ISO8601(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __slots__: ClassVar[tuple] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _InvalidRow(tuple): + _asdict: ClassVar[function] = ... + _field_defaults: ClassVar[dict] = ... + _fields: ClassVar[tuple] = ... + _replace: ClassVar[function] = ... + __getnewargs__: ClassVar[function] = ... + __match_args__: ClassVar[tuple] = ... + __slots__: ClassVar[tuple] = ... + actual_columns: Any + expected_columns: Any + number: Any + text: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _make(cls, *args, **kwargs) -> Any: ... + +def __pyx_unpickle__ISO8601(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def open_csv( + input_file, + read_options=..., + parse_options=..., + convert_options=..., + MemoryPoolmemory_pool=..., +) -> Any: ... +@overload +def read_csv( + input_file, + read_options=..., + parse_options=..., + convert_options=..., + MemoryPoolmemory_pool=..., +) -> Any: ... +@overload +def read_csv(source) -> Any: ... +def tobytes(o) -> Any: ... +def write_csv( + data, output_file, write_options=..., MemoryPoolmemory_pool=... +) -> Any: ... diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi new file mode 100644 index 00000000000..ae6f0664653 --- /dev/null +++ b/pyarrow-stubs/_dataset.pyi @@ -0,0 +1,398 @@ +import importlib._bootstrap +from typing import ( + Any, + ClassVar, + overload, +) + +import pyarrow.lib + +_DEFAULT_BATCH_READAHEAD: int +_DEFAULT_BATCH_SIZE: int +_DEFAULT_FRAGMENT_READAHEAD: int +_dataset_pq: bool +_is_iterable: function +_is_path_like: function +_orc_fileformat: None +_orc_imported: bool +_stringify_path: function + +class ArrowTypeError(TypeError, pyarrow.lib.ArrowException): ... + +class CsvFileFormat(FileFormat): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + _read_options_py: _read_options_py + parse_options: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, CsvFileFormatother) -> Any: ... + def make_write_options(self, **kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + +class CsvFileWriteOptions(FileWriteOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + write_options: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + convert_options: Any + read_options: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, CsvFragmentScanOptionsother) -> Any: ... + def __reduce__(self) -> Any: ... + +class Dataset(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + partition_expression: Any + schema: Any + def __init__(self, *args, **kwargs) -> None: ... + def count_rows(self, **kwargs) -> Any: ... 
+ def get_fragments(self, Expressionfilter=...) -> Any: ... + def head(self, intnum_rows, **kwargs) -> Any: ... + def join( + self, + right_dataset, + keys, + right_keys=..., + join_type=..., + left_suffix=..., + right_suffix=..., + coalesce_keys=..., + use_threads=..., + ) -> Any: ... + def replace_schema(self, Schemaschema) -> Any: ... + @overload + def scanner(self, **kwargs) -> Any: ... + @overload + def scanner(self, columns=...) -> Any: ... + @overload + def scanner(self, columns=...) -> Any: ... + @overload + def scanner(self, filter=...) -> Any: ... + def take(self, indices, **kwargs) -> Any: ... + def to_batches(self, **kwargs) -> Any: ... + def to_table(self, **kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class DatasetFactory(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + root_partition: Any + def __init__(self, *args, **kwargs) -> None: ... + def finish(self, Schemaschema=...) -> Any: ... + def inspect(self) -> Any: ... + def inspect_schemas(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class DirectoryPartitioning(KeyValuePartitioning): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def discover( + self, + field_names=..., + infer_dictionary=..., + max_partition_dictionary_size=..., + schema=..., + segment_encoding=..., + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FeatherFileFormat(IpcFileFormat): + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_extname: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class FileFormat(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_extname: Any + default_fragment_scan_options: Any + def __init__(self, *args, **kwargs) -> None: ... + def inspect(self, file, filesystem=...) -> Any: ... + def make_fragment( + self, file, filesystem=..., Expressionpartition_expression=... + ) -> Any: ... + def make_write_options(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileFragment(Fragment): + __pyx_vtable__: ClassVar[PyCapsule] = ... + buffer: Any + filesystem: Any + format: Any + path: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def open(self) -> Any: ... + def __reduce__(self) -> Any: ... + +class FileSystemDataset(Dataset): + __pyx_vtable__: ClassVar[PyCapsule] = ... + files: Any + filesystem: Any + format: Any + partitioning: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def from_paths( + cls, + typecls, + paths, + schema=..., + format=..., + filesystem=..., + partitions=..., + root_partition=..., + ) -> Any: ... + def __reduce__(self) -> Any: ... + +class FileSystemDatasetFactory(DatasetFactory): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileSystemFactoryOptions(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... 
+ exclude_invalid_files: Any + partition_base_dir: Any + partitioning: Any + partitioning_factory: Any + selector_ignore_prefixes: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileWriteOptions(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + format: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FilenamePartitioning(KeyValuePartitioning): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def discover( + self, field_names=..., infer_dictionary=..., schema=..., segment_encoding=... + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class Fragment(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + partition_expression: Any + physical_schema: Any + def __init__(self, *args, **kwargs) -> None: ... + def count_rows(self, **kwargs) -> Any: ... + def head(self, intnum_rows, **kwargs) -> Any: ... + def scanner(self, Schemaschema=..., **kwargs) -> Any: ... + def take(self, indices, **kwargs) -> Any: ... + def to_batches(self, Schemaschema=..., **kwargs) -> Any: ... + def to_table(self, Schemaschema=..., **kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FragmentScanOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + type_name: Any + def __init__(self, *args, **kwargs) -> None: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class HivePartitioning(KeyValuePartitioning): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def discover( + self, + infer_dictionary=..., + max_partition_dictionary_size=..., + null_fallback=..., + schema=..., + segment_encoding=..., + ) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class InMemoryDataset(Dataset): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class IpcFileFormat(FileFormat): + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_extname: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, IpcFileFormatother) -> Any: ... + def __reduce__(self) -> Any: ... + +class IpcFileWriteOptions(FileWriteOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class KeyValuePartitioning(Partitioning): + __pyx_vtable__: ClassVar[PyCapsule] = ... + dictionaries: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class Partitioning(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + schema: Any + def __init__(self, *args, **kwargs) -> None: ... + def parse(self, path) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... 
+ +class PartitioningFactory(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + type_name: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class RecordBatchIterator(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __iter__(self) -> Any: ... + def __next__(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class Scanner(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + dataset_schema: Any + projected_schema: Any + def __init__(self, *args, **kwargs) -> None: ... + def count_rows(self) -> Any: ... + def from_batches( + self, + source, + Schemaschema=..., + booluse_threads=..., + use_async=..., + MemoryPoolmemory_pool=..., + columns=..., + Expressionfilter=..., + intbatch_size=..., + FragmentScanOptionsfragment_scan_options=..., + ) -> Any: ... + def from_dataset( + self, + Datasetdataset, + booluse_threads=..., + use_async=..., + MemoryPoolmemory_pool=..., + columns=..., + Expressionfilter=..., + intbatch_size=..., + intbatch_readahead=..., + intfragment_readahead=..., + FragmentScanOptionsfragment_scan_options=..., + ) -> Any: ... + def from_fragment( + self, + Fragmentfragment, + Schemaschema=..., + booluse_threads=..., + use_async=..., + MemoryPoolmemory_pool=..., + columns=..., + Expressionfilter=..., + intbatch_size=..., + intbatch_readahead=..., + FragmentScanOptionsfragment_scan_options=..., + ) -> Any: ... + def head(self, intnum_rows) -> Any: ... + def scan_batches(self) -> Any: ... + @overload + def take(self, indices) -> Any: ... + @overload + def take(self, indices) -> Any: ... + def to_batches(self) -> Any: ... + def to_reader(self) -> Any: ... + def to_table(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class TaggedRecordBatch(importlib._bootstrap.TaggedRecordBatch): ... + +class TaggedRecordBatchIterator(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __iter__(self) -> Any: ... + def __next__(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class UnionDataset(Dataset): + __pyx_vtable__: ClassVar[PyCapsule] = ... + children: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +class UnionDatasetFactory(DatasetFactory): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class WrittenFile(pyarrow.lib._Weakrefable): + metadata: metadata + path: path + size: size + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def __pyx_unpickle_WrittenFile(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def _filesystemdataset_write( + Scannerdata, + base_dir, + unicodebasename_template, + FileSystemfilesystem, + Partitioningpartitioning, + FileWriteOptionsfile_options, + intmax_partitions, + file_visitor, + unicodeexisting_data_behavior, + intmax_open_files, + intmax_rows_per_file, + intmin_rows_per_group, + intmax_rows_per_group, + boolcreate_dir, +) -> Any: ... +def _forbid_instantiation(klass, subclasses_instead=...) -> Any: ... +def _get_orc_fileformat() -> Any: ... 
+def _get_parquet_classes() -> Any: ... +def _get_parquet_symbol(name) -> Any: ... +def _get_partition_keys(Expressionpartition_expression) -> Any: ... +def _pc() -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi new file mode 100644 index 00000000000..d4b97475f12 --- /dev/null +++ b/pyarrow-stubs/_dataset_orc.pyi @@ -0,0 +1,13 @@ +from typing import ( + Any, + ClassVar, +) + +import pyarrow._dataset + +class OrcFileFormat(pyarrow._dataset.FileFormat): + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_extname: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, OrcFileFormatother) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi new file mode 100644 index 00000000000..746680d4cc0 --- /dev/null +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -0,0 +1,110 @@ +from typing import ( + Any, + ClassVar, +) + +import pyarrow._dataset +import pyarrow.lib + +_is_path_like: function +_stringify_path: function + +class ParquetDatasetFactory(pyarrow._dataset.DatasetFactory): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ParquetFactoryOptions(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + partition_base_dir: Any + partitioning: Any + partitioning_factory: Any + validate_column_chunk_paths: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ParquetFileFormat(pyarrow._dataset.FileFormat): + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_extname: Any + read_options: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, ParquetFileFormatother) -> Any: ... + def make_fragment( + self, file, filesystem=..., Expressionpartition_expression=..., row_groups=... + ) -> Any: ... + def make_write_options(self, **kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + +class ParquetFileFragment(pyarrow._dataset.FileFragment): + __pyx_vtable__: ClassVar[PyCapsule] = ... + metadata: Any + num_row_groups: Any + row_groups: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def ensure_complete_metadata(self) -> Any: ... + def split_by_row_group(self, Expressionfilter=..., Schemaschema=...) -> Any: ... + def subset( + self, Expressionfilter=..., Schemaschema=..., row_group_ids=... + ) -> Any: ... + def __reduce__(self) -> Any: ... + +class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _set_arrow_properties(self) -> Any: ... + def _set_properties(self) -> Any: ... + def update(self, **kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ParquetFragmentScanOptions(pyarrow._dataset.FragmentScanOptions): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + buffer_size: Any + pre_buffer: Any + thrift_container_size_limit: Any + thrift_string_size_limit: Any + use_buffered_stream: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _reconstruct(cls, typecls, kwargs) -> Any: ... + def equals(self, ParquetFragmentScanOptionsother) -> Any: ... 
+ def __reduce__(self) -> Any: ... + +class ParquetReadOptions(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + _coerce_int96_timestamp_unit: _coerce_int96_timestamp_unit + coerce_int96_timestamp_unit: Any + dictionary_columns: dictionary_columns + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, ParquetReadOptionsother) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class RowGroupInfo: + __hash__: ClassVar[None] = ... + def __init__(self, id, metadata, schema) -> Any: ... + def __eq__(self, other) -> Any: ... + @property + def num_rows(self) -> Any: ... + @property + def statistics(self) -> Any: ... + @property + def total_byte_size(self) -> Any: ... + +def __pyx_unpickle_ParquetReadOptions( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_exec_plan.pyi b/pyarrow-stubs/_exec_plan.pyi new file mode 100644 index 00000000000..035040ebd10 --- /dev/null +++ b/pyarrow-stubs/_exec_plan.pyi @@ -0,0 +1,27 @@ +from typing import ( + Any, + ClassVar, +) + +import pyarrow._dataset + +class InMemoryDataset(pyarrow._dataset.Dataset): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def _filter_table(table, expression, output_type=...) -> Any: ... +def _perform_join( + join_type, + left_operand, + left_keys, + right_operand, + right_keys, + left_suffix=..., + right_suffix=..., + use_threads=..., + coalesce_keys=..., + output_type=..., +) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_feather.pyi b/pyarrow-stubs/_feather.pyi new file mode 100644 index 00000000000..9428e1ff9fa --- /dev/null +++ b/pyarrow-stubs/_feather.pyi @@ -0,0 +1,20 @@ +from typing import Any + +import pyarrow.lib + +class FeatherError(Exception): ... + +class FeatherReader(pyarrow.lib._Weakrefable): + version: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def read(self) -> Any: ... + def read_indices(self, indices) -> Any: ... + def read_names(self, names) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def tobytes(o) -> Any: ... +def write_feather( + Tabletable, dest, compression=..., compression_level=..., chunksize=..., version=... +) -> Any: ... diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi new file mode 100644 index 00000000000..05aed6f47ff --- /dev/null +++ b/pyarrow-stubs/_flight.pyi @@ -0,0 +1,726 @@ +import enum +import importlib._bootstrap +import re +from typing import ( + Any, + ClassVar, +) + +import pyarrow.lib + +_FLIGHT_SERVER_ERROR_REGEX: re.Pattern +_get_legacy_format_default: function + +class Action(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + body: Any + type: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... 
+ def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ActionType(importlib._bootstrap._ActionType): + def make_action(self, buf) -> Any: ... + +class ArrowCancelled(pyarrow.lib.ArrowException): + def __init__(self, message, signum=...) -> Any: ... + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, pyarrow.lib.ArrowException): ... + +class BasicAuth(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + password: Any + username: Any + def __init__(self, *args, **kwargs) -> None: ... + def deserialize(self, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class CallInfo(importlib._bootstrap._CallInfo): ... +class CertKeyPair(importlib._bootstrap._CertKeyPair): ... + +class ClientAuthHandler(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def authenticate(self, outgoing, incoming) -> Any: ... + def get_token(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ClientAuthReader(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def read(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ClientAuthSender(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def write(self, message) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ClientMiddleware(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def call_completed(self, exception) -> Any: ... + def received_headers(self, headers) -> Any: ... + def sending_headers(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ClientMiddlewareFactory(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def start_call(self, info) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class DescriptorType(enum.Enum): + class _member_type_: + __class__: Any + def __init__(self, *args, **kwargs) -> None: ... + def __delattr__(self, name) -> Any: ... + def __dir__(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __format__(self, *args, **kwargs) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + @classmethod + def __init_subclass__(cls, *args, **kwargs) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __reduce_ex__(self, protocol) -> Any: ... + def __setattr__(self, name, value) -> Any: ... + def __sizeof__(self) -> Any: ... + @classmethod + def __subclasshook__(cls, *args, **kwargs) -> Any: ... + __new__: ClassVar[function] = ... + CMD: ClassVar[DescriptorType] = ... + PATH: ClassVar[DescriptorType] = ... 
+ UNKNOWN: ClassVar[DescriptorType] = ... + _generate_next_value_: ClassVar[function] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _value2member_map_: ClassVar[dict] = ... + +class FlightCallOptions(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightCancelledError(FlightError, pyarrow.lib.ArrowCancelled): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightClient(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def authenticate( + self, auth_handler, FlightCallOptionsoptions: FlightCallOptions = ... + ) -> Any: ... + def authenticate_basic_token( + self, username, password, FlightCallOptionsoptions: FlightCallOptions = ... + ) -> Any: ... + def close(self) -> Any: ... + @classmethod + def connect( + cls, + typecls, + location, + tls_root_certs=..., + cert_chain=..., + private_key=..., + override_hostname=..., + disable_server_verification=..., + ) -> Any: ... + def do_action( + self, action, FlightCallOptionsoptions: FlightCallOptions = ... + ) -> Any: ... + def do_exchange( + self, + FlightDescriptordescriptor: FlightDescriptor, + FlightCallOptionsoptions: FlightCallOptions = ..., + ) -> Any: ... + def do_get( + self, Ticketticket: Ticket, FlightCallOptionsoptions: FlightCallOptions = ... + ) -> Any: ... + def do_put( + self, + FlightDescriptordescriptor: FlightDescriptor, + Schemaschema: Schema, + FlightCallOptionsoptions: FlightCallOptions = ..., + ) -> Any: ... + def get_flight_info( + self, + FlightDescriptordescriptor: FlightDescriptor, + FlightCallOptionsoptions: FlightCallOptions = ..., + ) -> Any: ... + def get_schema( + self, + FlightDescriptordescriptor: FlightDescriptor, + FlightCallOptionsoptions: FlightCallOptions = ..., + ) -> Any: ... + def list_actions( + self, FlightCallOptionsoptions: FlightCallOptions = ... + ) -> Any: ... + def list_flights( + self, + bytescriteria: bytes = ..., + FlightCallOptionsoptions: FlightCallOptions = ..., + ) -> Any: ... + def wait_for_available(self, timeout=...) -> Any: ... + def __del__(self) -> Any: ... + def __enter__(self) -> Any: ... + def __exit__(self, exc_type, exc_value, traceback) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightDataStream(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightDescriptor(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + command: Any + descriptor_type: Any + path: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def for_command(self, command) -> Any: ... + def for_path(self, *path) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... 
+ def __setstate__(self, state) -> Any: ... + +class FlightEndpoint(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + locations: Any + ticket: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightError(Exception): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightInfo(pyarrow.lib._Weakrefable): + descriptor: Any + endpoints: Any + schema: Any + total_bytes: Any + total_records: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightInternalError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightMetadataReader(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def read(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightMetadataWriter(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def write(self, message) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightMethod(enum.Enum): + class _member_type_: + __class__: Any + def __init__(self, *args, **kwargs) -> None: ... + def __delattr__(self, name) -> Any: ... + def __dir__(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __format__(self, *args, **kwargs) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + @classmethod + def __init_subclass__(cls, *args, **kwargs) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __reduce_ex__(self, protocol) -> Any: ... + def __setattr__(self, name, value) -> Any: ... + def __sizeof__(self) -> Any: ... + @classmethod + def __subclasshook__(cls, *args, **kwargs) -> Any: ... + __new__: ClassVar[function] = ... + DO_ACTION: ClassVar[FlightMethod] = ... + DO_EXCHANGE: ClassVar[FlightMethod] = ... + DO_GET: ClassVar[FlightMethod] = ... + DO_PUT: ClassVar[FlightMethod] = ... + GET_FLIGHT_INFO: ClassVar[FlightMethod] = ... + GET_SCHEMA: ClassVar[FlightMethod] = ... + HANDSHAKE: ClassVar[FlightMethod] = ... + INVALID: ClassVar[FlightMethod] = ... + LIST_ACTIONS: ClassVar[FlightMethod] = ... + LIST_FLIGHTS: ClassVar[FlightMethod] = ... + _generate_next_value_: ClassVar[function] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _value2member_map_: ClassVar[dict] = ... + +class FlightServerBase(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... 
+ port: Any + def __init__(self, *args, **kwargs) -> None: ... + def do_action(self, context, action) -> Any: ... + def do_exchange(self, context, descriptor, reader, writer) -> Any: ... + def do_get(self, context, ticket) -> Any: ... + def do_put( + self, + context, + descriptor, + MetadataRecordBatchReaderreader: MetadataRecordBatchReader, + FlightMetadataWriterwriter: FlightMetadataWriter, + ) -> Any: ... + def get_flight_info(self, context, descriptor) -> Any: ... + def get_schema(self, context, descriptor) -> Any: ... + def list_actions(self, context) -> Any: ... + def list_flights(self, context, criteria) -> Any: ... + def run(self) -> Any: ... + def serve(self) -> Any: ... + def shutdown(self) -> Any: ... + def wait(self) -> Any: ... + def __enter__(self) -> Any: ... + def __exit__(self, exc_type, exc_value, traceback) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightServerError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightStreamChunk(pyarrow.lib._Weakrefable): + app_metadata: Any + data: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __iter__(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightStreamReader(MetadataRecordBatchReader): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def cancel(self) -> Any: ... + def read_all(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightStreamWriter(MetadataRecordBatchWriter): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def done_writing(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FlightTimedOutError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightUnauthenticatedError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightUnauthorizedError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightUnavailableError(FlightError, pyarrow.lib.ArrowException): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class FlightWriteSizeExceededError(pyarrow.lib.ArrowInvalid): + def __init__(self, message, limit, actual) -> Any: ... + +class GeneratorStream(FlightDataStream): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... 
+ +class Location(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + uri: Any + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, Locationother) -> Any: ... + def for_grpc_tcp(self, host, port) -> Any: ... + def for_grpc_tls(self, host, port) -> Any: ... + def for_grpc_unix(self, path) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class MetadataRecordBatchWriter(pyarrow.lib._CRecordBatchWriter): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def begin(self, Schemaschema: Schema, options=...) -> Any: ... + def close(self) -> Any: ... + def write_batch(self, RecordBatchbatch) -> Any: ... + def write_metadata(self, buf) -> Any: ... + def write_table(self, Tabletable, max_chunksize=..., **kwargs) -> Any: ... + def write_with_metadata(self, RecordBatchbatch, buf) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class RecordBatchStream(FlightDataStream): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class Result(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + body: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class SchemaResult(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + schema: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerAuthHandler(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def authenticate(self, outgoing, incoming) -> Any: ... + def is_valid(self, token) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerAuthReader(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def read(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerAuthSender(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... 
+ @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def write(self, message) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerCallContext(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def get_middleware(self, key) -> Any: ... + def is_cancelled(self) -> Any: ... + def peer(self) -> Any: ... + def peer_identity(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerMiddleware(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def call_completed(self, exception) -> Any: ... + def sending_headers(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ServerMiddlewareFactory(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def start_call(self, info, headers) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class SignalStopHandler: + stop_token: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def _init_signals(self) -> Any: ... + def __enter__(self) -> Any: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class Ticket(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + ticket: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def deserialize(cls, typecls, serialized) -> Any: ... + def serialize(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class TracingServerMiddleware(ServerMiddleware): + __slots__: ClassVar[list] = ... + trace_context: Any + def __init__(self, trace_context) -> Any: ... + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ActionType(tuple): + _asdict: ClassVar[function] = ... + _field_defaults: ClassVar[dict] = ... + _fields: ClassVar[tuple] = ... + _replace: ClassVar[function] = ... + __getnewargs__: ClassVar[function] = ... + __match_args__: ClassVar[tuple] = ... + __slots__: ClassVar[tuple] = ... + description: Any + type: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _make(cls, *args, **kwargs) -> Any: ... + +class _CallInfo(tuple): + _asdict: ClassVar[function] = ... + _field_defaults: ClassVar[dict] = ... + _fields: ClassVar[tuple] = ... + _replace: ClassVar[function] = ... + __getnewargs__: ClassVar[function] = ... + __match_args__: ClassVar[tuple] = ... + __slots__: ClassVar[tuple] = ... + method: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _make(cls, *args, **kwargs) -> Any: ... + +class _CertKeyPair(tuple): + _asdict: ClassVar[function] = ... + _field_defaults: ClassVar[dict] = ... + _fields: ClassVar[tuple] = ... + _replace: ClassVar[function] = ... + __getnewargs__: ClassVar[function] = ... + __match_args__: ClassVar[tuple] = ... + __slots__: ClassVar[tuple] = ... 
+ cert: Any + key: Any + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _make(cls, *args, **kwargs) -> Any: ... + +class _FlightServerFinalizer(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def finalize(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _MetadataRecordBatchReader( + pyarrow.lib._Weakrefable, pyarrow.lib._ReadPandasMixin +): + schema: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def read_all(self) -> Any: ... + def read_chunk(self) -> Any: ... + def to_reader(self) -> Any: ... + def __iter__(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ReadPandasMixin: + def read_pandas(self, **options) -> Any: ... + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + def __init__(self, *args, **kwargs) -> None: ... + def start_call(self, info, headers) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _ServerMiddlewareWrapper(ServerMiddleware): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def call_completed(self, exception) -> Any: ... + def sending_headers(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def __pyx_unpickle_ClientAuthHandler( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_ClientMiddleware( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_ClientMiddlewareFactory( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightCancelledError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightDataStream( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightInternalError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightServerError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightTimedOutError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightUnauthenticatedError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightUnauthorizedError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_FlightUnavailableError( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_ServerAuthHandler( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_ServerMiddleware( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_ServerMiddlewareFactory( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle_TracingServerMiddlewareFactory( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle__ServerMiddlewareFactoryWrapper( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def __pyx_unpickle__ServerMiddlewareWrapper( + __pyx_type, long__pyx_checksum, __pyx_state +) -> Any: ... +def _munge_grpc_python_error(message) -> Any: ... +def as_buffer(o) -> Any: ... +def connect(location, **kwargs) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... 
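The `_fs.pyi` stub below is notable in that it gives `FileSystemHandler` concrete `str`/`FileInfo`/`FileSelector` signatures rather than bare `Any`: that ABC is the interface a Python-defined filesystem implements, and `PyFileSystem` (declared further down in the same stub with a `handler` attribute) is the wrapper that exposes it as a regular Arrow filesystem. A minimal sketch of a handler written against those signatures, assuming the usual `pyarrow.fs.PyFileSystem(handler)` wiring; the in-memory `dict` backing store and the "inmem" type name are illustrative choices, not taken from the stubs, and only the read path is fleshed out while the remaining abstract methods simply raise:

    from __future__ import annotations

    import pyarrow as pa
    from pyarrow import fs


    class InMemoryHandler(fs.FileSystemHandler):
        """Toy handler keeping file contents in a dict, for exercising the stubbed interface."""

        def __init__(self) -> None:
            self._files: dict[str, bytes] = {}

        def get_type_name(self) -> str:
            return "inmem"  # illustrative name

        def normalize_path(self, path: str) -> str:
            return path.lstrip("/")

        def get_file_info(self, paths: list[str]) -> list[fs.FileInfo]:
            # Report each requested path as an existing file or as missing.
            return [
                fs.FileInfo(
                    p,
                    fs.FileType.File if p in self._files else fs.FileType.NotFound,
                )
                for p in paths
            ]

        def get_file_info_selector(self, selector: fs.FileSelector) -> list[fs.FileInfo]:
            return self.get_file_info(sorted(self._files))

        def open_input_stream(self, path: str) -> pa.NativeFile:
            return pa.BufferReader(self._files[path])

        def open_input_file(self, path: str) -> pa.NativeFile:
            return pa.BufferReader(self._files[path])

        # Required by the ABC but unsupported in this read-only sketch.
        def create_dir(self, path: str, recursive: bool) -> None: raise NotImplementedError
        def delete_dir(self, path: str) -> None: raise NotImplementedError
        def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: raise NotImplementedError
        def delete_root_dir_contents(self) -> None: raise NotImplementedError
        def delete_file(self, path: str) -> None: raise NotImplementedError
        def move(self, src: str, dest: str) -> None: raise NotImplementedError
        def copy_file(self, src: str, dest: str) -> None: raise NotImplementedError
        def open_output_stream(self, path: str, metadata: dict[str, str]): raise NotImplementedError
        def open_append_stream(self, path: str, metadata: dict[str, str]): raise NotImplementedError


    handler = InMemoryHandler()
    handler._files["hello.txt"] = b"hi"
    pyfs = fs.PyFileSystem(handler)  # wrap the Python handler in the C++ FileSystem API
    print(pyfs.open_input_stream("hello.txt").read())  # b'hi'

Because the handler methods above match the stubbed signatures, a type checker using these stubs can verify such a subclass, which is the main payoff of typing `FileSystemHandler` precisely instead of falling back to `Any`.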
diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi new file mode 100644 index 00000000000..9dfd706d6eb --- /dev/null +++ b/pyarrow-stubs/_fs.pyi @@ -0,0 +1,262 @@ +import abc +import datetime +import enum +import importlib._bootstrap +from typing import ( + Any, + ClassVar, + overload, +) + +import _abc +import pyarrow.lib + +Directory: importlib._bootstrap.FileType +File: importlib._bootstrap.FileType +NotFound: importlib._bootstrap.FileType +Unknown: importlib._bootstrap.FileType +_stringify_path: function +abstractmethod: function + +class ABC: + _abc_impl: ClassVar[_abc._abc_data] = ... + __abstractmethods__: ClassVar[frozenset] = ... + __slots__: ClassVar[tuple] = ... + +class FileInfo(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + base_name: Any + extension: Any + is_file: Any + mtime: Any + mtime_ns: Any + path: Any + size: Any + type: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileSelector(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + allow_not_found: Any + base_dir: Any + recursive: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileSystem(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + type_name: Any + def __init__(self, *args, **kwargs) -> None: ... + def _wrap_input_stream(self, stream, path, compression, buffer_size) -> Any: ... + def _wrap_output_stream(self, stream, path, compression, buffer_size) -> Any: ... + def copy_file(self, src, dest) -> Any: ... + def create_dir(self, *args, **kwargs) -> Any: ... + def delete_dir(self, path) -> Any: ... + def delete_dir_contents(self, *args, **kwargs) -> Any: ... + def delete_file(self, path) -> Any: ... + def equals(self, FileSystemother) -> Any: ... + @overload + def from_uri(self, uri) -> Any: ... + @overload + def from_uri(self, uri) -> Any: ... + def get_file_info(self, paths_or_selector) -> Any: ... + def move(self, src, dest) -> Any: ... + def normalize_path(self, path) -> Any: ... + @overload + def open_append_stream( + self, path, compression=..., buffer_size=..., metadata=... + ) -> Any: ... + @overload + def open_append_stream(self, path) -> Any: ... + @overload + def open_input_file(self, path) -> Any: ... + @overload + def open_input_file(self) -> Any: ... + @overload + def open_input_file(self, path) -> Any: ... + @overload + def open_input_stream(self, path, compression=..., buffer_size=...) -> Any: ... + @overload + def open_input_stream(self) -> Any: ... + @overload + def open_input_stream(self, path) -> Any: ... + @overload + def open_output_stream( + self, path, compression=..., buffer_size=..., metadata=... + ) -> Any: ... + @overload + def open_output_stream(self, path) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileSystemHandler(abc.ABC): + def copy_file(self, src: str, dest: str) -> None: ... + def create_dir(self, path: str, recursive: bool) -> None: ... + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool = ...) -> None: ... 
+ def delete_file(self, path: str) -> None: ... + def delete_root_dir_contents(self) -> None: ... + def get_file_info(self, paths: list[str]) -> list[FileInfo]: ... + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + def get_type_name(self) -> Any: ... + def move(self, src, dest) -> Any: ... + def normalize_path(self, path) -> Any: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + def open_input_file(self, path: str) -> Any: ... + def open_input_stream(self, path: str) -> Any: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + +class FileType(enum.IntEnum): + class _member_type_: + denominator: Any + imag: Any + numerator: Any + real: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + @overload + def as_integer_ratio(self) -> Any: ... + @overload + def as_integer_ratio(self) -> Any: ... + @overload + def as_integer_ratio(self) -> Any: ... + def bit_count(self) -> Any: ... + def bit_length(self) -> Any: ... + def conjugate(self, *args, **kwargs) -> Any: ... + @classmethod + def from_bytes(cls, *args, **kwargs) -> Any: ... + def to_bytes(self, *args, **kwargs) -> Any: ... + def __abs__(self) -> Any: ... + def __add__(self, other) -> Any: ... + def __and__(self, other) -> Any: ... + def __bool__(self) -> Any: ... + def __ceil__(self, *args, **kwargs) -> Any: ... + def __divmod__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __float__(self) -> Any: ... + def __floor__(self, *args, **kwargs) -> Any: ... + def __floordiv__(self, other) -> Any: ... + def __format__(self, *args, **kwargs) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getnewargs__(self, *args, **kwargs) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __index__(self) -> Any: ... + def __int__(self) -> Any: ... + def __invert__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lshift__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __mod__(self, other) -> Any: ... + def __mul__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __neg__(self) -> Any: ... + def __or__(self, other) -> Any: ... + def __pos__(self) -> Any: ... + def __pow__(self, other) -> Any: ... + def __radd__(self, other) -> Any: ... + def __rand__(self, other) -> Any: ... + def __rdivmod__(self, other) -> Any: ... + def __rfloordiv__(self, other) -> Any: ... + def __rlshift__(self, other) -> Any: ... + def __rmod__(self, other) -> Any: ... + def __rmul__(self, other) -> Any: ... + def __ror__(self, other) -> Any: ... + def __round__(self) -> Any: ... + def __rpow__(self, other) -> Any: ... + def __rrshift__(self, other) -> Any: ... + def __rshift__(self, other) -> Any: ... + def __rsub__(self, other) -> Any: ... + def __rtruediv__(self, other) -> Any: ... + def __rxor__(self, other) -> Any: ... + def __sizeof__(self) -> Any: ... + def __sub__(self, other) -> Any: ... + def __truediv__(self, other) -> Any: ... + def __trunc__(self) -> Any: ... + def __xor__(self, other) -> Any: ... + __new__: ClassVar[function] = ... + Directory: ClassVar[importlib._bootstrap.FileType] = ... + File: ClassVar[importlib._bootstrap.FileType] = ... + NotFound: ClassVar[importlib._bootstrap.FileType] = ... + Unknown: ClassVar[importlib._bootstrap.FileType] = ... + _generate_next_value_: ClassVar[function] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _value2member_map_: ClassVar[dict] = ... 
+ +class LocalFileSystem(FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _reconstruct(cls, typecls, kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + +class PyFileSystem(FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + handler: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +class SubTreeFileSystem(FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + base_fs: Any + base_path: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +class _MockFileSystem(FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class timezone(datetime.tzinfo): + max: ClassVar[datetime.timezone] = ... + min: ClassVar[datetime.timezone] = ... + utc: ClassVar[datetime.timezone] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def dst(self, *args, **kwargs) -> Any: ... + def fromutc(self, *args, **kwargs) -> Any: ... + def tzname(self, *args, **kwargs) -> Any: ... + def utcoffset(self, *args, **kwargs) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getinitargs__(self) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + +def __pyx_unpickle___Pyx_EnumMeta(*args, **kwargs) -> Any: ... +def _copy_files( + FileSystemsource_fs, + unicodesource_path, + FileSystemdestination_fs, + unicodedestination_path, + int64_tchunk_size, + booluse_threads, +) -> Any: ... +def _copy_files_selector( + FileSystemsource_fs, + FileSelectorsource_sel, + FileSystemdestination_fs, + unicodedestination_base_dir, + int64_tchunk_size, + booluse_threads, +) -> Any: ... +def _detect_compression(path) -> Any: ... +def _file_type_to_string(ty) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi new file mode 100644 index 00000000000..8aa85b089bf --- /dev/null +++ b/pyarrow-stubs/_gcsfs.pyi @@ -0,0 +1,155 @@ +import collections.abc +import datetime +from typing import ( + Any, + ClassVar, +) + +import pyarrow._fs +import pyarrow.lib + +class GcsFileSystem(pyarrow._fs.FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_bucket_location: Any + def __init__(self, *args, **kwargs) -> None: ... + def _expiration_datetime_from_options(self) -> Any: ... + @classmethod + def _reconstruct(cls, typecls, kwargs) -> Any: ... + def __reduce__(self) -> Any: ... + +class KeyValueMetadata(pyarrow.lib._Metadata, collections.abc.Mapping): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def equals(self, KeyValueMetadataother) -> Any: ... + def get_all(self, key) -> Any: ... + def items(self) -> Any: ... + def key(self, i) -> Any: ... + def keys(self) -> Any: ... + def to_dict(self) -> Any: ... + def value(self, i) -> Any: ... + def values(self) -> Any: ... + def __contains__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getitem__(self, index) -> Any: ... + def __gt__(self, other) -> Any: ... + def __iter__(self) -> Any: ... + def __le__(self, other) -> Any: ... 
+ def __len__(self) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + +class datetime(datetime.date): + max: ClassVar[datetime.datetime] = ... + min: ClassVar[datetime.datetime] = ... + resolution: ClassVar[datetime.timedelta] = ... + fold: Any + hour: Any + microsecond: Any + minute: Any + second: Any + tzinfo: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def astimezone(self, *args, **kwargs) -> Any: ... + @classmethod + def combine(cls, *args, **kwargs) -> Any: ... + def ctime(self) -> Any: ... + def date(self, *args, **kwargs) -> Any: ... + def dst(self) -> Any: ... + @classmethod + def fromisoformat(cls, *args, **kwargs) -> Any: ... + @classmethod + def fromtimestamp(cls, *args, **kwargs) -> Any: ... + def isoformat(self, *args, **kwargs) -> Any: ... + @classmethod + def now(cls, *args, **kwargs) -> Any: ... + def replace(self, *args, **kwargs) -> Any: ... + @classmethod + def strptime(cls, *args, **kwargs) -> Any: ... + def time(self, *args, **kwargs) -> Any: ... + def timestamp(self, *args, **kwargs) -> Any: ... + def timetuple(self, *args, **kwargs) -> Any: ... + def timetz(self, *args, **kwargs) -> Any: ... + def tzname(self) -> Any: ... + @classmethod + def utcfromtimestamp(cls, *args, **kwargs) -> Any: ... + @classmethod + def utcnow(cls, *args, **kwargs) -> Any: ... + def utcoffset(self) -> Any: ... + def utctimetuple(self, *args, **kwargs) -> Any: ... + def __add__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __radd__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __reduce_ex__(self, protocol) -> Any: ... + def __rsub__(self, other) -> Any: ... + def __sub__(self, other) -> Any: ... + +class timedelta: + max: ClassVar[datetime.timedelta] = ... + min: ClassVar[datetime.timedelta] = ... + resolution: ClassVar[datetime.timedelta] = ... + days: Any + microseconds: Any + seconds: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def total_seconds(self, *args, **kwargs) -> Any: ... + def __abs__(self) -> Any: ... + def __add__(self, other) -> Any: ... + def __bool__(self) -> Any: ... + def __divmod__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __floordiv__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __mod__(self, other) -> Any: ... + def __mul__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __neg__(self) -> Any: ... + def __pos__(self) -> Any: ... + def __radd__(self, other) -> Any: ... + def __rdivmod__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __rfloordiv__(self, other) -> Any: ... + def __rmod__(self, other) -> Any: ... + def __rmul__(self, other) -> Any: ... + def __rsub__(self, other) -> Any: ... + def __rtruediv__(self, other) -> Any: ... + def __sub__(self, other) -> Any: ... + def __truediv__(self, other) -> Any: ... + +class timezone(datetime.tzinfo): + max: ClassVar[datetime.timezone] = ... + min: ClassVar[datetime.timezone] = ... + utc: ClassVar[datetime.timezone] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... 
+ def dst(self, *args, **kwargs) -> Any: ... + def fromutc(self, *args, **kwargs) -> Any: ... + def tzname(self, *args, **kwargs) -> Any: ... + def utcoffset(self, *args, **kwargs) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __getinitargs__(self) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + +def ensure_metadata(meta, boolallow_none=...) -> KeyValueMetadata: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_generated_version.pyi b/pyarrow-stubs/_generated_version.pyi new file mode 100644 index 00000000000..2e057ab7ef4 --- /dev/null +++ b/pyarrow-stubs/_generated_version.pyi @@ -0,0 +1,5 @@ +from _typeshed import Incomplete + +version: str +__version_tuple__: Incomplete +version_tuple: Incomplete diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi new file mode 100644 index 00000000000..1d6fe663269 --- /dev/null +++ b/pyarrow-stubs/_hdfs.pyi @@ -0,0 +1,19 @@ +from typing import ( + Any, + ClassVar, +) + +import pyarrow._fs + +_stringify_path: function + +class HadoopFileSystem(pyarrow._fs.FileSystem): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _reconstruct(cls, typecls, kwargs) -> Any: ... + def from_uri(self, uri) -> Any: ... + def __reduce__(self) -> Any: ... + +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_hdfsio.pyi b/pyarrow-stubs/_hdfsio.pyi new file mode 100644 index 00000000000..c42a083282f --- /dev/null +++ b/pyarrow-stubs/_hdfsio.pyi @@ -0,0 +1,70 @@ +import re +from typing import ( + Any, + Literal, + overload, +) + +import pyarrow.lib + +_HDFS_PATH_RE: re.Pattern + +class HadoopFileSystem(pyarrow.lib._Weakrefable): + extra_conf: dict + host: Any + is_open: bool + kerb_ticket: Any + port: int + user: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def chmod(self, path: str, mode: int) -> Any: ... + def chown(self, path: str, owner: str = ..., group: str = ...) -> Any: ... + def close(self) -> Any: ... + @classmethod + def connect(cls, *args, **kwargs) -> Any: ... + def delete(self, path: str, recursive: bool = ...) -> Any: ... + def df(self) -> int: ... + def download(self, path: str, stream, buffer_size: int | None = ...) -> Any: ... + def exists(self, path: str) -> bool: ... + def get_capacity(self) -> int: ... + def get_space_used(self) -> int: ... + def info(self, path: str) -> dict: ... + def isdir(self, path: str) -> bool: ... + def isfile(self, path: str) -> bool: ... + @overload + def ls(self, path: str, full_info: Literal[True]) -> list[dict]: ... + @overload + def ls(self, path: str, full_info: Literal[False]) -> list[str]: ... + def mkdir(self, path: str) -> None: ... + def open( + self, + path: str, + mode: Literal["rb", "wb", "ab"] = ..., + buffer_size: int | None = ..., + replication: int | None = ..., + default_block_size: int | None = ..., + ) -> HdfsFile: ... + def rename(self, path: str, new_path: str) -> None: ... + def stat(self, path: str) -> dict[str, Any]: ... + def upload(self, path: str, stream, buffer_size: int | None = ...) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... 
+ +class HdfsFile(pyarrow.lib.NativeFile): + buffer_size: int + mode: str + parent: _HdfsFileNanny | None + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class _HdfsFileNanny(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def have_libhdfs() -> bool: ... +def strip_hdfs_abspath(path: str) -> str: ... diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi new file mode 100644 index 00000000000..80a7ca77e18 --- /dev/null +++ b/pyarrow-stubs/_json.pyi @@ -0,0 +1,25 @@ +from typing import ( + Any, + ClassVar, +) + +import pyarrow.lib + +class ParseOptions(pyarrow.lib._Weakrefable): + __slots__: ClassVar[tuple] = ... + explicit_schema: Any + newlines_in_values: Any + unexpected_field_behavior: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +class ReadOptions(pyarrow.lib._Weakrefable): + __slots__: ClassVar[tuple] = ... + block_size: Any + use_threads: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +def read_json( + input_file, read_options=..., parse_options=..., MemoryPoolmemory_pool=... +) -> Any: ... diff --git a/pyarrow-stubs/_orc.pyi b/pyarrow-stubs/_orc.pyi new file mode 100644 index 00000000000..272d2c1db40 --- /dev/null +++ b/pyarrow-stubs/_orc.pyi @@ -0,0 +1,44 @@ +from typing import Any + +import pyarrow.lib + +_stringify_path: function + +class ORCReader(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def compression(self) -> Any: ... + def compression_size(self) -> Any: ... + def content_length(self) -> Any: ... + def file_footer_length(self) -> Any: ... + def file_length(self) -> Any: ... + def file_postscript_length(self) -> Any: ... + def file_version(self) -> Any: ... + def metadata(self) -> Any: ... + def nrows(self) -> Any: ... + def nstripe_statistics(self) -> Any: ... + def nstripes(self) -> Any: ... + def open(self, source, booluse_memory_map=...) -> Any: ... + def read(self, columns=...) -> Any: ... + def read_stripe(self, n, columns=...) -> Any: ... + def row_index_stride(self) -> Any: ... + def schema(self) -> Any: ... + def serialized_file_tail(self) -> Any: ... + def software_version(self) -> Any: ... + def stripe_statistics_length(self) -> Any: ... + def writer(self) -> Any: ... + def writer_version(self) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class ORCWriter(pyarrow.lib._Weakrefable): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def close(self) -> Any: ... + def open(self, *args, **kwargs) -> Any: ... + def write(self, Tabletable) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi new file mode 100644 index 00000000000..1325a57b5c3 --- /dev/null +++ b/pyarrow-stubs/_parquet.pyi @@ -0,0 +1,295 @@ +from typing import ( + Any, + ClassVar, + Generator, + Literal, +) + +import pyarrow.lib +from pyarrow.parquet.core import FileDecryptionProperties + +_stringify_path: function +indent: function + +class ArrowException(Exception): ... + +class ColumnChunkMetaData(pyarrow.lib._Weakrefable): + def __init__(self) -> None: ... 
+ def equals(self, other: ColumnChunkMetaData) -> bool: ... + def to_dict(self) -> Any: ... + def __eq__(self, other) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> str: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics: ... + @property + def compression( + self, + ) -> Literal[ + "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO", "BROTLI", "LZ4", "ZSTD", "UNKNOWN" + ]: ... + @property + def encodings( + self, + ) -> tuple[ + Literal[ + "PLAIN", + "BIT_PACKED", + "RLE", + "BYTE_STREAM_SPLIT", + "DELTA_BINARY_PACKED", + "DELTA_BYTE_ARRAY", + ], + ..., + ]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + +class ColumnSchema(pyarrow.lib._Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> Any: ... + def __eq__(self, other) -> Any: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> str: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> str | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + +class ParquetLogicalType(pyarrow.lib._Weakrefable): + type: Any + def to_json(self) -> str: ... + +class ParquetReader(pyarrow.lib._Weakrefable): + _column_idx_map: dict[bytes, int] | None + closed: bool + column_paths: Any + metadata: FileMetaData | None + num_row_groups: int + schema_arrow: pyarrow.lib.Schema + @classmethod + def __init__(self, memory_pool: pyarrow.lib.MemoryPool) -> None: ... + def close(self) -> None: ... + def column_name_idx(self, column_name: str) -> int: ... + def iter_batches( + self, + batch_size: int, + row_groups: list[int], + column_indices: list[int] | None = ..., + use_threads: bool = ..., + ) -> Generator[pyarrow.lib.RecordBatch, None, None]: ... + def open( + self, + source, + *, + use_memory_map: bool = ..., + read_dictionary: list[str | int] | None = ..., + metadata: FileMetaData = ..., + buffer_size: int = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: str | None = ..., + decryption_properties: FileDecryptionProperties = ..., + thrift_string_size_limit: int = ..., + thrift_container_size_limit: int = ..., + ) -> pyarrow.lib.Table: ... + def read_all( + self, column_indices: list[int] | None = ..., use_threads: bool = ... + ) -> pyarrow.lib.Table: ... + def read_column(self, column_index: int) -> pyarrow.lib.Array: ... + def read_row_group( + self, i: int, column_indices: list[int] | None = ..., use_threads: bool = ... + ) -> pyarrow.lib.Table: ... + def read_row_groups( + self, + row_groups: list[int], + column_indices: list[int] | None = ..., + use_threads: bool = ..., + ) -> pyarrow.lib.Table: ... 
+ def scan_contents( + self, column_indices: list[int] | None = ..., batch_size: int = ... + ) -> int: ... + def set_batch_size(self, batch_size: int) -> None: ... + def set_use_threads(self, use_threads: bool) -> None: ... + +class ParquetSchema(pyarrow.lib._Weakrefable): + names: list[str] + def __init__(self, container: FileMetaData) -> None: ... + def column(self, i: int) -> ColumnSchema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def to_arrow_schema(self) -> pyarrow.lib.Schema: ... + def __eq__(self, other) -> bool: ... + def __getitem__(self, i: int) -> ColumnSchema: ... + def __len__(self) -> int: ... + +class ParquetWriter(pyarrow.lib._Weakrefable): + allow_truncated_timestamps: Any + coerce_timestamps: Any + column_encoding: Any + compression: str | dict[str, str] + compression_level: Any + data_page_size: Any + data_page_version: Any + dictionary_pagesize_limit: Any + encryption_properties: Any + metadata: FileMetaData + row_group_size: Any + use_byte_stream_split: Any + use_compliant_nested_type: Any + use_deprecated_int96_timestamps: Any + use_dictionary: bool | list[str] + version: Any + write_batch_size: Any + write_statistics: Any + writer_engine_version: Any + def __init__( + cls, + where, + schema: pyarrow.lib.Schema, + use_dictionary: bool | list[str] | None = ..., + compression: str | dict[str, str] = ..., + version: str | None = ..., + write_statistics: bool | list[str] | None = ..., + memory_pool: pyarrow.lib.MemoryPool = ..., + use_deprecated_int96_timestamps: bool = ..., + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = ..., + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = ..., + column_encoding: str | dict[str, str] | None = ..., + writer_engine_version: Literal["V1", "V2"] | None = None, + data_page_version: Literal["1.0", "2.0"] | None = None, + use_compliant_nested_type: bool = ..., + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = ..., + dictionary_pagesize_limit: int | None = ..., + ) -> None: ... + def close(self) -> None: ... + def write_table( + self, table: pyarrow.lib.Table, row_group_size: int | None = ... + ) -> None: ... + +class RowGroupMetaData(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + num_columns: Any + num_rows: Any + total_byte_size: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def column(self, inti) -> Any: ... + def equals(self, RowGroupMetaDataother) -> Any: ... + def to_dict(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + +class Statistics(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + converted_type: Any + distinct_count: Any + has_distinct_count: Any + has_min_max: Any + has_null_count: Any + logical_type: Any + max: Any + max_raw: Any + min: Any + min_raw: Any + null_count: Any + num_values: Any + physical_type: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def equals(self, Statisticsother) -> Any: ... + def to_dict(self) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... 
+ def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class FileDecryptionProperties: + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class FileEncryptionProperties: + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + +class FileMetaData(pyarrow.lib._Weakrefable): + __hash__: ClassVar[None] = ... + __pyx_vtable__: ClassVar[PyCapsule] = ... + created_by: Any + format_version: Any + metadata: Any + num_columns: Any + num_row_groups: Any + num_rows: Any + schema: Any + serialized_size: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def append_row_groups(self, FileMetaDataother) -> Any: ... + def equals(self, FileMetaDataother) -> Any: ... + def row_group(self, inti) -> Any: ... + def set_file_path(self, path) -> Any: ... + def to_dict(self) -> Any: ... + def write_metadata_file(self, where) -> Any: ... + def __eq__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + +def _datetime_from_int(int64_tvalue, TimeUnitunit, tzinfo=...) -> Any: ... +def _reconstruct_filemetadata(Bufferserialized) -> Any: ... +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_parquet_encryption.pyi b/pyarrow-stubs/_parquet_encryption.pyi new file mode 100644 index 00000000000..971365611f8 --- /dev/null +++ b/pyarrow-stubs/_parquet_encryption.pyi @@ -0,0 +1,111 @@ +import datetime +from typing import ( + Any, + ClassVar, +) + +import pyarrow.lib + +class ArrowException(Exception): ... + +class CryptoFactory(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + def __init__(self, *args, **kwargs) -> None: ... + def file_decryption_properties( + self, + KmsConnectionConfigkms_connection_config, + DecryptionConfigurationdecryption_config=..., + ) -> Any: ... + def file_encryption_properties( + self, + KmsConnectionConfigkms_connection_config, + EncryptionConfigurationencryption_config, + ) -> Any: ... + def remove_cache_entries_for_all_tokens(self) -> Any: ... + def remove_cache_entries_for_token(self, access_token) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class DecryptionConfiguration(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + cache_lifetime: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class EncryptionConfiguration(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + cache_lifetime: Any + column_keys: Any + data_key_length_bits: Any + double_wrapping: Any + encryption_algorithm: Any + footer_key: Any + internal_key_material: Any + plaintext_footer: Any + def __init__(self, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class KmsClient(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + def __init__(self, *args, **kwargs) -> None: ... + def unwrap_key(self, wrapped_key, master_key_identifier) -> Any: ... 
+ def wrap_key(self, key_bytes, master_key_identifier) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class KmsConnectionConfig(pyarrow.lib._Weakrefable): + __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... + custom_kms_conf: Any + key_access_token: Any + kms_instance_id: Any + kms_instance_url: Any + def __init__(self, *args, **kwargs) -> None: ... + def refresh_key_access_token(self, value) -> Any: ... + def __reduce__(self) -> Any: ... + def __setstate__(self, state) -> Any: ... + +class timedelta: + max: ClassVar[datetime.timedelta] = ... + min: ClassVar[datetime.timedelta] = ... + resolution: ClassVar[datetime.timedelta] = ... + days: Any + microseconds: Any + seconds: Any + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def total_seconds(self, *args, **kwargs) -> Any: ... + def __abs__(self) -> Any: ... + def __add__(self, other) -> Any: ... + def __bool__(self) -> Any: ... + def __divmod__(self, other) -> Any: ... + def __eq__(self, other) -> Any: ... + def __floordiv__(self, other) -> Any: ... + def __ge__(self, other) -> Any: ... + def __gt__(self, other) -> Any: ... + def __hash__(self) -> Any: ... + def __le__(self, other) -> Any: ... + def __lt__(self, other) -> Any: ... + def __mod__(self, other) -> Any: ... + def __mul__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + def __neg__(self) -> Any: ... + def __pos__(self) -> Any: ... + def __radd__(self, other) -> Any: ... + def __rdivmod__(self, other) -> Any: ... + def __reduce__(self) -> Any: ... + def __rfloordiv__(self, other) -> Any: ... + def __rmod__(self, other) -> Any: ... + def __rmul__(self, other) -> Any: ... + def __rsub__(self, other) -> Any: ... + def __rtruediv__(self, other) -> Any: ... + def __sub__(self, other) -> Any: ... + def __truediv__(self, other) -> Any: ... + +def frombytes(*args, **kwargs) -> Any: ... +def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_plasma.pyi b/pyarrow-stubs/_plasma.pyi new file mode 100644 index 00000000000..1e519691a3e --- /dev/null +++ b/pyarrow-stubs/_plasma.pyi @@ -0,0 +1,113 @@ +import collections.abc +import socket +from typing import ( + Any, + ClassVar, + Literal, + TypedDict, + overload, +) + +import pyarrow.lib + +PLASMA_WAIT_TIMEOUT: int + +class _ListResult(TypedDict): + data_size: int + metadata_size: int + ref_count: int + create_time: float + construct_duration: int + state: Literal["created", "sealed"] + +class ArrowException(Exception): ... + +class ObjectID(pyarrow.lib._Weakrefable): + def __init__(self, object_id: bytes) -> None: ... + def binary(self) -> bytes: ... + @staticmethod + def from_random() -> ObjectID: ... + def __eq__(self, other) -> bool: ... + def __hash__(self) -> int: ... + +class ObjectNotAvailable(pyarrow.lib._Weakrefable): ... +class PlasmaBuffer(pyarrow.lib.Buffer): ... + +class PlasmaClient(pyarrow.lib._Weakrefable): + store_socket_name: str + def __init__(self) -> None: ... + def _release(self, object_id: ObjectID) -> None: ... + def contains(self, object_id: ObjectID) -> bool: ... + def create( + self, object_id: ObjectID, data_size: int, metadata: bytes = ... + ) -> pyarrow.lib.Buffer: ... + def create_and_seal( + self, object_id: ObjectID, data: bytes, metadata: bytes = ... + ) -> None: ... + def debug_string(self) -> str: ... + def decode_notifications( + self, buf: pyarrow.lib.Buffer + ) -> tuple[list[ObjectID], int, int]: ... + def delete(self, object_ids: list[ObjectID]) -> None: ... + def disconnect(self) -> None: ... 
+ def evict(self, num_bytes: int) -> None: ... + @overload + def get( + self, + object_ids: ObjectID, + timeout_ms: int = ..., + serialization_context: pyarrow.lib.SerializationContext = ..., + ) -> Any: ... + @overload + def get( + self, + object_ids: list[ObjectID], + timeout_ms: int = ..., + serialization_context: pyarrow.lib.SerializationContext = ..., + ) -> list[Any]: ... + def get_buffers( + self, + object_ids: list[ObjectID], + timeout_ms: int = ..., + with_meta: bool = ..., + ) -> list[PlasmaBuffer | None | tuple[PlasmaBuffer | None, bytes]]: ... + def get_metadata( + self, object_ids: list[ObjectID], timeout_ms: int = ... + ) -> list[PlasmaBuffer | None]: ... + def get_next_notification(self) -> list[tuple[ObjectID, int, int]]: ... + def get_notification_socket(self) -> socket.socket: ... + def hash(self, object_id: ObjectID) -> bytes: ... + def list(self) -> _ListResult: ... + def put( + self, + value: Any, + object_id: ObjectID | None = ..., + memcopy_threads: int = ..., + serialization_context: pyarrow.lib.SerializationContext = ..., + ) -> ObjectID: ... + def put_raw_buffer( + self, + value: memoryview, + object_id: ObjectID | None = ..., + metadata: bytes = ..., + memcopy_threads: int = ..., + ) -> ObjectID: ... + def seal(self, object_id: ObjectID) -> None: ... + def set_client_options( + self, client_name: str, limit_output_memory: int + ) -> None: ... + def store_capacity(self) -> int: ... + def subscribe(self) -> None: ... + def to_capsule(self) -> Any: ... + +class PlasmaObjectExists(pyarrow.lib.ArrowException): ... +class PlasmaObjectNotFound(pyarrow.lib.ArrowException): ... +class PlasmaStoreFull(pyarrow.lib.ArrowException): ... + +def connect(store_socket_name: str, num_retries: int = ...) -> PlasmaClient: ... +def get_socket_from_fd( + fileno: int | None, + family: socket.AddressFamily | int, + type: socket.SocketKind | int, +) -> socket.socket: ... +def make_object_id(object_id: bytes) -> ObjectID: ... diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi new file mode 100644 index 00000000000..1084d839aca --- /dev/null +++ b/pyarrow-stubs/_s3fs.pyi @@ -0,0 +1,64 @@ +import enum +import importlib._bootstrap +from typing import ( + Any, + ClassVar, +) + +import pyarrow._fs +import pyarrow.lib + +Debug: importlib._bootstrap.S3LogLevel +Error: importlib._bootstrap.S3LogLevel +Fatal: importlib._bootstrap.S3LogLevel +Info: importlib._bootstrap.S3LogLevel +Off: importlib._bootstrap.S3LogLevel +Trace: importlib._bootstrap.S3LogLevel +Warn: importlib._bootstrap.S3LogLevel + +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(pyarrow._fs.FileSystem): + region: str + def __init__( + self, + *, + access_key: str | None = ..., + secret_key: str | None = ..., + session_token: str | None = ..., + anonymous: bool = ..., + role_arn: str | None = ..., + session_name: str | None = ..., + external_id: str | None = ..., + load_frequency: int = ..., + region: str = ..., + request_timeout: float | None = ..., + connect_timeout: float | None = ..., + schema: str = ..., + endpoint_override: str | None = ..., + background_writes: bool = ..., + default_metadata: dict | pyarrow.lib.KeyValueMetadata = ..., + proxy_options: dict | str | None = ..., + allow_bucket_creation: bool = ..., + allow_bucket_deletion: bool = ..., + retry_strategy: S3RetryStrategy = ..., + ) -> None: ... + @classmethod + def _reconstruct(cls, kwargs: Any) -> S3FileSystem: ... 
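To show how the S3FileSystem constructor above combines with the generic pyarrow.fs surface, a small sketch using anonymous access; the bucket name and object key are only illustrative:

    from pyarrow import fs

    s3 = fs.S3FileSystem(anonymous=True, region="us-east-1")

    info = s3.get_file_info("ursa-labs-taxi-data/2019/06/data.parquet")
    if info.type == fs.FileType.File:
        with s3.open_input_file(info.path) as f:
            magic = f.read(4)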
+ +class S3LogLevel(enum.IntEnum): + Debug: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Error: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Fatal: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Info: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Off: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Trace: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Warn: ClassVar[importlib._bootstrap.S3LogLevel] = ... + +class S3RetryStrategy: + def __init__(self, max_attempts: int = ...) -> None: ... + +def finalize_s3() -> None: ... +def initialize_s3(log_level: S3LogLevel = ...) -> Any: ... +def resolve_s3_region(bucket: str) -> str: ... diff --git a/pyarrow-stubs/_substrait.pyi b/pyarrow-stubs/_substrait.pyi new file mode 100644 index 00000000000..6ba7498b010 --- /dev/null +++ b/pyarrow-stubs/_substrait.pyi @@ -0,0 +1,16 @@ +from typing import ( + Callable, + NamedTuple, +) + +from pyarrow.lib import ( + Buffer, + RecordBatchReader, + Table, +) + +def _parse_json_plan(plan: bytes) -> Buffer: ... +def get_supported_functions() -> list[str]: ... +def run_query( + plan: Buffer | bytes, table_provider: Callable[[NamedTuple], Table] | None = ... +) -> RecordBatchReader: ... diff --git a/pyarrow-stubs/benchmark.pyi b/pyarrow-stubs/benchmark.pyi new file mode 100644 index 00000000000..0d2a20d9ae7 --- /dev/null +++ b/pyarrow-stubs/benchmark.pyi @@ -0,0 +1 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull as benchmark_PandasObjectIsNull diff --git a/pyarrow-stubs/cffi.pyi b/pyarrow-stubs/cffi.pyi new file mode 100644 index 00000000000..2ae945c5974 --- /dev/null +++ b/pyarrow-stubs/cffi.pyi @@ -0,0 +1,4 @@ +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi new file mode 100644 index 00000000000..c12a65dac79 --- /dev/null +++ b/pyarrow-stubs/compute.pyi @@ -0,0 +1,130 @@ +from typing import TypeVar + +from numpy.typing import ArrayLike +from pyarrow._compute import ( + ArraySortOptions as ArraySortOptions, + AssumeTimezoneOptions as AssumeTimezoneOptions, + CastOptions as CastOptions, + CountOptions as CountOptions, + CumulativeSumOptions as CumulativeSumOptions, + DayOfWeekOptions as DayOfWeekOptions, + DictionaryEncodeOptions as DictionaryEncodeOptions, + ElementWiseAggregateOptions as ElementWiseAggregateOptions, + Expression as Expression, + ExtractRegexOptions as ExtractRegexOptions, + FilterOptions as FilterOptions, + Function as Function, + FunctionOptions as FunctionOptions, + FunctionRegistry as FunctionRegistry, + HashAggregateFunction as HashAggregateFunction, + HashAggregateKernel as HashAggregateKernel, + IndexOptions as IndexOptions, + JoinOptions as JoinOptions, + Kernel as Kernel, + MakeStructOptions as MakeStructOptions, + MapLookupOptions as MapLookupOptions, + MatchSubstringOptions as MatchSubstringOptions, + ModeOptions as ModeOptions, + NullOptions as NullOptions, + PadOptions as PadOptions, + PartitionNthOptions as PartitionNthOptions, + QuantileOptions as QuantileOptions, + RandomOptions as RandomOptions, + RankOptions as RankOptions, + ReplaceSliceOptions as ReplaceSliceOptions, + ReplaceSubstringOptions as ReplaceSubstringOptions, + RoundOptions as RoundOptions, + RoundTemporalOptions as RoundTemporalOptions, + RoundToMultipleOptions as RoundToMultipleOptions, + ScalarAggregateFunction as ScalarAggregateFunction, + ScalarAggregateKernel as ScalarAggregateKernel, + ScalarAggregateOptions as ScalarAggregateOptions, + ScalarFunction as ScalarFunction, + ScalarKernel as ScalarKernel, + 
ScalarUdfContext as ScalarUdfContext, + SelectKOptions as SelectKOptions, + SetLookupOptions as SetLookupOptions, + SliceOptions as SliceOptions, + SortOptions as SortOptions, + SplitOptions as SplitOptions, + SplitPatternOptions as SplitPatternOptions, + StrftimeOptions as StrftimeOptions, + StrptimeOptions as StrptimeOptions, + StructFieldOptions as StructFieldOptions, + TakeOptions as TakeOptions, + TDigestOptions as TDigestOptions, + TrimOptions as TrimOptions, + Utf8NormalizeOptions as Utf8NormalizeOptions, + VarianceOptions as VarianceOptions, + VectorFunction as VectorFunction, + VectorKernel as VectorKernel, + WeekOptions as WeekOptions, + call_function as call_function, + function_registry as function_registry, + get_function as get_function, + list_functions as list_functions, + register_scalar_function as register_scalar_function, +) +from pyarrow.lib import ( + Array, + ChunkedArray, + DataType, + MemoryPool, + RecordBatch, + Scalar, + Table, +) +from pyarrow.vendored import docscrape as docscrape + +def cast( + arr: ArrayLike, + target_type: DataType | str | None = ..., + safe: bool | None = ..., + options: CastOptions | None = ..., +) -> Array: ... +def index( + data: ArrayLike, + value: Scalar, + start: int | None = ..., + end: int | None = ..., + *, + memory_pool: MemoryPool | None = ..., +) -> int: ... + +_TakeData = TypeVar("_TakeData", Array, ChunkedArray, RecordBatch, Table) + +def take( + data: _TakeData, + indices: Array | ChunkedArray, + *, + boundscheck: bool = ..., + memory_pool: MemoryPool | None = ..., +) -> _TakeData: ... + +_FillValues = TypeVar("_FillValues", bound=Array | ChunkedArray | Scalar) +_FillValue = TypeVar("_FillValue", bound=Array | ChunkedArray | Scalar) + +def fill_null(values: _FillValues, fill_value: _FillValue) -> _FillValues: ... +def top_k_unstable( + values: Array | ChunkedArray | RecordBatch | Table, + k: int, + sort_keys: list[str] | None = ..., + *, + memory_pool: MemoryPool | None = ..., +) -> Array: ... +def bottom_k_unstable( + values: Array | ChunkedArray | RecordBatch | Table, + k: int, + sort_keys: list[str] | None = ..., + *, + memory_pool: MemoryPool | None = ..., +) -> Array: ... +def random( + n: int, + *, + initializer: int | str = ..., + options: RandomOptions | None = ..., + memory_pool: MemoryPool | None = ..., +) -> Array: ... +def field(*name_or_index: int | str | tuple[int | str]): ... +def scalar(value: bool | int | float | str) -> Expression: ... 
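The TypeVar-based signatures above (take, fill_null) are what preserve the container type through a call, while cast accepts either an alias string or a DataType. A short sketch of the intended inference; the comments describe what a checker should infer, not runtime output:

    import pyarrow as pa
    import pyarrow.compute as pc

    tbl = pa.table({"x": [1, None, 3]})

    subset = pc.take(tbl, pa.array([0, 2]))                       # inferred as Table
    filled = pc.fill_null(pa.array([1, None, 3]), pa.scalar(0))   # inferred as Array
    as_f64 = pc.cast(pa.array([1, 2, 3]), "float64")              # string alias accepted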
diff --git a/pyarrow-stubs/csv.pyi b/pyarrow-stubs/csv.pyi new file mode 100644 index 00000000000..84c34f87bc5 --- /dev/null +++ b/pyarrow-stubs/csv.pyi @@ -0,0 +1,13 @@ +from pyarrow._csv import ( + ISO8601 as ISO8601, + ConvertOptions as ConvertOptions, + CSVStreamingReader as CSVStreamingReader, + CSVWriter as CSVWriter, + InvalidRow as InvalidRow, + ParseOptions as ParseOptions, + ReadOptions as ReadOptions, + WriteOptions as WriteOptions, + open_csv as open_csv, + read_csv as read_csv, + write_csv as write_csv, +) diff --git a/pyarrow-stubs/cuda.pyi b/pyarrow-stubs/cuda.pyi new file mode 100644 index 00000000000..8512e5a13e6 --- /dev/null +++ b/pyarrow-stubs/cuda.pyi @@ -0,0 +1,12 @@ +from pyarrow._cuda import ( + BufferReader as BufferReader, + BufferWriter as BufferWriter, + Context as Context, + CudaBuffer as CudaBuffer, + HostBuffer as HostBuffer, + IpcMemHandle as IpcMemHandle, + new_host_buffer as new_host_buffer, + read_message as read_message, + read_record_batch as read_record_batch, + serialize_record_batch as serialize_record_batch, +) diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi new file mode 100644 index 00000000000..68ec9657870 --- /dev/null +++ b/pyarrow-stubs/dataset.pyi @@ -0,0 +1,113 @@ +from os import PathLike +from typing import ( + Callable, + Iterable, + Literal, +) + +from pyarrow._dataset import ( + CsvFileFormat as CsvFileFormat, + CsvFragmentScanOptions as CsvFragmentScanOptions, + Dataset as Dataset, + DatasetFactory as DatasetFactory, + DirectoryPartitioning as DirectoryPartitioning, + FeatherFileFormat as FeatherFileFormat, + FileFormat as FileFormat, + FileFragment as FileFragment, + FilenamePartitioning as FilenamePartitioning, + FileSystemDataset as FileSystemDataset, + FileSystemDatasetFactory as FileSystemDatasetFactory, + FileSystemFactoryOptions as FileSystemFactoryOptions, + FileWriteOptions as FileWriteOptions, + Fragment as Fragment, + FragmentScanOptions as FragmentScanOptions, + HivePartitioning as HivePartitioning, + InMemoryDataset as InMemoryDataset, + IpcFileFormat as IpcFileFormat, + IpcFileWriteOptions as IpcFileWriteOptions, + Partitioning as Partitioning, + PartitioningFactory as PartitioningFactory, + Scanner as Scanner, + TaggedRecordBatch as TaggedRecordBatch, + UnionDataset as UnionDataset, + UnionDatasetFactory as UnionDatasetFactory, + WrittenFile as WrittenFile, +) +from pyarrow._dataset_orc import OrcFileFormat as OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory as ParquetDatasetFactory, + ParquetFactoryOptions as ParquetFactoryOptions, + ParquetFileFormat as ParquetFileFormat, + ParquetFileFragment as ParquetFileFragment, + ParquetFileWriteOptions as ParquetFileWriteOptions, + ParquetFragmentScanOptions as ParquetFragmentScanOptions, + ParquetReadOptions as ParquetReadOptions, + RowGroupInfo as RowGroupInfo, +) +from pyarrow.compute import ( + Expression as Expression, + field as field, + scalar as scalar, +) +from pyarrow.dataset import Dataset +from pyarrow.filesystem import FileSystem +from pyarrow.lib import ( + Array, + RecordBatch, + RecordBatchReader, + Schema, + Table, +) + +def __getattr__(name: str) -> None: ... +def partitioning( + schema: Schema | None = ..., + field_names: list[str] | None = ..., + flavor: str | None = ..., + dictionaries: dict[str, Array] | None = ..., +) -> Partitioning | PartitioningFactory: ... 
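A brief sketch of what the partitioning() factory above returns under its two main calling conventions; the year/month schema is only an example:

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Explicit schema with flavor="hive" -> a concrete Partitioning.
    hive = ds.partitioning(
        pa.schema([("year", pa.int16()), ("month", pa.int8())]),
        flavor="hive",
    )

    # Only field names -> a PartitioningFactory that infers key types later.
    by_dir = ds.partitioning(field_names=["year", "month"])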
+def parquet_dataset( + metadata_path: str | PathLike, + schema: Schema | None = ..., + filesystem: FileSystem | str | None = ..., + format: ParquetFileFormat | str | None = ..., + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = ..., + partition_base_dir: str | None = ..., +) -> FileSystemDataset: ... +def dataset( + source: str | Dataset | Iterable[str | Dataset | RecordBatch | RecordBatchReader], + schema: Schema | None = ..., + format: FileFormat | str | None = ..., + filesystem: FileSystem | str | None = ..., + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = ..., + partition_base_dir: str | None = ..., + exclude_invalid_files: bool | None = ..., + ignore_prefixes: list[str] | None = ..., +) -> Dataset: ... +def write_dataset( + data: Dataset + | Table + | RecordBatch + | RecordBatchReader + | Iterable[Table | RecordBatch], + base_dir: str, + *, + basename_template: str | None = ..., + format: FileFormat | str | None = ..., + partitioning: Partitioning | list[str] | None = ..., + partitioning_flavor: str | None = ..., + schema: Schema | None = ..., + filesystem: FileSystem | None = ..., + file_options: FileWriteOptions | None = ..., + use_threads: bool = ..., + max_partitions: int | None = ..., + max_open_files: int | None = ..., + max_rows_per_file: int | None = ..., + min_rows_per_group: int | None = ..., + max_rows_per_group: int | None = ..., + file_visitor: Callable[[WrittenFile], None] | None = ..., + existing_data_behavior: Literal[ + "error", "overwrite_or_ignore", "delete_matching" + ] = ..., + create_dir: bool = ..., +) -> None: ... diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi new file mode 100644 index 00000000000..7052f9bbac6 --- /dev/null +++ b/pyarrow-stubs/feather.pyi @@ -0,0 +1,67 @@ +from io import IOBase +from typing import ( + Literal, + overload, +) + +import pandas as pd +from pyarrow._feather import FeatherError as FeatherError +from pyarrow.lib import ( + ChunkedArray, + Codec as Codec, + NativeFile, + Schema, + Table as Table, + concat_tables as concat_tables, + schema as schema, +) +from pyarrow.vendored.version import Version as Version + +class FeatherDataset: + paths: list[str] + validate_schema: bool + schema: Schema + def __init__( + self, path_or_paths: list[str], validate_schema: bool = ... + ) -> None: ... + def read_table(self, columns: list[str] | None = ...) -> Table: ... + def validate_schemas(self, piece: str, table: Table) -> None: ... + def read_pandas( + self, columns: list[str] | None = ..., use_threads: bool = ... + ) -> pd.DataFrame: ... + +def check_chunked_overflow(name: str, col: ChunkedArray) -> None: ... +def write_feather( + df: pd.DataFrame, + dest: str, + compression: Literal["zstd", "lz4", "uncompressed"] | None = ..., + compression_level: int | None = ..., + chunksize: int | None = ..., + version: int = ..., +) -> None: ... +@overload +def read_feather( + source: str, + columns: list[str] | None = ..., + use_threads: bool = ..., + memory_map: Literal[True] = ..., +) -> pd.DataFrame: ... +@overload +def read_feather( + source: str | NativeFile | IOBase, + columns: list[str] | None = ..., + use_threads: bool = ..., +) -> pd.DataFrame: ... +@overload +def read_table( + source: str | NativeFile | IOBase, + columns: list[str] | None = ..., + use_threads: bool = ..., +) -> Table: ... +@overload +def read_table( + source: str, + columns: list[str] | None = ..., + memory_map: Literal[True] = ..., + use_threads: bool = ..., +) -> Table: ... 
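To exercise the feather signatures above, a small round trip; the file name is arbitrary and the compression choice assumes a build with zstd support:

    import pandas as pd
    import pyarrow.feather as feather

    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    feather.write_feather(df, "example.feather", compression="zstd")

    subset = feather.read_feather("example.feather", columns=["id"])  # pandas DataFrame
    tbl = feather.read_table("example.feather")                       # pyarrow Table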
diff --git a/pyarrow-stubs/filesystem.pyi b/pyarrow-stubs/filesystem.pyi new file mode 100644 index 00000000000..72aa06229c6 --- /dev/null +++ b/pyarrow-stubs/filesystem.pyi @@ -0,0 +1,56 @@ +from os import PathLike +from typing import ( + Any, + Generator, +) + +from pyarrow import ( + Table, + parquet, +) +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import S3FileSystem + +class FileSystem: + def cat(self, path: str) -> bytes: ... + def ls(self, path: str) -> list[str]: ... + def delete(self, path: str, recursive: bool = ...) -> None: ... + def disk_usage(self, path: str) -> int: ... + def stat(self, path: str) -> dict: ... + def rm(self, path: str, recursive: bool = ...): ... + def mv(self, path: str, new_path: str): ... + def rename(self, path: str, new_path: str) -> None: ... + def mkdir(self, path: str, create_parents: bool = ...) -> None: ... + def exists(self, path: str) -> bool: ... + def isdir(self, path: str) -> bool: ... + def isfile(self, path: str) -> bool: ... + def read_parquet( + self, + path: str, + columns: list[str] | None = ..., + metadata: parquet.FileMetaData | None = ..., + schema: parquet.ParquetSchema | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ) -> Table: ... + def open(self, path: str, mode: str = ...) -> None: ... + @property + def pathsep(self) -> str: ... + +class LocalFileSystem(FileSystem): + def __init__(self) -> None: ... + @classmethod + def get_instance(cls) -> LocalFileSystem: ... + def walk( + self, path: str + ) -> Generator[tuple[str, list[str], list[str]], None, None]: ... + +class DaskFileSystem(FileSystem): + fs: S3FileSystem | GcsFileSystem + def __init__(self, fs: S3FileSystem | GcsFileSystem) -> None: ... + +class S3FSWrapper(DaskFileSystem): ... + +def resolve_filesystem_and_path( + where: str | PathLike, filesystem: FileSystem | None = ... +) -> tuple[FileSystem | None, str]: ... 
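The legacy pyarrow.filesystem layer stubbed above is deprecated in favour of pyarrow.fs, but a short sketch shows how its pieces are typically used; the paths are placeholders:

    from pyarrow.filesystem import LocalFileSystem, resolve_filesystem_and_path

    local = LocalFileSystem.get_instance()
    print(local.exists("/tmp"), local.isdir("/tmp"))

    # Splits a path or URI into a (FileSystem | None, normalized path) pair,
    # matching the return annotation above.
    fs_impl, path = resolve_filesystem_and_path("/tmp/data.parquet")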
diff --git a/pyarrow-stubs/flight.pyi b/pyarrow-stubs/flight.pyi new file mode 100644 index 00000000000..557016ffdcd --- /dev/null +++ b/pyarrow-stubs/flight.pyi @@ -0,0 +1,47 @@ +from pyarrow._flight import ( + Action as Action, + ActionType as ActionType, + BasicAuth as BasicAuth, + CallInfo as CallInfo, + CertKeyPair as CertKeyPair, + ClientAuthHandler as ClientAuthHandler, + ClientMiddleware as ClientMiddleware, + ClientMiddlewareFactory as ClientMiddlewareFactory, + DescriptorType as DescriptorType, + FlightCallOptions as FlightCallOptions, + FlightCancelledError as FlightCancelledError, + FlightClient as FlightClient, + FlightDataStream as FlightDataStream, + FlightDescriptor as FlightDescriptor, + FlightEndpoint as FlightEndpoint, + FlightError as FlightError, + FlightInfo as FlightInfo, + FlightInternalError as FlightInternalError, + FlightMetadataReader as FlightMetadataReader, + FlightMetadataWriter as FlightMetadataWriter, + FlightMethod as FlightMethod, + FlightServerBase as FlightServerBase, + FlightServerError as FlightServerError, + FlightStreamChunk as FlightStreamChunk, + FlightStreamReader as FlightStreamReader, + FlightStreamWriter as FlightStreamWriter, + FlightTimedOutError as FlightTimedOutError, + FlightUnauthenticatedError as FlightUnauthenticatedError, + FlightUnauthorizedError as FlightUnauthorizedError, + FlightUnavailableError as FlightUnavailableError, + FlightWriteSizeExceededError as FlightWriteSizeExceededError, + GeneratorStream as GeneratorStream, + Location as Location, + MetadataRecordBatchReader as MetadataRecordBatchReader, + MetadataRecordBatchWriter as MetadataRecordBatchWriter, + RecordBatchStream as RecordBatchStream, + Result as Result, + SchemaResult as SchemaResult, + ServerAuthHandler as ServerAuthHandler, + ServerCallContext as ServerCallContext, + ServerMiddleware as ServerMiddleware, + ServerMiddlewareFactory as ServerMiddlewareFactory, + Ticket as Ticket, + TracingServerMiddlewareFactory as TracingServerMiddlewareFactory, + connect as connect, +) diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi new file mode 100644 index 00000000000..4a0c92e7d54 --- /dev/null +++ b/pyarrow-stubs/fs.pyi @@ -0,0 +1,58 @@ +from _typeshed import Incomplete +from pyarrow import PythonFile +from pyarrow._fs import ( + FileInfo as FileInfo, + FileSelector as FileSelector, + FileSystem as FileSystem, + FileSystemHandler as FileSystemHandler, + FileType as FileType, + LocalFileSystem as LocalFileSystem, + PyFileSystem as PyFileSystem, + SubTreeFileSystem as SubTreeFileSystem, +) +from pyarrow._gcsfs import GcsFileSystem as GcsFileSystem +from pyarrow._hdfs import HadoopFileSystem as HadoopFileSystem +from pyarrow._s3fs import ( + AwsDefaultS3RetryStrategy as AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy as AwsStandardS3RetryStrategy, + S3FileSystem as S3FileSystem, + S3LogLevel as S3LogLevel, + S3RetryStrategy as S3RetryStrategy, + finalize_s3 as finalize_s3, + initialize_s3 as initialize_s3, + resolve_s3_region as resolve_s3_region, +) + +FileStats = FileInfo + +def __getattr__(name: str) -> None: ... +def copy_files( + source: str, + destination: str, + source_filesystem: FileSystem | None = ..., + destination_filesystem: FileSystem | None = ..., + *, + chunk_size: int = ..., + use_threads: bool = ..., +) -> None: ... + +class FSSpecHandler(FileSystemHandler): + fs: Incomplete + def __init__(self, fs) -> None: ... + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + def get_type_name(self) -> str: ... 
+ def normalize_path(self, path: str) -> str: ... + def get_file_info(self, paths: list[str]) -> list[FileInfo]: ... + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + def create_dir(self, path: str, recursive: bool) -> None: ... + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool) -> None: ... # type: ignore + def delete_root_dir_contents(self) -> None: ... + def delete_file(self, path: str) -> None: ... + def move(self, src: str, dest: str) -> None: ... + def copy_file(self, src: str, dest: str) -> None: ... + def open_input_stream(self, path: str) -> PythonFile: ... + def open_input_file(self, path: str) -> PythonFile: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> PythonFile: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> PythonFile: ... diff --git a/pyarrow-stubs/hdfs.pyi b/pyarrow-stubs/hdfs.pyi new file mode 100644 index 00000000000..a51797ffb46 --- /dev/null +++ b/pyarrow-stubs/hdfs.pyi @@ -0,0 +1,33 @@ +from collections.abc import Generator +from typing import ( + Literal, + overload, +) + +from _typeshed import Incomplete +import pyarrow._hdfsio as _hdfsio +from pyarrow.filesystem import FileSystem as FileSystem +from pyarrow.util import implements as implements + +class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): + def __init__( + self, + host: str = ..., + port: int = ..., + user: str | None = ..., + kerb_ticket: Incomplete | None = ..., + driver: str = ..., + extra_conf: Incomplete | None = ..., + ) -> None: ... + def __reduce__(self) -> tuple: ... + def walk( + self, top_path: str + ) -> Generator[tuple[str, list[str], list[str]], None, None]: ... + +def connect( + host: str = ..., + port: int = ..., + user: Incomplete | None = ..., + kerb_ticket: Incomplete | None = ..., + extra_conf: Incomplete | None = ..., +): ... diff --git a/pyarrow-stubs/ipc.pyi b/pyarrow-stubs/ipc.pyi new file mode 100644 index 00000000000..bc424f734c5 --- /dev/null +++ b/pyarrow-stubs/ipc.pyi @@ -0,0 +1,102 @@ +from io import IOBase + +import pandas as pd +from pyarrow import ipc +import pyarrow.lib as lib +from pyarrow.lib import ( + Buffer, + IpcReadOptions as IpcReadOptions, + IpcWriteOptions as IpcWriteOptions, + MemoryPool, + Message as Message, + MessageReader as MessageReader, + MetadataVersion as MetadataVersion, + NativeFile, + ReadStats as ReadStats, + RecordBatchReader as RecordBatchReader, + Schema, + WriteStats as WriteStats, + get_record_batch_size as get_record_batch_size, + get_tensor_size as get_tensor_size, + read_message as read_message, + read_record_batch as read_record_batch, + read_schema as read_schema, + read_tensor as read_tensor, + write_tensor as write_tensor, +) + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | memoryview | Buffer | NativeFile | IOBase, + *, + options: ipc.IpcReadOptions | None = ..., + memory_pool: MemoryPool | None = ..., + ) -> None: ... + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | Buffer | NativeFile | IOBase, + schema: Schema, + *, + use_legacy_format: bool | None = ..., + options: ipc.IpcWriteOptions | None = ..., + ) -> None: ... 
+ +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | memoryview | Buffer | NativeFile | IOBase, + footer_offset: int | None = ..., + *, + options: ipc.IpcReadOptions | None = ..., + memory_pool: MemoryPool | None = ..., + ) -> None: ... + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | Buffer | NativeFile | IOBase, + schema: Schema, + *, + use_legacy_format: bool | None = ..., + options: ipc.IpcWriteOptions | None = ..., + ) -> None: ... + +def new_stream( + sink: str | Buffer | NativeFile | IOBase, + schema: Schema, + *, + use_legacy_format: bool | None = ..., + options: ipc.IpcWriteOptions | None = ..., +) -> RecordBatchStreamWriter: ... +def open_stream( + source: bytes | memoryview | Buffer | NativeFile | IOBase, + *, + options: ipc.IpcReadOptions | None = ..., + memory_pool: MemoryPool | None = ..., +) -> RecordBatchStreamReader: ... +def new_file( + sink: str | NativeFile | IOBase, + schema: Schema, + *, + use_legacy_format: bool | None = ..., + options: ipc.IpcWriteOptions | None = ..., +) -> RecordBatchFileWriter: ... +def open_file( + source: bytes | memoryview | Buffer | NativeFile | IOBase, + footer_offset: int | None = ..., + *, + options: ipc.IpcReadOptions | None = ..., + memory_pool: MemoryPool | None = ..., +) -> RecordBatchFileReader: ... +def serialize_pandas( + df: pd.DataFrame, + *, + nthreads: int | None = ..., + preserve_index: bool | None = ..., +) -> Buffer: ... +def deserialize_pandas( + buf: memoryview | Buffer, *, use_threads: bool = ... +) -> pd.DataFrame: ... diff --git a/pyarrow-stubs/json.pyi b/pyarrow-stubs/json.pyi new file mode 100644 index 00000000000..6d83ce4f85c --- /dev/null +++ b/pyarrow-stubs/json.pyi @@ -0,0 +1,5 @@ +from pyarrow._json import ( + ParseOptions as ParseOptions, + ReadOptions as ReadOptions, + read_json as read_json, +) diff --git a/pyarrow-stubs/jvm.pyi b/pyarrow-stubs/jvm.pyi new file mode 100644 index 00000000000..43ebf5b7845 --- /dev/null +++ b/pyarrow-stubs/jvm.pyi @@ -0,0 +1,19 @@ +from _typeshed import Incomplete +from pyarrow.lib import ( + Array, + Buffer, + Field, + RecordBatch, + Schema, +) + +class _JvmBufferNanny: + ref_manager: Incomplete + def __init__(self, jvm_buf) -> None: ... + def __del__(self) -> None: ... + +def jvm_buffer(jvm_buf) -> Buffer: ... +def field(jvm_field) -> Field: ... +def schema(jvm_schema) -> Schema: ... +def array(jvm_array) -> Array: ... +def record_batch(jvm_vector_schema_root) -> RecordBatch: ... 
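A minimal IPC stream round trip through an in-memory buffer, matching the new_stream/open_stream signatures above; the column name and values are arbitrary:

    import pyarrow as pa
    import pyarrow.ipc as ipc

    batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])

    sink = pa.BufferOutputStream()
    with ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)

    reader = ipc.open_stream(sink.getvalue())
    restored = reader.read_all()   # pyarrow Table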
diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi new file mode 100644 index 00000000000..40ab550d4c4 --- /dev/null +++ b/pyarrow-stubs/lib.pyi @@ -0,0 +1,2229 @@ +import collections.abc +import datetime as dt +from decimal import Decimal +import enum +import importlib._bootstrap +import io +from os import PathLike +from types import ModuleType +from typing import ( + Any, + Callable, + ClassVar, + Generator, + Generic, + ItemsView, + Iterable, + KeysView, + Literal, + NamedTuple, + TypeAlias, + TypeGuard, + TypeVar, + ValuesView, + overload, +) + +import _io # type: ignore +import numpy as np +from numpy.typing import ( + ArrayLike, + DTypeLike, + NDArray, +) +import pandas as pd +from pyarrow.compute import ( + CastOptions, + FunctionOptions, +) + +_ArrowType: TypeAlias = int | DataType +_builtin_slice = slice +DEFAULT_BUFFER_SIZE: int +NA: NullScalar +Type_BINARY: _ArrowType +Type_BOOL: _ArrowType +Type_DATE32: _ArrowType +Type_DATE64: _ArrowType +Type_DECIMAL128: _ArrowType +Type_DECIMAL256: _ArrowType +Type_DENSE_UNION: _ArrowType +Type_DICTIONARY: _ArrowType +Type_DOUBLE: _ArrowType +Type_DURATION: _ArrowType +Type_FIXED_SIZE_BINARY: _ArrowType +Type_FIXED_SIZE_LIST: _ArrowType +Type_FLOAT: _ArrowType +Type_HALF_FLOAT: _ArrowType +Type_INT16: _ArrowType +Type_INT32: _ArrowType +Type_INT64: _ArrowType +Type_INT8: _ArrowType +Type_INTERVAL_MONTH_DAY_NANO: _ArrowType +Type_LARGE_BINARY: _ArrowType +Type_LARGE_LIST: _ArrowType +Type_LARGE_STRING: _ArrowType +Type_LIST: _ArrowType +Type_MAP: _ArrowType +Type_NA: _ArrowType +Type_SPARSE_UNION: _ArrowType +Type_STRING: _ArrowType +Type_STRUCT: _ArrowType +Type_TIME32: _ArrowType +Type_TIME64: _ArrowType +Type_TIMESTAMP: _ArrowType +Type_UINT16: _ArrowType +Type_UINT32: _ArrowType +Type_UINT64: _ArrowType +Type_UINT8: _ArrowType +UnionMode_DENSE: int +UnionMode_SPARSE: int +V1: importlib._bootstrap.MetadataVersion +V2: importlib._bootstrap.MetadataVersion +V3: importlib._bootstrap.MetadataVersion +V4: importlib._bootstrap.MetadataVersion +V5: importlib._bootstrap.MetadataVersion +_NULL: NullScalar +__pc: ModuleType | None +_break_traceback_cycle_from_frame: function +_default_context_initialized: bool +_default_serialization_context: SerializationContext +_is_path_like: function +_pandas_api: _PandasAPIShim +_python_extension_types_registry: list +_registry_nanny: _ExtensionRegistryNanny +_stringify_path: function +contextmanager: function +cpp_build_info: importlib._bootstrap.BuildInfo +cpp_version: str +cpp_version_info: importlib._bootstrap.VersionInfo +have_signal_refcycle: bool +namedtuple: function + +class PyCapsule: ... + +_Self = TypeVar("_Self") + +_Array = TypeVar("_Array", bound="Array") +_ChunkedArray = TypeVar("_ChunkedArray", bound=ChunkedArray) + +_T = TypeVar("_T") +_T2 = TypeVar("_T2") +_Scalar = TypeVar("_Scalar", bound=Scalar) + +class Array(_PandasConvertible, Generic[_T, _Scalar]): + _name: Any + nbytes: int + null_count: int + offset: int + type: DataType[_T] + def __init__(self) -> None: ... + def _debug_print(self) -> Any: ... + @staticmethod + def _export_to_c(out_ptr: int, out_schema_ptr: int | None = ...) -> Array: ... + @staticmethod + def _import_from_c(in_ptr: int, type: DataType | int) -> Array: ... + def _to_pandas( + self, + options: dict[str, Any], + types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype | None] + | None = ..., + **kwargs, + ) -> pd.Series: ... + def buffers(self) -> list[Buffer | None]: ... 
+ @overload + def cast( + self, + target_type: Literal["bool", "boolean"], + safe: bool = ..., + options: CastOptions = ..., + ) -> BooleanArray: ... + @overload + def cast( + self, + target_type: Literal["i1", "int8"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Int8Array: ... + @overload + def cast( + self, + target_type: Literal["i2", "int16"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Int16Array: ... + @overload + def cast( + self, + target_type: Literal["i4", "int32"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Int32Array: ... + @overload + def cast( + self, + target_type: Literal["i8", "int64"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Int64Array: ... + @overload + def cast( + self, + target_type: Literal["u1", "uint8"], + safe: bool = ..., + options: CastOptions = ..., + ) -> UInt8Array: ... + @overload + def cast( + self, + target_type: Literal["u2", "uint16"], + safe: bool = ..., + options: CastOptions = ..., + ) -> UInt16Array: ... + @overload + def cast( + self, + target_type: Literal["u4", "uint32"], + safe: bool = ..., + options: CastOptions = ..., + ) -> UInt32Array: ... + @overload + def cast( + self, + target_type: Literal["u8", "uint64"], + safe: bool = ..., + options: CastOptions = ..., + ) -> UInt64Array: ... + @overload + def cast( + self, + target_type: Literal["f2", "halffloat", "float16"], + safe: bool = ..., + options: CastOptions = ..., + ) -> HalfFloatArray: ... + @overload + def cast( + self, + target_type: Literal["f4", "float", "float32"], + safe: bool = ..., + options: CastOptions = ..., + ) -> FloatArray: ... + @overload + def cast( + self, + target_type: Literal["f8", "double", "float64"], + safe: bool = ..., + options: CastOptions = ..., + ) -> DoubleArray: ... + @overload + def cast( + self, + target_type: Literal["string", "str", "utf8"], + safe: bool = ..., + options: CastOptions = ..., + ) -> StringArray: ... + @overload + def cast( + self, + target_type: Literal["binary"], + safe: bool = ..., + options: CastOptions = ..., + ) -> BinaryArray: ... + @overload + def cast( + self, + target_type: Literal["large_string", "large_str", "large_utf8"], + safe: bool = ..., + options: CastOptions = ..., + ) -> LargeStringArray: ... + @overload + def cast( + self, + target_type: Literal["large_binary"], + safe: bool = ..., + options: CastOptions = ..., + ) -> LargeBinaryArray: ... + @overload + def cast( + self, + target_type: Literal["date32", "date32[day]"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Date32Array: ... + @overload + def cast( + self, + target_type: Literal["date64", "date64[ms]"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Date64Array: ... + @overload + def cast( + self, + target_type: Literal["time32[s]", "time32[ms]"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Time32Array: ... + @overload + def cast( + self, + target_type: Literal["time64[us]", "time64[ns]"], + safe: bool = ..., + options: CastOptions = ..., + ) -> Time64Array: ... + @overload + def cast( + self, + target_type: Literal[ + "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" + ], + safe: bool = ..., + options: CastOptions = ..., + ) -> TimestampArray: ... + @overload + def cast( + self, + target_type: Literal[ + "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" + ], + safe: bool = ..., + options: CastOptions = ..., + ) -> DurationArray: ... 
+ @overload + def cast( + self, + target_type: Literal["month_day_nano_interval"], + safe: bool = ..., + options: CastOptions = ..., + ) -> MonthDayNanoIntervalArray: ... + @overload + def cast( + self, + target_type: DataType[_T2] | None = ..., + safe: bool = ..., + options: CastOptions = ..., + ) -> Array[_T2, Scalar[_T2]]: ... + def dictionary_encode(self, null_encoding: str = ...) -> DictionaryArray: ... + def diff(self, other: Array) -> str: ... + def drop_null(self: _Array) -> _Array: ... + def equals(self, other: Array) -> bool: ... + def fill_null(self: _Array, fill_value: _T) -> _Array: ... + def filter( + self: _Array, + mask: list[bool] | BooleanArray, + *, + null_selection_behavior: Literal["drop", "emit_null"] = ..., + ) -> _Array: ... + def format(self, **kwargs) -> Any: ... + @staticmethod + def from_buffers( + type: DataType, + length: int, + buffers: list[Buffer], + null_count: int = ..., + offset: int = ..., + children: list[_Array] = ..., + ) -> _Array: ... + @staticmethod + def from_pandas( + obj: pd.Series | ArrayLike, + mask: BooleanArray = ..., + type: DataType[_T2] = ..., + safe: bool = ..., + memory_pool: MemoryPool = ..., + ) -> Array[_T2, Scalar[_T2]] | ChunkedArray[_T2, Scalar[_T2]]: ... + def get_total_buffer_size(self) -> int: ... + def index( + self, + value: Scalar | object, + start: int | None = ..., + end: int | None = ..., + *, + memory_pool: MemoryPool | None = ..., + ) -> Int64Scalar: ... + def is_null(self, *, nan_is_null: bool = ...) -> BooleanArray: ... + def is_valid(self) -> BooleanArray: ... + def slice(self: _Array, offset: int = ..., length: int | None = ...) -> _Array: ... + def sum(self, **kwargs) -> Any: ... + def take( + self: _Array, + indices: list[int] + | IntegerArray + | NDArray[np.signedinteger | np.unsignedinteger], + ) -> _Array: ... + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> NDArray: ... + def to_pylist(self) -> list[_T]: ... + def to_string( + self, + *, + indent: int = ..., + top_level_indent: int = ..., + window: int = ..., + container_window: int = ..., + skip_new_lines: bool = ..., + ) -> str: ... + def tolist(self) -> list[_T]: ... + def unique(self: _Array) -> _Array: ... + def validate(self, *, full: bool = ...) -> None: ... + def value_counts(self) -> StructArray: ... + @overload + def view(self, target_type: Literal["bool", "boolean"]) -> BooleanArray: ... + @overload + def view(self, target_type: Literal["i1", "int8"]) -> Int8Array: ... + @overload + def view(self, target_type: Literal["i2", "int16"]) -> Int16Array: ... + @overload + def view(self, target_type: Literal["i4", "int32"]) -> Int32Array: ... + @overload + def view(self, target_type: Literal["i8", "int64"]) -> Int64Array: ... + @overload + def view(self, target_type: Literal["u1", "uint8"]) -> UInt8Array: ... + @overload + def view(self, target_type: Literal["u2", "uint16"]) -> UInt16Array: ... + @overload + def view(self, target_type: Literal["u4", "uint32"]) -> UInt32Array: ... + @overload + def view(self, target_type: Literal["u8", "uint64"]) -> UInt64Array: ... + @overload + def view( + self, target_type: Literal["f2", "halffloat", "float16"] + ) -> HalfFloatArray: ... + @overload + def view(self, target_type: Literal["f4", "float", "float32"]) -> FloatArray: ... + @overload + def view(self, target_type: Literal["f8", "double", "float64"]) -> DoubleArray: ... + @overload + def view(self, target_type: Literal["string", "str", "utf8"]) -> StringArray: ... 
+ @overload + def view(self, target_type: Literal["binary"]) -> BinaryArray: ... + @overload + def view( + self, target_type: Literal["large_string", "large_str", "large_utf8"] + ) -> LargeStringArray: ... + @overload + def view(self, target_type: Literal["large_binary"]) -> LargeBinaryArray: ... + @overload + def view(self, target_type: Literal["date32", "date32[day]"]) -> Date32Array: ... + @overload + def view(self, target_type: Literal["date64", "date64[ms]"]) -> Date64Array: ... + @overload + def view(self, target_type: Literal["time32[s]", "time32[ms]"]) -> Time32Array: ... + @overload + def view(self, target_type: Literal["time64[us]", "time64[ns]"]) -> Time64Array: ... + @overload + def view( + self, + target_type: Literal[ + "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" + ], + ) -> TimestampArray: ... + @overload + def view( + self, + target_type: Literal[ + "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" + ], + ) -> DurationArray: ... + @overload + def view( + self, + target_type: Literal["month_day_nano_interval"], + ) -> MonthDayNanoIntervalArray: ... + @overload + def view(self, target_type: DataType[_T2]) -> Array[_T2, Scalar[_T2]]: ... + def __array__(self, dtype: DTypeLike = ...) -> NDArray: ... + def __eq__(self, other) -> bool: ... + @overload + def __getitem__(self, key: int) -> _Scalar: ... + @overload + def __getitem__(self: _Array, key: _builtin_slice) -> _Array: ... + def __iter__(self) -> Generator[_Scalar, None, None]: ... + def __len__(self) -> int: ... + def __sizeof__(self) -> int: ... + +class ArrowCancelled(ArrowException): + def __init__(self, message: str, signum: int = ...) -> None: ... + +class ArrowCapacityError(ArrowException): ... +class ArrowException(Exception): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... + +ArrowIOError = IOError + +class BaseExtensionType(DataType[_T]): + extension_name: str + storage_type: DataType[_T] + def __init__(self, *args, **kwargs) -> None: ... + @overload + def wrap_array( + self, storage: Array[_T2, _Scalar] + ) -> ExtensionArray[_T, ExtensionScalar[_T], Array[_T2, _Scalar]]: ... + @overload + def wrap_array( + self, storage: ChunkedArray[_T2, _Scalar] + ) -> ChunkedArray[_T, ExtensionScalar[_T]]: ... + +class BaseListArray(Array[list[_T], _Scalar]): + def __init__(cls, *args, **kwargs) -> None: ... + def flatten(self) -> Array[_T, _Scalar]: ... + def value_lengths(self) -> Int32Array: ... + def value_parent_indices(self) -> Int64Array: ... + +class BinaryArray(Array[_T, BinaryScalar]): + total_values_length: int + +class BinaryScalar(Scalar[_T]): + def as_buffer(self) -> Buffer: ... + def as_py(self) -> _T: ... + +class BooleanArray(Array[bool, BooleanScalar]): + false_count: int + true_count: int + +class BooleanScalar(Scalar[bool]): ... + +class Buffer(_Weakrefable): + address: int + is_cpu: bool + is_mutable: bool + parent: Buffer | None + size: int + def equals(self, other) -> bool: ... + def hex(self) -> bytes: ... + def slice(self, offset: int = ..., length: int | None = ...) -> Buffer: ... + def to_pybytes(self) -> bytes: ... + def __eq__(self, other) -> bool: ... + @overload + def __getitem__(self, key: int) -> int: ... 
+ @overload + def __getitem__(self, key: _builtin_slice) -> Buffer: ... + def __len__(self) -> int: ... + +class BufferOutputStream(NativeFile): + def getvalue(self) -> Buffer: ... + +class BufferReader(NativeFile): ... +class BufferedIOBase(_io._BufferedIOBase, io.IOBase): ... + +class BufferedInputStream(NativeFile): + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = ... + ) -> None: ... + def detach(self) -> NativeFile: ... + +class BufferedOutputStream(NativeFile): + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = ... + ) -> None: ... + def detach(self) -> NativeFile: ... + +class BuildInfo(NamedTuple): + build_type: str + compiler_flags: str + compiler_id: str + compiler_version: str + full_so_version: str + git_description: str + git_id: str + package_kind: str + so_version: str + version: str + version_info: str + +class ChunkedArray(_PandasConvertible, Generic[_T, _Scalar]): + _name: str | None + chunks: list[Array[_T, _Scalar]] + nbytes: int + null_count: int + num_chunks: int + type: DataType[_T] + @property + def data(self: _ChunkedArray) -> _ChunkedArray: ... + def _to_pandas(self, options, types_mapper=..., **kwargs) -> Any: ... + def cast(self, target_type=..., safe=..., options=...) -> Any: ... + def chunk(self, i: int) -> Array[_T, _Scalar]: ... + def combine_chunks(self, memory_pool: MemoryPool | None = ...) -> Table: ... + def dictionary_encode( + self: _ChunkedArray, null_encoding: str = ... + ) -> _ChunkedArray: ... + def drop_null(self: _ChunkedArray) -> _ChunkedArray: ... + def equals(self, other) -> bool: ... + def fill_null(self: _ChunkedArray, fill_value: _T) -> _ChunkedArray: ... + def filter( + self: _ChunkedArray, + mask: list[bool] | BooleanArray, + *, + null_selection_behavior: Literal["drop", "emit_null"] = ..., + ) -> _ChunkedArray: ... + def flatten( + self: _ChunkedArray, memory_pool: MemoryPool | None = ... + ) -> _ChunkedArray: ... + def format(self, **kwargs) -> str: ... + def get_total_buffer_size(self) -> int: ... + def index( + self, + value: Scalar | object, + start: int | None = ..., + end: int | None = ..., + *, + memory_pool: MemoryPool | None = ..., + ) -> Int64Scalar: ... + def is_null(self) -> ChunkedArray[bool, BooleanScalar]: ... + def is_valid(self) -> ChunkedArray[bool, BooleanScalar]: ... + def iterchunks(self) -> Generator[Array[_T, _Scalar], None, None]: ... + def length(self) -> int: ... + def slice( + self: _ChunkedArray, offset: int = ..., length: int | None = ... + ) -> _ChunkedArray: ... + def take( + self: _ChunkedArray, + indices: list[int] + | IntegerArray + | NDArray[np.signedinteger | np.unsignedinteger], + ) -> _ChunkedArray: ... + def to_numpy(self) -> NDArray: ... + def to_pylist(self) -> list[_T]: ... + def to_string( + self, + *, + indent: int = ..., + window: int = ..., + container_window: int = ..., + skip_new_lines: bool = ..., + ) -> str: ... + def unify_dictionaries( + self: _ChunkedArray, memory_pool: MemoryPool = ... + ) -> _ChunkedArray: ... + def unique(self) -> ChunkedArray[int, Int64Scalar]: ... + def validate(self, *, full: bool = ...) -> None: ... + def value_counts(self) -> StructArray: ... + def __array__(self, dtype: DTypeLike = ...) -> NDArray: ... + def __eq__(self, other) -> bool: ... + @overload + def __getitem__(self, key: int) -> _Scalar: ... + @overload + def __getitem__(self: _ChunkedArray, key: _builtin_slice) -> _ChunkedArray: ... + def __iter__(self) -> Generator[_Scalar, None, None]: ... 
+    def __len__(self) -> int: ...
+    def __sizeof__(self) -> int: ...
+
+_COMPRESSION = Literal[
+    "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy"
+]
+
+class Codec(_Weakrefable):
+    compression_level: int | None
+    name: str
+    def __init__(
+        self,
+        compression: _COMPRESSION,
+        compression_level: int | None = ...,
+    ) -> None: ...
+    @overload
+    def compress(
+        self,
+        buf: Buffer | bytes | memoryview,
+        memory_pool: MemoryPool | None = ...,
+    ) -> Buffer: ...
+    @overload
+    def compress(
+        self,
+        buf: Buffer | bytes | memoryview,
+        asbytes: Literal[True] = ...,
+        memory_pool: MemoryPool | None = ...,
+    ) -> bytes: ...
+    def decompress(
+        self, buf, decompressed_size=..., asbytes=..., memory_pool=...
+    ) -> Any: ...
+    @staticmethod
+    def default_compression_level(compression: _COMPRESSION) -> int: ...
+    @staticmethod
+    def detect(path: str | PathLike) -> Codec: ...
+    @staticmethod
+    def is_available(compression: _COMPRESSION) -> bool: ...
+    @staticmethod
+    def maximum_compression_level(compression: _COMPRESSION) -> int: ...
+    @staticmethod
+    def minimum_compression_level(compression: _COMPRESSION) -> int: ...
+    @staticmethod
+    def supports_compression_level(compression: _COMPRESSION) -> bool: ...
+
+class CompressedInputStream(NativeFile):
+    def __init__(
+        self,
+        stream: str | PathLike | NativeFile | IOBase,
+        compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"],
+    ) -> None: ...
+
+class CompressedOutputStream(NativeFile):
+    def __init__(
+        self,
+        stream: str | PathLike | NativeFile | IOBase,
+        compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"],
+    ) -> None: ...
+
+class DataType(_Weakrefable, Generic[_T]):
+    bit_width: int
+    id: int
+    num_buffers: int
+    num_fields: int
+    def _export_to_c(self, out_ptr: int) -> None: ...
+    def _import_from_c(self, in_ptr: int) -> Any: ...
+    def equals(self, other) -> bool: ...
+    def field(self, i: int) -> Field: ...
+    def to_pandas_dtype(self) -> DTypeLike: ...
+    def __eq__(self, other) -> bool: ...
+
+class Date32Array(NumericArray[dt.date, Date32Scalar]): ...
+class Date32Scalar(Scalar[dt.date]): ...
+class Date64Array(NumericArray[dt.date, Date64Scalar]): ...
+class Date64Scalar(Scalar[dt.date]): ...
+class Decimal128Array(FixedSizeBinaryArray): ...
+class Decimal128Scalar(Scalar[Decimal]): ...
+
+class Decimal128Type(FixedSizeBinaryType):
+    precision: int
+    scale: int
+
+class Decimal256Array(FixedSizeBinaryArray): ...
+class Decimal256Scalar(Scalar[Decimal]): ...
+
+class Decimal256Type(FixedSizeBinaryType):
+    precision: int
+    scale: int
+
+class DenseUnionType(UnionType): ...
+
+class DeserializationCallbackError(ArrowSerializationError):
+    def __init__(self, message: str, type_id) -> None: ...
+
+class DictionaryArray(Array[dict, DictionaryScalar]):
+    dictionary: Any
+    indices: Any
+    @classmethod
+    def __init__(cls, *args, **kwargs) -> None: ...
+    def dictionary_decode(self: _Array) -> _Array: ...
+    def dictionary_encode(self) -> DictionaryArray: ...  # type: ignore
+    @staticmethod
+    def from_arrays(
+        indices: Array | NDArray | pd.Series,
+        dictionary: Array | NDArray | pd.Series,
+        mask: NDArray | pd.Series = ...,
+        ordered: bool = ...,
+        from_pandas: bool = ...,
+        safe: bool = ...,
+        memory_pool: MemoryPool = ...,
+    ) -> DictionaryArray: ...
+    @staticmethod
+    def from_buffers(  # type: ignore
+        type: DataType,
+        length: int,
+        buffers: list[Buffer],
+        dictionary: Array | NDArray | pd.Series,
+        null_count: int = ...,
+        offset: int = ...,
+    ) -> DictionaryArray: ...
+
+class DictionaryMemo(_Weakrefable): ...
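+
+# Note: a usage sketch of the API typed above (illustrative only, not part of
+# the stubs). DictionaryArray.from_arrays pairs an index array with a
+# dictionary of values, e.g.
+#   DictionaryArray.from_arrays(array([0, 1, 0]), array(["a", "b"]))
+# which is the dictionary-encoded equivalent of array(["a", "b", "a"]).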
+ +class DictionaryScalar(Scalar[dict]): + dictionary: Any + index: Any + value: Any + +class DictionaryType(DataType): + index_type: Any + ordered: Any + value_type: Any + +class DoubleArray(FloatingPointArray[DoubleScalar]): ... +class DoubleScalar(Scalar[float]): ... +class DurationArray(NumericArray[dt.timedelta, DurationScalar]): ... + +class DurationScalar(Scalar[dt.timedelta]): + value: Any + +class DurationType(DataType[dt.timedelta]): + unit: Literal["s", "ms", "us", "ns"] + +_StorageArray = TypeVar("_StorageArray", bound=Array) + +class ExtensionArray(Array, Generic[_T, _Scalar, _StorageArray]): + storage: _StorageArray + @staticmethod + def from_storage( + typ: BaseExtensionType[_T], value: Array[_T, Scalar[_T]] + ) -> ExtensionArray[_T, _Scalar, Array[_T, Scalar[_T]]]: ... + +class ExtensionScalar(Scalar[_T]): + value: Scalar[_T] + @staticmethod + def from_storage( + self, typ: BaseExtensionType[_T], value: object + ) -> ExtensionScalar[_T]: ... + +class ExtensionType(BaseExtensionType[_T]): + def __init__(self, storage_type: DataType[_T], extension_name: str) -> None: ... + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: DataType[_T], serialized + ) -> ExtensionType[_T]: ... + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + def __arrow_ext_serialize__(self) -> bytes: ... + def __eq__(self, other) -> bool: ... + +_Field = TypeVar("_Field", bound=Field) + +class Field(_Weakrefable, Generic[_T]): + metadata: dict + name: str + nullable: bool + type: DataType[_T] + def _export_to_c(self, out_ptr: int) -> None: ... + def _import_from_c(self, in_ptr: int) -> None: ... + def equals(self, other: Field, check_metadata: bool = ...) -> bool: ... + def flatten(self) -> list[Field]: ... + def remove_metadata(self: _Field) -> _Field: ... + def with_metadata(self: _Field, metadata: dict[str, str]) -> _Field: ... + def with_name(self: _Field, name: str) -> _Field: ... + def with_nullable(self: _Field, nullable) -> _Field: ... + def with_type(self, new_type: DataType[_T2]) -> Field[_T2]: ... + def __eq__(self, other) -> bool: ... + +class FixedSizeBinaryArray(Array[_T, FixedSizeBinaryScalar]): ... +class FixedSizeBinaryScalar(BinaryScalar[_T]): ... + +class FixedSizeBinaryType(DataType[_T]): + byte_width: int + +class FixedSizeBufferWriter(NativeFile): + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + def set_memcopy_threshold(self, threshold: int) -> None: ... + +_Values = TypeVar("_Values", bound=Array) + +class FixedSizeListArray(BaseListArray, Generic[_T, _Scalar, _Values]): + values: _Values + @overload + @staticmethod + def from_arrays( + values: _Values, type: DataType[_T] | None = ... + ) -> FixedSizeListArray[_T, Scalar[_T], _Values]: ... + @overload + @staticmethod + def from_arrays( + values: Array[_T, Scalar[_T]], list_size: int | None = ... + ) -> FixedSizeListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ... + +class FixedSizeListScalar(ListScalar[_T]): ... + +class FixedSizeListType(DataType[list[_T]]): + list_size: int + value_field: Field[_T] + value_type: DataType[_T] + +class FloatArray(FloatingPointArray[FloatScalar]): ... +class FloatScalar(Scalar[float]): ... +class FloatingPointArray(NumericArray[float, _Scalar]): ... +class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): ... +class HalfFloatScalar(Scalar[float]): ... +class IOBase(_io._IOBase): ... 
+class Int16Array(IntegerArray[Int16Scalar]): ...
+class Int16Scalar(Scalar[int]): ...
+class Int32Array(IntegerArray[Int32Scalar]): ...
+class Int32Scalar(Scalar[int]): ...
+class Int64Array(IntegerArray[Int64Scalar]): ...
+class Int64Scalar(Scalar[int]): ...
+class Int8Array(IntegerArray[Int8Scalar]): ...
+class Int8Scalar(Scalar[int]): ...
+class IntegerArray(NumericArray[int, _Scalar]): ...
+
+class IpcReadOptions(_Weakrefable):
+    ensure_native_endian: bool
+    included_fields: list | None
+    use_threads: bool
+    def __init__(
+        self,
+        *,
+        use_threads: bool = ...,
+        ensure_native_endian: bool = ...,
+        included_fields: list | None = ...,
+    ) -> None: ...
+
+class IpcWriteOptions(_Weakrefable):
+    allow_64bit: bool
+    compression: str | Codec | None
+    emit_dictionary_deltas: bool
+    metadata_version: MetadataVersion
+    unify_dictionaries: bool
+    use_legacy_format: bool
+    use_threads: bool
+    def __init__(
+        self,
+        *,
+        metadata_version: MetadataVersion = ...,
+        allow_64bit: bool = ...,
+        use_legacy_format: bool = ...,
+        compression: str | Codec | None = ...,
+        use_threads: bool = ...,
+        emit_dictionary_deltas: bool = ...,
+        unify_dictionaries: bool = ...,
+    ) -> None: ...
+
+class KeyValueMetadata(_Metadata, collections.abc.Mapping):
+    def __init__(self, __arg0__: dict | None = ..., **kwargs) -> None: ...
+    def equals(self, other) -> bool: ...
+    def get_all(self, key: str) -> list: ...
+    def items(self) -> ItemsView[str, Any]: ...
+    def key(self, i: int) -> str: ...
+    def keys(self) -> KeysView[str]: ...
+    def to_dict(self) -> dict: ...
+    def value(self, i: int) -> Any: ...
+    def values(self) -> ValuesView[Any]: ...
+    def __contains__(self, other) -> bool: ...
+    def __eq__(self, other) -> bool: ...
+    def __getitem__(self, key: str) -> Any: ...
+    def __iter__(self) -> Generator[str, None, None]: ...
+    def __len__(self) -> int: ...
+
+class LargeBinaryArray(Array[bytes, LargeBinaryScalar]):
+    total_values_length: int
+
+class LargeBinaryScalar(BinaryScalar[bytes]): ...
+
+class LargeListArray(BaseListArray, Generic[_T, _Scalar, _Values]):
+    offsets: int
+    values: _Values
+    @staticmethod
+    @overload
+    def from_arrays(
+        offsets: Int64Array,
+        values: Array[_T, Scalar[_T]],
+        pool: MemoryPool | None = ...,
+        mask: bool | None = ...,
+    ) -> LargeListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ...
+    @staticmethod
+    @overload
+    def from_arrays(
+        offsets: Int64Array,
+        values: _Array,
+        type: DataType[_T],
+        pool: MemoryPool | None = ...,
+        mask: bool | None = ...,
+    ) -> LargeListArray[_T, Scalar[_T], _Array]: ...
+
+class LargeListScalar(ListScalar[_T]): ...
+
+class LargeListType(DataType[list[_T]]):
+    value_field: Field[_T]
+    value_type: DataType[_T]
+
+class LargeStringArray(Array[str, LargeStringScalar]):
+    @staticmethod
+    def from_buffers(  # type: ignore
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = ...,
+        null_count: int = ...,
+        offset: int = ...,
+    ) -> LargeStringArray: ...
+
+class LargeStringScalar(StringScalar): ...
+
+class ListArray(BaseListArray, Generic[_T, _Scalar, _Values]):
+    offsets: int
+    values: _Values
+
+    @staticmethod
+    @overload
+    def from_arrays(
+        offsets: Int32Array,
+        values: Array[_T, Scalar[_T]],
+        pool: MemoryPool | None = ...,
+        mask: bool | None = ...,
+    ) -> ListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ...
+ @staticmethod + @overload + def from_arrays( + offsets: Int32Array, + values: _Array, + type: DataType[_T], + pool: MemoryPool | None = ..., + mask: bool | None = ..., + ) -> ListArray[_T, Scalar[_T], _Array]: ... + +class ListScalar(Scalar[list[_T]]): + values: list[_T] + +class ListType(DataType[list[_T]]): + value_field: Field[_T] + value_type: _T + +class LoggingMemoryPool(MemoryPool): ... + +_Key = TypeVar("_Key") +_Item = TypeVar("_Item") + +class MapArray( + ListArray[dict[_Key, _Item], MapScalar, StructArray], Generic[_Key, _Item] +): + items: Array[_Item, Scalar[_Item]] + keys: Array[_Key, Scalar[_Key]] + + @staticmethod + def from_arrays( # type: ignore + offsets: Int32Array, + keys: Array[_Key, Scalar[_Key]] | list[_Key], + items: Array[_Item, Scalar[_Item]] | list[_Item], + pool: MemoryPool | None = ..., + ) -> MapArray[_Key, _Item]: ... + +class MapScalar(ListScalar[dict[_Key, _Item]]): ... + +class MapType(DataType[dict[_Key, _Item]]): + item_field: Field[_Item] + item_type: DataType[_Item] + key_field: Field[_Key] + key_type: DataType[_Key] + +class MemoryMappedFile(NativeFile): + def _open(self, path: str, mode: Literal["r", "r+", "w"] = ...) -> Any: ... + @staticmethod + def create(path: str, size: int) -> MemoryMappedFile: ... + def fileno(self) -> int: ... + def resize(self, new_size: int) -> None: ... + +class MemoryPool(_Weakrefable): + backend_name: str + def bytes_allocated(self) -> int: ... + def max_memory(self) -> int: ... + def release_unused(self) -> None: ... + +class Message(_Weakrefable): + body: Any + metadata: Any + metadata_version: MetadataVersion + type: str + def equals(self, other: Message) -> bool: ... + def serialize( + self, alignment: int = ..., memory_pool: MemoryPool | None = ... + ) -> Any: ... + def serialize_to( + self, + sink: NativeFile, + alignment: int = ..., + memory_pool: MemoryPool | None = ..., + ) -> None: ... + +class MessageReader(_Weakrefable): + @staticmethod + def open_stream(source) -> MessageReader: ... + def read_next_message(self) -> Message: ... + def __iter__(self) -> Generator[Message, None, None]: ... + +class MetadataVersion(enum.IntEnum): + V1: ClassVar[importlib._bootstrap.MetadataVersion] = ... + V2: ClassVar[importlib._bootstrap.MetadataVersion] = ... + V3: ClassVar[importlib._bootstrap.MetadataVersion] = ... + V4: ClassVar[importlib._bootstrap.MetadataVersion] = ... + V5: ClassVar[importlib._bootstrap.MetadataVersion] = ... + +class MockOutputStream(NativeFile): + def size(self) -> int: ... + +class MonthDayNano(NamedTuple): + days: int + months: int + nanoseconds: int + +class MonthDayNanoIntervalArray(Array[MonthDayNano, MonthDayNanoIntervalScalar]): ... + +class MonthDayNanoIntervalScalar(Scalar[MonthDayNano]): + value: MonthDayNano + +_NativeFile = TypeVar("_NativeFile", bound=NativeFile) + +class NativeFile(_Weakrefable): + _default_chunk_size: ClassVar[int] = ... + closed: bool + mode: Literal["rb", "wb", "rb+"] + def close(self) -> None: ... + def download( + self, stream_or_path: str | IOBase, buffer_size: int | None = ... + ) -> None: ... + def fileno(self) -> int: ... + def flush(self) -> None: ... + def get_stream(self: _NativeFile, file_offset: int, nbytes: int) -> _NativeFile: ... + def isatty(self) -> bool: ... + def metadata(self) -> dict: ... + def read(self, nbytes: int | None = ...) -> bytes: ... + def read1(self, nbytes: int | None = ...) -> bytes: ... + def read_at(self, nbytes: int, offset: int) -> bytes: ... + def read_buffer(self, nbytes: int | None = ...) -> Buffer: ... 
+ def readable(self) -> bool: ... + def readall(self) -> bytes: ... + def readinto(self, b: Buffer | memoryview) -> int: ... + def readline(self, size: int = ...) -> bytes | None: ... + def readlines(self, hint: int = ...) -> list[bytes]: ... + def seek(self, position: int, whence: int = ...) -> None: ... + def seekable(self) -> bool: ... + def size(self) -> int: ... + def tell(self) -> int: ... + def truncate(self) -> None: ... + def upload(self, stream: IOBase, buffer_size: int = ...) -> None: ... + def writable(self) -> bool: ... + def write(self, data: bytes | memoryview | Buffer) -> int: ... + def writelines(self, lines: list[bytes]) -> None: ... + def __enter__(self: _NativeFile) -> _NativeFile: ... + def __exit__(self, exc_type, exc_value, tb) -> Any: ... + def __iter__(self: _NativeFile) -> _NativeFile: ... + def __next__(self) -> bytes: ... + +class NullArray(Array[None, NullScalar]): ... +class NullScalar(Scalar[None]): ... +class NumericArray(Array[_T, _Scalar]): ... +class OSFile(NativeFile): ... +class ProxyMemoryPool(MemoryPool): ... + +class PyExtensionType(ExtensionType[_T]): + def __init__(self, storage_type: DataType[_T]) -> None: ... + +class PythonFile(NativeFile): + def __init__( + self, handle: io.BytesIO, mode: Literal["rb", "wb", "rb+"] | None = ... + ) -> None: ... + +class ReadStats(importlib._bootstrap.ReadStats): ... + +class RecordBatch(_PandasConvertible): + columns: list[Array] + nbytes: int + num_columns: int + num_rows: int + schema: Schema + def __init__(self, *args, **kwargs) -> None: ... + def column(self, i: int) -> Array: ... + def drop_null(self: _Self) -> _Self: ... + def equals(self, other: RecordBatch, check_metadata: bool = ...) -> bool: ... + def field(self, i: int) -> Field: ... + def filter( + self: _Self, + mask: list[bool] | BooleanArray, + null_selection_behavior: Literal["drop", "emit_null"] = ..., + ) -> _Self: ... + @staticmethod + @overload + def from_arrays( + arrays: list[Array], + *, + names: list[str], + metadata: dict | None = ..., + ) -> RecordBatch: ... + @staticmethod + @overload + def from_arrays( + arrays: list[Array], + *, + schema: list[Schema], + metadata: dict | None = ..., + ) -> RecordBatch: ... + @overload + @staticmethod + def from_pandas( + df: pd.DataFrame, + *, + preserve_index: bool | None = ..., + nthreads: int | None = ..., + ) -> RecordBatch: ... + @overload + @staticmethod + def from_pandas( + df: pd.DataFrame, + *, + schema: Schema, + preserve_index: bool | None = ..., + nthreads: int | None = ..., + ) -> RecordBatch: ... + @overload + @staticmethod + def from_pandas( + df: pd.DataFrame, + *, + columns: list[str], + preserve_index: bool | None = ..., + nthreads: int | None = ..., + ) -> RecordBatch: ... + @staticmethod + def from_pydict( + mapping: dict[str, Array | list], + schema: Schema | None = ..., + metadata: dict | None = ..., + ) -> RecordBatch: ... + @staticmethod + def from_struct_array(struct_array: StructArray) -> RecordBatch: ... + def get_total_buffer_size(self) -> int: ... + def replace_schema_metadata(self: _Self, metadata: dict | None = ...) -> _Self: ... + def serialize(self, memory_pool: MemoryPool | None = ...) -> Buffer: ... + def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... + def take( + self: _Self, + indices: list[int] + | IntegerArray + | NDArray[np.signedinteger | np.unsignedinteger], + ) -> _Self: ... + def to_pydict(self) -> dict[str, list]: ... + def to_string(self, show_metadata: bool = ...) -> str: ... + def validate(self, *, full: bool = ...) 
-> None: ... + def __eq__(self, other) -> bool: ... + @overload + def __getitem__(self, key: str) -> Array: ... + @overload + def __getitem__(self: _Self, key: _builtin_slice) -> _Self: ... + def __len__(self) -> int: ... + def __sizeof__(self) -> int: ... + +class RecordBatchReader(_Weakrefable): + schema: Schema + def __init__(cls, *args, **kwargs) -> None: ... + def _export_to_c(self, out_ptr: int) -> None: ... + @staticmethod + def _import_from_c(in_ptr: int) -> RecordBatchReader: ... + def close(self) -> None: ... + @staticmethod + def from_batches( + schema: Schema, batches: Iterable[RecordBatch] + ) -> RecordBatchReader: ... + def read_all(self) -> Table: ... + def read_next_batch(self) -> RecordBatch: ... + def read_pandas(self, **options) -> pd.DataFrame: ... + def __enter__(self: _Self) -> _Self: ... + def __exit__(self, exc_type, exc_val, exc_tb) -> None: ... + def __iter__(self) -> Generator[RecordBatch, None, None]: ... + +class ResizableBuffer(Buffer): + def resize(self, new_size: int, shrink_to_fit: bool = ...) -> None: ... + +class RuntimeInfo(NamedTuple): + detected_simd_level: str + simd_level: str + +class Scalar(_Weakrefable, Generic[_T]): + is_valid: bool + type: DataType[_T] + def __init__(self) -> None: ... + def as_py(self) -> _T: ... + @overload + def cast(self, target_type: Literal["bool", "boolean"]) -> BooleanScalar: ... + @overload + def cast(self, target_type: Literal["i1", "int8"]) -> Int8Scalar: ... + @overload + def cast(self, target_type: Literal["i2", "int16"]) -> Int16Scalar: ... + @overload + def cast(self, target_type: Literal["i4", "int32"]) -> Int32Scalar: ... + @overload + def cast(self, target_type: Literal["i8", "int64"]) -> Int64Scalar: ... + @overload + def cast(self, target_type: Literal["u1", "uint8"]) -> UInt8Scalar: ... + @overload + def cast(self, target_type: Literal["u2", "uint16"]) -> UInt16Scalar: ... + @overload + def cast(self, target_type: Literal["u4", "uint32"]) -> UInt32Scalar: ... + @overload + def cast(self, target_type: Literal["u8", "uint64"]) -> UInt64Scalar: ... + @overload + def cast( + self, target_type: Literal["f2", "halffloat", "float16"] + ) -> HalfFloatScalar: ... + @overload + def cast(self, target_type: Literal["f4", "float", "float32"]) -> FloatScalar: ... + @overload + def cast(self, target_type: Literal["f8", "double", "float64"]) -> DoubleScalar: ... + @overload + def cast(self, target_type: Literal["string", "str", "utf8"]) -> StringScalar: ... + @overload + def cast(self, target_type: Literal["binary"]) -> BinaryScalar: ... + @overload + def cast( + self, target_type: Literal["large_string", "large_str", "large_utf8"] + ) -> LargeStringScalar: ... + @overload + def cast(self, target_type: Literal["large_binary"]) -> LargeBinaryScalar: ... + @overload + def cast(self, target_type: Literal["date32", "date32[day]"]) -> Date32Scalar: ... + @overload + def cast(self, target_type: Literal["date64", "date64[ms]"]) -> Date64Scalar: ... + @overload + def cast(self, target_type: Literal["time32[s]", "time32[ms]"]) -> Time32Scalar: ... + @overload + def cast( + self, target_type: Literal["time64[us]", "time64[ns]"] + ) -> Time64Scalar: ... + @overload + def cast( + self, + target_type: Literal[ + "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" + ], + ) -> TimestampScalar: ... + @overload + def cast( + self, + target_type: Literal[ + "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" + ], + ) -> DurationScalar: ... 
+ @overload + def cast( + self, + target_type: Literal["month_day_nano_interval"], + ) -> MonthDayNanoIntervalScalar: ... + @overload + def cast(self, target_type: DataType) -> Scalar: ... + def equals(self, other: Scalar) -> bool: ... + def __eq__(self, other) -> bool: ... + +class Schema(_Weakrefable): + metadata: dict[bytes, bytes] | None + names: list[str] + pandas_metadata: dict[str, Any] | None + types: list[DataType] + def _export_to_c(self, out_ptr: int) -> None: ... + def _field(self, i: int) -> Field: ... + @staticmethod + def _import_from_c(in_ptr: int) -> Schema: ... + def add_metadata( + self: _Self, metadata: dict[str | bytes, str | bytes] + ) -> _Self: ... + def append(self: _Self, field: Field) -> _Self: ... + def empty_table(self: _Self) -> _Self: ... + def equals(self, other: Schema, check_metadata: bool = ...) -> bool: ... + def field(self, i: int) -> Field: ... + def field_by_name(self, name: str) -> Field | None: ... + @classmethod + def from_pandas( + cls, df: pd.DataFrame, preserve_index: bool | None = ... + ) -> Schema: ... + def get_all_field_indices(self, name: str) -> list[int]: ... + def get_field_index(self, name: str) -> int: ... + def insert(self: _Self, i: int, field: Field) -> _Self: ... + def remove(self: _Self, i: int) -> _Self: ... + def remove_metadata(self: _Self) -> _Self: ... + def serialize(self, memory_pool: MemoryPool | None = ...) -> Buffer: ... + def set(self: _Self, i: int, field: Field) -> _Self: ... + def to_string( + self, + truncate_metadata: bool = ..., + show_field_metadata: bool = ..., + show_schema_metadata: bool = ..., + ) -> str: ... + def with_metadata( + self: _Self, metadata: dict[str | bytes, str | bytes] + ) -> _Self: ... + def __eq__(self, other) -> bool: ... + def __getitem__(self, key: int) -> Field: ... + def __iter__(self) -> Generator[Field, None, None]: ... + def __len__(self) -> int: ... + def __sizeof__(self) -> int: ... + +class SerializationCallbackError(ArrowSerializationError): + def __init__(self, message: str, example_object) -> None: ... + +class SerializationContext(_Weakrefable): + def _deserialize_callback(self, serialized_obj: dict) -> Any: ... + def _serialize_callback(self, obj: Any) -> dict: ... + def clone(self: _Self) -> _Self: ... + def deserialize(self, what) -> Any: ... + def deserialize_components(self, what) -> Any: ... + def register_type( + self, + type_: type, + type_id: str, + pickle: bool = ..., + custom_serializer: Callable[[Any], bytes] | None = ..., + custom_deserializer: Callable[[bytes], Any] | None = ..., + ) -> Any: ... + def serialize(self, obj: Any) -> Any: ... + def serialize_to(self, value, sink) -> Any: ... + def set_pickle( + self, serializer: Callable[[Any], bytes], deserializer: Callable[[bytes], Any] + ) -> None: ... + +class SerializedPyObject(_Weakrefable): + base: Any + total_bytes: int + + def deserialize(self, context: SerializationContext | None = ...) -> Any: ... + @staticmethod + def from_components(components: dict[str, Any]) -> SerializedPyObject: ... + def to_buffer(self, nthreads: int = ...) -> Buffer: ... + def to_components(self, memory_pool: MemoryPool | None = ...) -> dict[str, Any]: ... + def write_to(self, sink) -> Any: ... + +class SignalStopHandler: + stop_token: StopToken + def _init_signals(self) -> Any: ... + def __enter__(self: _Self) -> _Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + +class SparseCOOTensor(_Weakrefable, Generic[_T]): + dim_names: tuple[str, ...] 
+ has_canonical_format: bool + is_mutable: bool + ndim: int + non_zero_length: int + shape: tuple[int, ...] + size: int + type: DataType[_T] + def dim_name(self, i: int) -> str: ... + def equals(self, other: SparseCOOTensor) -> bool: ... + @classmethod + def from_dense_numpy( + cls, obj: NDArray, dim_names: list[str] | None = ... + ) -> SparseCOOTensor: ... + @staticmethod + def from_numpy( + data: NDArray, coords: NDArray, shape: tuple, dim_names: list[str] | None = ... + ) -> SparseCOOTensor: ... + @staticmethod + def from_pydata_sparse( + obj, dim_names: list[str] | None = ... + ) -> SparseCOOTensor: ... + @staticmethod + def from_scipy(obj, dim_names: list[str] | None = ...) -> SparseCOOTensor: ... + @staticmethod + def from_tensor(self, obj: Tensor[_T]) -> SparseCOOTensor[_T]: ... + def to_numpy(self) -> NDArray: ... + def to_pydata_sparse(self) -> Any: ... + def to_scipy(self) -> Any: ... + def to_tensor(self) -> Tensor[_T]: ... + def __eq__(self, other) -> bool: ... + +class SparseCSCMatrix(_Weakrefable, Generic[_T]): + dim_names: tuple[str, ...] + is_mutable: bool + ndim: int + non_zero_length: int + shape: tuple[int, ...] + size: int + type: DataType[_T] + def dim_name(self, i: int) -> str: ... + def equals(self, other: SparseCSCMatrix) -> bool: ... + @classmethod + def from_dense_numpy( + cls, obj: NDArray, dim_names: list[str] | None = ... + ) -> SparseCSCMatrix: ... + @staticmethod + def from_numpy( + data: NDArray, + indptr: NDArray, + indices: NDArray, + shape: tuple[int, ...], + dim_names: list[str] | None = ..., + ) -> SparseCSCMatrix: ... + def from_scipy(self, obj, dim_names: list[str] | None = ...) -> SparseCSCMatrix: ... + def from_tensor(self, obj: Tensor[_T]) -> SparseCSCMatrix[_T]: ... + def to_numpy(self) -> NDArray: ... + def to_scipy(self) -> Any: ... + def to_tensor(self) -> Tensor[_T]: ... + def __eq__(self, other) -> bool: ... + +class SparseCSFTensor(_Weakrefable, Generic[_T]): + dim_names: tuple[str, ...] + is_mutable: bool + ndim: int + non_zero_length: int + shape: tuple[int, ...] + size: int + type: DataType[_T] + def dim_name(self, i: int) -> str: ... + def equals(self, other: SparseCSFTensor) -> bool: ... + @staticmethod + def from_dense_numpy( + obj: NDArray, dim_names: list[str] | None = ... + ) -> SparseCSFTensor: ... + @staticmethod + def from_numpy( + data: NDArray, + indptr: NDArray, + indices: NDArray, + shape: tuple[int, ...], + axis_order: list[str] | None = ..., + dim_names=..., + ) -> SparseCSFTensor: ... + @staticmethod + def from_tensor(obj: Tensor[_T]) -> SparseCSFTensor[_T]: ... + def to_numpy(self) -> NDArray: ... + def to_tensor(self) -> Tensor[_T]: ... + def __eq__(self, other) -> bool: ... + +class SparseCSRMatrix(_Weakrefable, Generic[_T]): + dim_names: tuple[str, ...] + is_mutable: bool + ndim: int + non_zero_length: int + shape: tuple[int, ...] + size: int + type: DataType[_T] + def dim_name(self, i: int) -> str: ... + def equals(self, other: SparseCSRMatrix) -> bool: ... + @classmethod + def from_dense_numpy( + cls, obj: NDArray, dim_names: list[str] | None = ... + ) -> SparseCSRMatrix: ... + @staticmethod + def from_numpy( + data: NDArray, + indptr: NDArray, + indices: NDArray, + shape: tuple[int, ...], + dim_names: list[str] | None = ..., + ) -> SparseCSRMatrix: ... + def from_scipy(self, obj, dim_names: list[str] | None = ...) -> SparseCSRMatrix: ... + def from_tensor(self, obj: Tensor[_T]) -> SparseCSRMatrix[_T]: ... + def to_numpy(self) -> NDArray: ... + def to_scipy(self) -> Any: ... + def to_tensor(self) -> Tensor[_T]: ... 
+    def __eq__(self, other) -> bool: ...
+
+class SparseUnionType(UnionType): ...
+class StopToken: ...
+
+class StringArray(Array[str, StringScalar]):
+    @staticmethod
+    def from_buffers(  # type: ignore
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = ...,
+        null_count: int = ...,
+        offset: int = ...,
+    ) -> StringArray: ...
+
+class StringBuilder(_Weakrefable):
+    null_count: int
+    def __init__(self, memory_pool: MemoryPool | None = ...) -> None: ...
+    def append(self, value: str | bytes) -> None: ...
+    def append_values(self, values: list[str | bytes]) -> None: ...
+    def finish(self) -> StringArray: ...
+    def __len__(self) -> int: ...
+
+class StringScalar(BinaryScalar[str]): ...
+
+class StructArray(Array[dict, StructScalar]):
+    def field(self, index: int | str) -> Array: ...
+    def flatten(self, memory_pool: MemoryPool | None = ...) -> list[Array]: ...
+    @staticmethod
+    def from_arrays(
+        arrays: list[Array],
+        names: list[str] | None = ...,
+        fields: list[Field] | None = ...,
+        mask: BooleanArray | None = ...,
+        memory_pool: MemoryPool | None = ...,
+    ) -> StructArray: ...
+
+class StructScalar(Scalar, collections.abc.Mapping):
+    def _as_py_tuple(self) -> Any: ...
+    def as_py(self) -> dict: ...
+    def items(self) -> ItemsView[str, Any]: ...
+    def __contains__(self, other) -> bool: ...
+    def __getitem__(self, index) -> Scalar: ...
+    def __iter__(self) -> Generator[str, None, None]: ...
+    def __len__(self) -> int: ...
+
+class StructType(DataType):
+    def field(self, i: int) -> Field: ...
+    def get_all_field_indices(self, name: str) -> list[int]: ...
+    def get_field_index(self, name: str) -> int: ...
+    def __getitem__(self, index) -> Field: ...
+    def __iter__(self) -> Generator[Field, None, None]: ...
+    def __len__(self) -> int: ...
+
+class Table(_PandasConvertible):
+    column_names: list[str]
+    columns: list[Array]
+    nbytes: int
+    num_columns: int
+    num_rows: int
+    schema: Schema
+    shape: tuple[int, ...]
+    def _column(self, i: int) -> Any: ...
+    def _ensure_integer_index(self, i) -> Any: ...
+    def _to_pandas(
+        self, options, categories=..., ignore_metadata=..., types_mapper=...
+    ) -> Any: ...
+    def add_column(
+        self: _Self, i: int, field_: str | Field, column: Array
+    ) -> _Self: ...
+    def append_column(self: _Self, field_: str | Field, column: Array) -> _Self: ...
+    def cast(
+        self,
+        target_schema: Schema,
+        safe: bool | None = ...,
+        options: CastOptions | None = ...,
+    ) -> Table: ...
+    def column(self, i: int | str) -> ChunkedArray: ...
+    def combine_chunks(self: _Self, memory_pool: MemoryPool | None = ...) -> _Self: ...
+    def drop_null(self: _Self) -> _Self: ...
+    def equals(self, other: Table, check_metadata: bool = ...) -> bool: ...
+    def field(self, i: int) -> Field: ...
+    def filter(
+        self: _Self,
+        mask: list[bool] | BooleanArray,
+        null_selection_behavior: Literal["drop", "emit_null"] = ...,
+    ) -> _Self: ...
+    def flatten(self, memory_pool: MemoryPool | None = ...) -> Table: ...
+    @staticmethod
+    def from_arrays(arrays, names=..., schema=..., metadata=...) -> Table: ...
+    @staticmethod
+    def from_batches(batches, schema: Schema | None = ...) -> Table: ...
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        schema: Schema | None = ...,
+        preserve_index: bool | None = ...,
+        nthreads: int | None = ...,
+        columns: list[str] | None = ...,
+        safe: bool = ...,
+    ) -> Table: ...
+    @staticmethod
+    def from_pydict(
+        mapping: dict,
+        schema: Schema | None = ...,
+        metadata: dict[str | bytes, str | bytes] | None = ...,
+    ) -> Table: ...
+ @staticmethod + def from_pylist( + mapping: list[dict], + schema: Schema | None = ..., + metadata: dict[str | bytes, str | bytes] | None = ..., + ) -> Table: ... + def get_total_buffer_size(self) -> int: ... + def group_by(self, keys: list[str]) -> TableGroupBy: ... + def itercolumns(self) -> Generator[ChunkedArray, None, None]: ... + def join( + self, + right_table: Table, + keys: str | list[str], + right_keys: str | list[str] | None = ..., + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ] = ..., + left_suffix: str | None = ..., + right_suffix: str | None = ..., + coalesce_keys: bool = ..., + use_threads: bool = ..., + ) -> Table: ... + def remove_column(self: _Self, i: int) -> _Self: ... + def replace_schema_metadata( + self: _Self, metadata: dict[str | bytes, str | bytes] | None = ... + ) -> _Self: ... + def select(self, columns: list[str]) -> Table: ... + def set_column( + self: _Self, i: int, field_: str | Field, column: Array + ) -> _Self: ... + def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... + def sort_by( + self, + sorting: Literal["ascending", "descending"] + | list[tuple[str, Literal["ascending", "descending"]]], + ) -> Table: ... + def take( + self: _Self, + indices: list[int] + | IntegerArray + | NDArray[np.signedinteger | np.unsignedinteger], + ) -> _Self: ... + def to_batches(self, max_chunksize: int | None = ...) -> list[RecordBatch]: ... + def to_pylist(self) -> list[dict]: ... + def to_reader(self, max_chunksize: int | None = ...) -> RecordBatchReader: ... + def to_string( + self, *, show_metadata: bool = ..., preview_cols: int = ... + ) -> str: ... + def unify_dictionaries( + self: _Self, memory_pool: MemoryPool | None = ... + ) -> _Self: ... + def validate(self, *, full: bool = ...) -> None: ... + def __eq__(self, other) -> bool: ... + @overload + def __getitem__(self, index: int | str) -> ChunkedArray: ... + @overload + def __getitem__(self, index: _builtin_slice) -> Table: ... + def __len__(self) -> int: ... + def __sizeof__(self) -> int: ... + +class TableGroupBy: + def __init__(self, table: Table, keys: str | list[str]) -> None: ... + def aggregate( + self, aggregations: list[tuple[str, str] | tuple[str, str, FunctionOptions]] + ) -> Table: ... + +class Tensor(_Weakrefable, Generic[_T]): + dim_names: list[str] + is_contiguous: bool + is_mutable: bool + ndim: int + shape: tuple[int, ...] + size: int + strides: tuple[int, ...] + type: DataType[_T] + def dim_name(self, i: int) -> str: ... + def equals(self, other: Tensor) -> bool: ... + @staticmethod + def from_numpy(obj: NDArray, dim_names: list[str] | None = ...) -> Tensor: ... + def to_numpy(self) -> NDArray: ... + def __eq__(self, other) -> bool: ... + +class TextIOBase(_io._TextIOBase, io.IOBase): ... +class Time32Array(NumericArray[dt.time, Time32Scalar]): ... +class Time32Scalar(Scalar[dt.time]): ... + +class Time32Type(DataType[dt.time]): + unit: str + +class Time64Array(NumericArray[dt.time, Time64Scalar]): ... +class Time64Scalar(Scalar[dt.time]): ... + +class Time64Type(DataType[dt.time]): + unit: Any + +class TimestampArray(NumericArray[dt.datetime, TimestampScalar]): ... + +class TimestampScalar(Scalar[dt.datetime]): + value: int + +class TimestampType(DataType[dt.datetime]): + tz: Any + unit: str + def to_pandas_dtype(self) -> DTypeLike: ... + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf) -> Any: ... 
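+
+# Note: a usage sketch of the grouping API typed above (illustrative only, not
+# part of the stubs). Table.group_by returns a TableGroupBy whose aggregate()
+# takes (column, aggregation) pairs, e.g.
+#   table.group_by(["key"]).aggregate([("value", "sum")])
+# which yields a new Table of per-group sums.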
+ +class TransformInputStream(NativeFile): ... +class UInt16Array(IntegerArray[UInt16Scalar]): ... +class UInt16Scalar(Scalar[int]): ... +class UInt32Array(IntegerArray[UInt32Scalar]): ... +class UInt32Scalar(Scalar[int]): ... +class UInt64Array(IntegerArray[UInt64Scalar]): ... +class UInt64Scalar(Scalar[int]): ... +class UInt8Array(IntegerArray[UInt8Scalar]): ... +class UInt8Scalar(Scalar[int]): ... + +class UnionArray(Array[Any, UnionScalar]): + offsets: Int32Array + type_codes: Int8Array + def child(self, pos: int) -> Array: ... + def field(self, pos: int) -> Array: ... + @staticmethod + def from_dense( + types: Int8Array, + value_offsets: Int32Array, + children: list, + field_names: list[str] | None = ..., + type_codes: list | None = ..., + ) -> UnionArray: ... + @staticmethod + def from_sparse( + types: Int8Array, + children: list, + field_names: list[str] | None = ..., + type_codes: list | None = ..., + ) -> UnionArray: ... + +class UnionScalar(Scalar): + type_code: Any + value: Any + +class UnionType(DataType): + mode: Any + type_codes: Any + def field(self, i) -> Field: ... + def __getitem__(self, index) -> Any: ... + def __iter__(self) -> Any: ... + def __len__(self) -> int: ... + +class UnknownExtensionType(PyExtensionType): + def __arrow_ext_serialize__(self) -> Any: ... + +class UnsupportedOperation(OSError, ValueError): ... + +class VersionInfo(NamedTuple): + major: str + minor: str + patch: str + +class WriteStats(importlib._bootstrap.WriteStats): + __slots__: ClassVar[tuple] = ... + +class _CRecordBatchWriter(_Weakrefable): + stats: Any + def close(self) -> None: ... + def write(self, table_or_batch: RecordBatch | Table) -> None: ... + def write_batch(self, batch: RecordBatch) -> None: ... + def write_table(self, table: Table, max_chunksize: int | None = ...) -> None: ... + def __enter__(self) -> _CRecordBatchWriter: ... + def __exit__(self, exc_type, exc_val, exc_tb) -> Any: ... + +class _ExtensionRegistryNanny(_Weakrefable): + def release_registry(self) -> None: ... + +class _Metadata(_Weakrefable): ... + +class _PandasAPIShim: + _array_like_types: Any + _categorical_type: Any + _compat_module: Any + _data_frame: Any + _datetimetz_type: Any + _extension_array: Any + _extension_dtype: Any + _index: Any + _is_extension_array_dtype: Any + _loose_version: Any + _pd: Any + _pd024: Any + _series: Any + _types_api: Any + _version: Any + categorical_type: Any + compat: Any + datetimetz_type: Any + extension_dtype: Any + has_sparse: Any + have_pandas: Any + loose_version: Any + pd: Any + version: Any + def __init__(self) -> None: ... + def assert_frame_equal(self, *args, **kwargs) -> Any: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + def get_rangeindex_attribute(self, level, name) -> Any: ... + def get_values(self, obj) -> Any: ... + def infer_dtype(self, obj) -> Any: ... + def is_array_like(self, obj) -> bool: ... + def is_categorical(self, obj) -> TypeGuard[pd.Categorical]: ... + def is_data_frame(self, obj) -> TypeGuard[pd.DataFrame]: ... + def is_datetimetz(self, obj) -> bool: ... + def is_extension_array_dtype(self, obj) -> bool: ... + def is_index(self, obj) -> TypeGuard[pd.Index]: ... + def is_series(self, obj) -> TypeGuard[pd.Series]: ... + def is_sparse(self, obj) -> bool: ... + def pandas_dtype(self, dtype: DTypeLike) -> DTypeLike: ... + def series(self, *args, **kwargs) -> pd.Series: ... 
+
+class _PandasConvertible(_Weakrefable):
+    def to_pandas(
+        self,
+        memory_pool: MemoryPool | None = ...,
+        categories: list[pd.Categorical] | None = ...,
+        strings_to_categorical: bool | None = ...,
+        zero_copy_only: bool | None = ...,
+        integer_object_nulls: bool | None = ...,
+        date_as_object: bool | None = ...,
+        timestamp_as_object: bool | None = ...,
+        use_threads: bool | None = ...,
+        deduplicate_objects: bool | None = ...,
+        ignore_metadata: bool | None = ...,
+        safe: bool | None = ...,
+        split_blocks: bool | None = ...,
+        self_destruct: bool | None = ...,
+        types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype]
+        | None = ...,
+    ) -> pd.Series | pd.DataFrame: ...
+
+class _ReadPandasMixin:
+    def read_pandas(self, **options) -> Any: ...
+
+class _ReadStats(NamedTuple):
+    num_dictionary_batches: int
+    num_dictionary_deltas: int
+    num_messages: int
+    num_record_batches: int
+    num_replaced_dictionaries: int
+
+class _RecordBatchFileReader(_Weakrefable):
+    num_record_batches: Any
+    schema: Any
+    stats: Any
+    @classmethod
+    def __init__(cls, *args, **kwargs) -> None: ...
+    def _open(
+        self,
+        source,
+        footer_offset=...,
+        options: IpcReadOptions = ...,
+        memory_pool: MemoryPool = ...,
+    ) -> Any: ...
+    def get_batch(self, i: int) -> Any: ...
+    def get_record_batch(self, *args, **kwargs) -> Any: ...
+    def read_all(self) -> Any: ...
+    def read_pandas(self, **options) -> Any: ...
+    def __enter__(self) -> Any: ...
+    def __exit__(self, exc_type, exc_value, traceback) -> Any: ...
+    def __reduce__(self) -> Any: ...
+    def __setstate__(self, state) -> Any: ...
+
+class _RecordBatchFileWriter(_RecordBatchStreamWriter):
+    @classmethod
+    def __init__(cls, *args, **kwargs) -> None: ...
+    def _open(self, sink, schema: Schema, options: IpcWriteOptions = ...) -> Any: ...
+    def __reduce__(self) -> Any: ...
+    def __setstate__(self, state) -> Any: ...
+
+class _RecordBatchStreamReader(RecordBatchReader):
+    stats: Any
+    def _open(
+        self, source, options: IpcReadOptions = ..., memory_pool: MemoryPool = ...
+    ) -> Any: ...
+
+class _RecordBatchStreamWriter(_CRecordBatchWriter):
+    _metadata_version: Any
+    _use_legacy_format: Any
+    @classmethod
+    def __init__(cls, *args, **kwargs) -> None: ...
+    def _open(self, sink, schema: Schema, options: IpcWriteOptions = ...) -> Any: ...
+    def __reduce__(self) -> Any: ...
+    def __setstate__(self, state) -> Any: ...
+
+class _Weakrefable: ...
+
+class _WriteStats(NamedTuple):
+    num_dictionary_batches: int
+    num_dictionary_deltas: int
+    num_messages: int
+    num_record_batches: int
+    num_replaced_dictionaries: int
+
+class ordered_dict:
+    def __init__(self, *args, **kwargs) -> None: ...
+    def clear(self, *args, **kwargs) -> Any: ...
+    def copy(self) -> ordered_dict: ...
+    @classmethod
+    def fromkeys(cls, *args, **kwargs) -> Any: ...
+    def get(self, *args, **kwargs) -> Any: ...
+    def items(self, *args, **kwargs) -> Any: ...
+    def keys(self, *args, **kwargs) -> Any: ...
+    def pop(self, *args, **kwargs) -> Any: ...
+    def popitem(self, *args, **kwargs) -> Any: ...
+    def setdefault(self, *args, **kwargs) -> Any: ...
+    def update(self, *args, **kwargs) -> Any: ...
+    def values(self, *args, **kwargs) -> Any: ...
+    @classmethod
+    def __class_getitem__(cls, *args, **kwargs) -> Any: ...
+    def __contains__(self, other) -> Any: ...
+    def __delitem__(self, other) -> Any: ...
+    def __eq__(self, other) -> Any: ...
+    def __ge__(self, other) -> Any: ...
+    def __getitem__(self, y) -> Any: ...
+    def __gt__(self, other) -> Any: ...
+    def __ior__(self, other) -> Any: ...
+    def __iter__(self) -> Any: ...
+    def __le__(self, other) -> Any: ...
+    def __len__(self) -> Any: ...
+    def __lt__(self, other) -> Any: ...
+    def __ne__(self, other) -> Any: ...
+    def __or__(self, other) -> Any: ...
+    def __reversed__(self) -> Any: ...
+    def __ror__(self, other) -> Any: ...
+    def __setitem__(self, index, object) -> Any: ...
+    def __sizeof__(self) -> Any: ...
+
+def __pyx_unpickle_SerializationContext(
+    __pyx_type, __pyx_checksum, __pyx_state
+) -> Any: ...
+def __pyx_unpickle__PandasAPIShim(
+    __pyx_type, __pyx_checksum, __pyx_state
+) -> Any: ...
+def __pyx_unpickle__PandasConvertible(
+    __pyx_type, __pyx_checksum, __pyx_state
+) -> Any: ...
+def __pyx_unpickle___Pyx_EnumMeta(*args, **kwargs) -> Any: ...
+def _datetime_from_int(value: int, unit, tzinfo=...) -> Any: ...
+def _deprecate_serialization(name) -> Any: ...
+def _deserialize(obj, context: SerializationContext = ...) -> Any: ...
+def _detect_compression(path) -> Any: ...
+def _empty_array(type: DataType) -> Any: ...
+def _from_pydict(cls, mapping, schema, metadata) -> Any: ...
+def _from_pylist(cls, mapping, schema, metadata) -> Any: ...
+def _gdb_test_session() -> Any: ...
+def _get_default_context() -> Any: ...
+def _handle_arrow_array_protocol(obj, type, mask, size) -> Any: ...
+def _is_primitive(type) -> Any: ...
+def _ndarray_to_arrow_type(values, type: DataType) -> Any: ...
+def _normalize_slice(arrow_obj, key: _builtin_slice) -> Any: ...
+def _pc() -> Any: ...
+def _read_serialized(source, base=...) -> Any: ...
+def _reconstruct_record_batch(columns, schema) -> Any: ...
+def _reconstruct_table(arrays, schema) -> Any: ...
+def _register_py_extension_type() -> Any: ...
+def _restore_array(data) -> Any: ...
+def _serialize(value, context: SerializationContext = ...) -> Any: ...
+def _unregister_py_extension_types() -> Any: ...
+@overload
+def allocate_buffer(
+    size: int,
+    memory_pool: MemoryPool | None = ...,
+) -> Buffer: ...
+@overload
+def allocate_buffer(
+    size: int, memory_pool: MemoryPool | None = ..., *, resizable: Literal[True]
+) -> ResizableBuffer: ...
+@overload
+def allocate_buffer(
+    size: int, memory_pool: MemoryPool | None = ..., *, resizable: Literal[False]
+) -> Buffer: ...
+def array(
+    obj: Iterable | NDArray | pd.Series,
+    type: DataType | None = ...,
+    mask: list[bool] | BooleanArray | None = ...,
+    size: int | None = ...,
+    from_pandas: bool | None = ...,
+    safe: bool = ...,
+    memory_pool: MemoryPool | None = ...,
+) -> Array | ChunkedArray: ...
+def as_buffer(o) -> Buffer: ...
+def asarray(values: Iterable, type: DataType | None = ...) -> Array: ...
+def benchmark_PandasObjectIsNull(obj: list) -> Any: ...
+def binary(length: int = ...) -> DataType[bytes]: ...
+def bool_() -> DataType[bool]: ...
+@overload
+def chunked_array(arrays: Array[_T, _Scalar]) -> ChunkedArray[_T, _Scalar]: ...
+@overload
+def chunked_array(
+    arrays: Array, type: DataType[_T]
+) -> ChunkedArray[_T, Scalar[_T]]: ...
+@overload
+def compress(
+    buf: Buffer | bytes | memoryview,
+    codec: str = ...,
+    *,
+    memory_pool: MemoryPool | None = ...,
+) -> Buffer: ...
+@overload
+def compress(
+    buf: Buffer | bytes | memoryview,
+    codec: str = ...,
+    *,
+    asbytes: Literal[True],
+    memory_pool: MemoryPool | None = ...,
+) -> bytes: ...
+def concat_arrays(
+    arrays: list[_Array], memory_pool: MemoryPool | None = ...
+) -> _Array: ...
+def concat_tables(
+    tables: list[Table], promote: bool = ..., memory_pool: MemoryPool | None = ...
+) -> Table: ...
+def cpu_count() -> int: ...
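+# Note: the overloads above encode return-type narrowing (illustrative only,
+# not part of the stubs): compress(buf) is typed to return a Buffer, while
+# compress(buf, asbytes=True) returns bytes, and
+# allocate_buffer(64, resizable=True) is typed to return a ResizableBuffer.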
+def create_memory_map(path: str | PathLike, size: int) -> MemoryMappedFile: ... +def date32() -> DataType[dt.date]: ... +def date64() -> DataType[dt.date]: ... +def decimal128(precision: int, scale: int | None = ...) -> DataType[Decimal]: ... +def decimal256(precision: int, scale: int | None = ...) -> DataType[Decimal]: ... +@overload +def decompress( + buf: Buffer | bytes | memoryview, + decompressed_size: int | None = ..., + codec: str = ..., + *, + memory_pool: MemoryPool | None = ..., +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | memoryview, + decompressed_size: int | None = ..., + codec: str = ..., + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = ..., +) -> bytes: ... +def default_memory_pool() -> MemoryPool: ... +def dense_union( + child_fields: list[Field], type_codes: list[int] | None = ... +) -> DenseUnionType: ... +def deserialize(obj, context: SerializationContext = ...) -> object: ... +def deserialize_components( + components: dict, context: SerializationContext = ... +) -> object: ... +def deserialize_from( + source: NativeFile, base: object, context: SerializationContext = ... +) -> object: ... +def dictionary( + index_type: DataType, value_type: DataType, ordered: bool = ... +) -> DictionaryType: ... +def duration(unit: Literal["s", "ms", "us", "ns"]) -> DurationType: ... +def enable_signal_handlers(enable: bool) -> None: ... +def encode_file_path(path: str) -> bytes: ... +def ensure_metadata(meta: dict, allow_none: bool = ...) -> KeyValueMetadata: ... +def ensure_type(ty: DataType, allow_none=...) -> DataType: ... +def field( + name: str | bytes, + type: DataType[_T], + nullable: bool = ..., + metadata: dict | None = ..., +) -> Field[_T]: ... +def float16() -> DataType[float]: ... +def float32() -> DataType[float]: ... +def float64() -> DataType[float]: ... +def foreign_buffer(address: int, size: int, base: object | None = ...) -> None: ... +def from_numpy_dtype(dtype: DTypeLike) -> DataType: ... +def frombytes(o: bytes, *, safe: bool = ...) -> str: ... +def get_record_batch_size(batch: RecordBatch) -> int: ... +def get_tensor_size(tensor: Tensor) -> int: ... +def infer_type( + values: Iterable, mask: list[bool] | BooleanArray = ..., from_pandas: bool = ... +) -> DataType: ... +def input_stream( + source: str | PathLike | Buffer | IOBase, + compression: str | None = ..., + buffer_size: int | None = ..., +) -> NativeFile: ... +def int16() -> DataType[int]: ... +def int32() -> DataType[int]: ... +def int64() -> DataType[int]: ... +def int8() -> DataType[int]: ... +def io_thread_count() -> int: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_named_tuple(cls: Any) -> bool: ... +def jemalloc_memory_pool() -> Any: ... +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... +def large_binary() -> DataType[bytes]: ... +def large_list(value_type: DataType[_T] | Field[_T]) -> LargeListType[_T]: ... +def large_string() -> DataType[str]: ... +def large_utf8() -> DataType[str]: ... +def list_( + value_type: DataType[_T] | Field[_T], list_size: int = ... +) -> ListType[_T]: ... +def log_memory_allocations(enable: bool = ...) -> None: ... +def logging_memory_pool(parent: MemoryPool) -> MemoryPool: ... +def map_( + key_type: DataType[_Key], item_type: DataType[_Item], keys_sorted: bool = ... +) -> MapType[_Key, _Item]: ... +def memory_map(path: str, mode: Literal["r", "r+", "w"] = ...) -> MemoryMappedFile: ... 
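+# Note: a usage sketch of the nested type factories typed above (illustrative
+# only, not part of the stubs): list_(int64()) yields a ListType and
+# map_(string(), int32()) yields a MapType keyed by strings.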
+def mimalloc_memory_pool() -> MemoryPool: ... +def month_day_nano_interval() -> DataType[MonthDayNano]: ... +def null() -> DataType[None]: ... +def nulls( + size: int, type: DataType[_T] = ..., memory_pool: MemoryPool | None = ... +) -> Array[_T, Scalar[_T]]: ... +def output_stream( + source: str | PathLike | Buffer | IOBase, + compression: str | None = ..., + buffer_size: int = ..., +) -> NativeFile: ... +def proxy_memory_pool(parent: MemoryPool) -> MemoryPool: ... +def py_buffer(obj: bytes | memoryview) -> Buffer: ... +def read_message(source: NativeFile | IOBase | Buffer) -> Message: ... +def read_record_batch( + obj: Message | Buffer | memoryview, + schema: Schema, + dictionary_memo: DictionaryMemo | None = ..., +) -> RecordBatch: ... +def read_schema( + obj: Buffer | Message | memoryview, dictionary_memo: DictionaryMemo | None = ... +) -> Schema: ... +def read_serialized(source: NativeFile, base: object | None = ...) -> object: ... +def read_tensor(source: NativeFile) -> Tensor: ... +@overload +def record_batch( + data: pd.DataFrame, + schema: Schema | None = ..., + metadata: dict | None = ..., +) -> RecordBatch: ... +@overload +def record_batch( + data: list[Array | ChunkedArray], + names: list[str], + metadata: dict | None = ..., +) -> RecordBatch: ... +@overload +def record_batch( + data: list[Array | ChunkedArray], + schema: Schema, + metadata: dict | None = ..., +) -> RecordBatch: ... +def register_extension_type(ext_type: BaseExtensionType) -> None: ... +def repeat(value, size: int, memory_pool: MemoryPool | None = ...) -> Array: ... +def runtime_info() -> RuntimeInfo: ... +def scalar( + value: Any, + type: DataType[_T], + *, + from_pandas: bool | None = ..., + memory_pool: MemoryPool | None = ..., +) -> Scalar[_T]: ... +def schema(fields: Iterable[Field], metadata: dict | None = ...) -> Schema: ... +def serialize(value: object, context: SerializationContext | None = ...) -> object: ... +def serialize_to( + value: object, sink: NativeFile | IOBase, context: SerializationContext | None = ... +) -> None: ... +def set_cpu_count(count: int) -> None: ... +def set_io_thread_count(count: int) -> None: ... +def set_memory_pool(pool: MemoryPool) -> None: ... +def sparse_union( + child_fields: Iterable[Field], type_codes: list[int] = ... +) -> SparseUnionType: ... +def string() -> DataType[str]: ... +def string_to_tzinfo(name: str) -> dt.tzinfo: ... +def struct(fields: Iterable[Field]) -> StructType: ... +def supported_memory_backends() -> list[str]: ... +def system_memory_pool() -> MemoryPool: ... +@overload +def table( + df: pd.DataFrame, schema: Schema | None = ..., nthreads: int | None = ... +) -> Table: ... +@overload +def table( + arrays: list[Array], + schema: Schema, + metadata: dict | None = ..., + nthreads: int | None = ..., +) -> Table: ... +@overload +def table( + arrays: list[Array], + names: list[str], + metadata: dict | None = ..., + nthreads: int | None = ..., +) -> Table: ... +def table_to_blocks( + options: dict, table: Table, categories: list[str], extension_columns: list[str] +) -> list[dict]: ... +def time32(unit: Literal["s", "ms"]) -> DataType[dt.time]: ... +def time64(unit: Literal["us", "ns"]) -> DataType[dt.time]: ... +def timestamp(unit, tz=...) -> Any: ... +def tobytes(o: str | bytes) -> bytes: ... +def total_allocated_bytes() -> int: ... +def transcoding_input_stream(stream, src_encoding, dest_encoding) -> Any: ... +def type_for_alias(name: str) -> DataType: ... +def tzinfo_to_string(tz: dt.tzinfo) -> str: ... +def uint16() -> DataType[int]: ... 
+def uint32() -> DataType[int]: ...
+def uint64() -> DataType[int]: ...
+def uint8() -> DataType[int]: ...
+def unify_schemas(schemas: list[Schema]) -> Schema: ...
+def union(
+    child_fields: Iterable[Field],
+    mode: Literal["sparse", "dense"],
+    type_codes: list[int] | None = ...,
+) -> UnionType: ...
+def unregister_extension_type(type_name: str) -> None: ...
+def utf8() -> DataType[str]: ...
+def write_tensor(tensor: Tensor, dest: NativeFile) -> None: ...
diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi
new file mode 100644
index 00000000000..b68e85ef179
--- /dev/null
+++ b/pyarrow-stubs/orc.pyi
@@ -0,0 +1,99 @@
+from io import IOBase
+
+from pyarrow.fs import FileSystem
+from pyarrow import _orc
+from pyarrow.lib import (
+    KeyValueMetadata,
+    NativeFile,
+    RecordBatch,
+    Schema,
+    Table,
+)
+
+class ORCFile:
+    reader: _orc.ORCReader
+    def __init__(self, source: str | NativeFile | IOBase) -> None: ...
+    @property
+    def metadata(self) -> KeyValueMetadata: ...
+    @property
+    def schema(self) -> Schema: ...
+    @property
+    def nrows(self) -> int: ...
+    @property
+    def nstripes(self) -> int: ...
+    @property
+    def file_version(self) -> str: ...
+    @property
+    def software_version(self) -> str: ...
+    @property
+    def compression(self) -> str: ...
+    @property
+    def compression_size(self) -> int: ...
+    @property
+    def writer(self) -> str | int: ...
+    @property
+    def writer_version(self) -> str: ...
+    @property
+    def row_index_stride(self) -> int: ...
+    @property
+    def nstripe_statistics(self) -> int: ...
+    @property
+    def content_length(self) -> int: ...
+    @property
+    def stripe_statistics_length(self) -> int: ...
+    @property
+    def file_footer_length(self) -> int: ...
+    @property
+    def file_postscript_length(self) -> int: ...
+    @property
+    def file_length(self) -> int: ...
+    def read_stripe(self, n: int, columns: list[str] | None = ...) -> RecordBatch: ...
+    def read(self, columns: list[str] | None = ...) -> Table: ...
+
+class ORCWriter:
+    __doc__: str
+    is_open: bool
+    writer: _orc.ORCWriter
+    def __init__(
+        self,
+        where: str | NativeFile | IOBase,
+        *,
+        file_version: str = ...,
+        batch_size: int = ...,
+        stripe_size: int = ...,
+        compression: str = ...,
+        compression_block_size: int = ...,
+        compression_strategy: str = ...,
+        row_index_stride: int = ...,
+        padding_tolerance: float = ...,
+        dictionary_key_size_threshold: float = ...,
+        bloom_filter_columns: list[str] | None = ...,
+        bloom_filter_fpp: float = ...,
+    ) -> None: ...
+    def __del__(self) -> None: ...
+    def __enter__(self) -> ORCWriter: ...
+    def __exit__(self, *args, **kwargs) -> None: ...
+    def write(self, table: Table) -> None: ...
+    def close(self) -> None: ...
+
+def read_table(
+    source: str | NativeFile | IOBase,
+    columns: list[str] | None = ...,
+    filesystem: str | FileSystem | None = ...,
+) -> Table: ...
+def write_table(
+    table: Table,
+    where: str | NativeFile | IOBase,
+    *,
+    file_version: str = ...,
+    batch_size: int = ...,
+    stripe_size: int = ...,
+    compression: str = ...,
+    compression_block_size: int = ...,
+    compression_strategy: str = ...,
+    row_index_stride: int = ...,
+    padding_tolerance: float = ...,
+    dictionary_key_size_threshold: float = ...,
+    bloom_filter_columns: list[str] | None = ...,
+    bloom_filter_fpp: float = ...,
+) -> None: ...
diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi new file mode 100644 index 00000000000..b7de1fe6b9f --- /dev/null +++ b/pyarrow-stubs/pandas_compat.pyi @@ -0,0 +1,65 @@ +from typing import ( + Any, + Callable, + TypedDict, +) + +import numpy as np +import pandas as pd +from pandas.core.internals import BlockManager +from pyarrow.lib import ( + Array, + DataType, + Schema, + Table, + _ArrowType, + frombytes as frombytes, +) + +class _SerializedDict(TypedDict): + blocks: list[Any] + axes: list[Any] + +def get_logical_type_map() -> dict[_ArrowType, str]: ... +def get_logical_type(arrow_type: _ArrowType) -> str: ... +def get_logical_type_from_numpy(pandas_collection: pd.Series | pd.Index) -> str: ... +def get_extension_dtype_info( + column: pd.Series | pd.Index, +) -> tuple[str, dict[str, Any] | None]: ... +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> dict[str, Any]: ... +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], +) -> dict[bytes, bytes]: ... +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool, columns: list[str] | None = ... +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool, + nthreads: int = ..., + columns: list[str] | None = ..., + safe: bool = ..., +) -> tuple[Array, Schema, int | None]: ... +def get_datetimetz_type( + values: pd.Series | pd.Index, dtype: np.dtype, type_: DataType | None +) -> tuple[pd.Series | pd.Index, DataType]: ... +def dataframe_to_serialized_dict(frame: pd.DataFrame) -> _SerializedDict: ... +def serialized_dict_to_dataframe(data: _SerializedDict) -> pd.DataFrame: ... +def make_datetimetz(tz: str) -> pd.DatetimeTZDtype: ... +def table_to_blockmanager( + options: dict, + table: Table, + categories: list[str] | None = ..., + ignore_metadata: bool = ..., + types_mapper: Callable[[DataType], np.generic] | None = ..., +) -> BlockManager: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... diff --git a/pyarrow-stubs/parquet/__init__.pyi b/pyarrow-stubs/parquet/__init__.pyi new file mode 100644 index 00000000000..bb67a43fa4e --- /dev/null +++ b/pyarrow-stubs/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi new file mode 100644 index 00000000000..d00ef07f6f9 --- /dev/null +++ b/pyarrow-stubs/parquet/core.pyi @@ -0,0 +1,403 @@ +from io import IOBase +from os import PathLike +from typing import ( + Generator, + Literal, +) + +from _typeshed import Incomplete +import pyarrow +from pyarrow._parquet import ( + ColumnChunkMetaData as ColumnChunkMetaData, + ColumnSchema as ColumnSchema, + FileDecryptionProperties as FileDecryptionProperties, + FileEncryptionProperties as FileEncryptionProperties, + FileMetaData as FileMetaData, + ParquetLogicalType as ParquetLogicalType, + ParquetReader as ParquetReader, + ParquetSchema as ParquetSchema, + RowGroupMetaData as RowGroupMetaData, + Statistics as Statistics, +) +from pyarrow.compute import Expression +from pyarrow.fs import FileSystem + +def filters_to_expression( + filters: list[tuple[str, ...]] | list[tuple[tuple[str, ...], ...]] +) -> Expression: ... 
+ +class ParquetFile: + reader: ParquetReader + common_metadata: FileMetaData | None + def __init__( + self, + source: str | PathLike | pyarrow.NativeFile | IOBase, + *, + metadata: FileMetaData | None = ..., + common_metadata: FileMetaData | None = ..., + read_dictionary: list[str] | None = ..., + memory_map: bool = ..., + buffer_size: int = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., + decryption_properties: FileDecryptionProperties | None = ..., + thrift_string_size_limit: int | None = ..., + thrift_container_size_limit: int | None = ..., + ) -> None: ... + def __enter__(self) -> ParquetFile: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData | None: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def schema_arrow(self) -> pyarrow.Schema: ... + @property + def num_row_groups(self) -> int: ... + def close(self, force: bool = ...) -> None: ... + @property + def closed(self) -> bool: ... + def read_row_group( + self, + i: int, + columns: list[str] | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ) -> pyarrow.Table: ... + def read_row_groups( + self, + row_groups: list[str], + columns: list[str] | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ) -> pyarrow.Table: ... + def iter_batches( + self, + batch_size: int = ..., + row_groups: list[str] | None = ..., + columns: list[str] | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ) -> Generator[pyarrow.RecordBatch, None, None]: ... + def read( + self, + columns: list[str] | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ) -> pyarrow.Table: ... + def scan_contents( + self, columns: list[int] | None = ..., batch_size: int = ... + ) -> int: ... + +_COMPRESSION = Literal["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"] + +class ParquetWriter: + flavor: Literal["spark"] | None + schema_changed: bool + schema: pyarrow.Schema + where: str | PathLike | IOBase + file_handle: Incomplete + writer: Incomplete + is_open: bool + def __init__( + self, + where: str | PathLike | IOBase, + schema: pyarrow.Schema, + filesystem: FileSystem | None = ..., + flavor: Literal["spark"] | None = ..., + version: str = ..., + use_dictionary: bool | list[str] = ..., + compression: _COMPRESSION | dict[str, _COMPRESSION] = ..., + write_statistics: bool | list[bool] = ..., + use_deprecated_int96_timestamps: bool | None = ..., + compression_level: int | dict[str, int] | None = ..., + use_byte_stream_split: bool | list[str] = ..., + column_encoding: str | dict[str, str] | None = ..., + writer_engine_version: str | None = ..., + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = ..., + encryption_properties: FileEncryptionProperties | None = ..., + write_batch_size: int | None = ..., + dictionary_pagesize_limit: int | None = ..., + **options, + ) -> None: ... + def __del__(self) -> None: ... + def __enter__(self): ... + def __exit__(self, *args, **kwargs): ... + def write( + self, table_or_batch, row_group_size: Incomplete | None = ... + ) -> None: ... + def write_batch(self, batch, row_group_size: Incomplete | None = ...) -> None: ... + def write_table(self, table, row_group_size: Incomplete | None = ...) -> None: ... + def close(self) -> None: ... 
+ +class ParquetDatasetPiece: + def __init__( + self, + path, + open_file_func=..., + file_options: Incomplete | None = ..., + row_group: Incomplete | None = ..., + partition_keys: Incomplete | None = ..., + ) -> None: ... + def __eq__(self, other): ... + def get_metadata(self): ... + def open(self): ... + def read( + self, + columns: Incomplete | None = ..., + use_threads: bool = ..., + partitions: Incomplete | None = ..., + file: Incomplete | None = ..., + use_pandas_metadata: bool = ..., + ): ... + +class PartitionSet: + name: Incomplete + keys: Incomplete + key_indices: Incomplete + def __init__(self, name, keys: Incomplete | None = ...) -> None: ... + def get_index(self, key): ... + @property + def dictionary(self): ... + @property + def is_sorted(self): ... + +class ParquetPartitions: + levels: Incomplete + partition_names: Incomplete + def __init__(self) -> None: ... + def __len__(self) -> int: ... + def __getitem__(self, i): ... + def equals(self, other): ... + def __eq__(self, other): ... + def get_index(self, level, name, key): ... + def filter_accepts_partition(self, part_key, filter, level): ... + +class ParquetManifest: + filesystem: Incomplete + open_file_func: Incomplete + pathsep: Incomplete + dirpath: Incomplete + partition_scheme: Incomplete + partitions: Incomplete + pieces: Incomplete + common_metadata_path: Incomplete + metadata_path: Incomplete + def __init__( + self, + dirpath, + open_file_func: Incomplete | None = ..., + filesystem: Incomplete | None = ..., + pathsep: str = ..., + partition_scheme: str = ..., + metadata_nthreads: int = ..., + ) -> None: ... + +class _ParquetDatasetMetadata: ... + +class ParquetDataset: + __doc__: Incomplete + def __new__( + cls, + path_or_paths: Incomplete | None = ..., + filesystem: Incomplete | None = ..., + schema: Incomplete | None = ..., + metadata: Incomplete | None = ..., + split_row_groups: bool = ..., + validate_schema: bool = ..., + filters: Incomplete | None = ..., + metadata_nthreads: Incomplete | None = ..., + read_dictionary: Incomplete | None = ..., + memory_map: bool = ..., + buffer_size: int = ..., + partitioning: str = ..., + use_legacy_dataset: Incomplete | None = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: Incomplete | None = ..., + thrift_string_size_limit: Incomplete | None = ..., + thrift_container_size_limit: Incomplete | None = ..., + ): ... + paths: Incomplete + split_row_groups: Incomplete + def __init__( + self, + path_or_paths, + filesystem: Incomplete | None = ..., + schema: Incomplete | None = ..., + metadata: Incomplete | None = ..., + split_row_groups: bool = ..., + validate_schema: bool = ..., + filters: Incomplete | None = ..., + metadata_nthreads: Incomplete | None = ..., + read_dictionary: Incomplete | None = ..., + memory_map: bool = ..., + buffer_size: int = ..., + partitioning: str = ..., + use_legacy_dataset: bool = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: Incomplete | None = ..., + thrift_string_size_limit: Incomplete | None = ..., + thrift_container_size_limit: Incomplete | None = ..., + ) -> None: ... + def equals(self, other): ... + def __eq__(self, other): ... + def validate_schemas(self) -> None: ... + def read( + self, + columns: Incomplete | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ): ... + def read_pandas(self, **kwargs): ... + @property + def pieces(self): ... + @property + def partitions(self): ... + @property + def schema(self): ... + @property + def memory_map(self): ... 
+ @property + def read_dictionary(self): ... + @property + def buffer_size(self): ... + @property + def fs(self): ... + @property + def metadata(self): ... + @property + def metadata_path(self): ... + @property + def common_metadata_path(self): ... + @property + def common_metadata(self): ... + @property + def fragments(self) -> None: ... + @property + def files(self) -> None: ... + @property + def filesystem(self) -> None: ... + @property + def partitioning(self) -> None: ... + +class _ParquetDatasetV2: + def __init__( + self, + path_or_paths, + filesystem: Incomplete | None = ..., + *, + filters: Incomplete | None = ..., + partitioning: str = ..., + read_dictionary: Incomplete | None = ..., + buffer_size: Incomplete | None = ..., + memory_map: bool = ..., + ignore_prefixes: Incomplete | None = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: Incomplete | None = ..., + schema: Incomplete | None = ..., + decryption_properties: Incomplete | None = ..., + thrift_string_size_limit: Incomplete | None = ..., + thrift_container_size_limit: Incomplete | None = ..., + **kwargs, + ) -> None: ... + @property + def schema(self): ... + def read( + self, + columns: Incomplete | None = ..., + use_threads: bool = ..., + use_pandas_metadata: bool = ..., + ): ... + def read_pandas(self, **kwargs): ... + @property + def pieces(self): ... + @property + def fragments(self): ... + @property + def files(self): ... + @property + def filesystem(self): ... + @property + def partitioning(self): ... + +def read_table( + source, + *, + columns: Incomplete | None = ..., + use_threads: bool = ..., + metadata: Incomplete | None = ..., + schema: Incomplete | None = ..., + use_pandas_metadata: bool = ..., + memory_map: bool = ..., + read_dictionary: Incomplete | None = ..., + filesystem: Incomplete | None = ..., + filters: Incomplete | None = ..., + buffer_size: int = ..., + partitioning: str = ..., + use_legacy_dataset: bool = ..., + ignore_prefixes: Incomplete | None = ..., + pre_buffer: bool = ..., + coerce_int96_timestamp_unit: Incomplete | None = ..., + decryption_properties: Incomplete | None = ..., + thrift_string_size_limit: Incomplete | None = ..., + thrift_container_size_limit: Incomplete | None = ..., +): ... +def read_pandas(source, columns: Incomplete | None = ..., **kwargs): ... +def write_table( + table, + where, + row_group_size: Incomplete | None = ..., + version: str = ..., + use_dictionary: bool = ..., + compression: str = ..., + write_statistics: bool = ..., + use_deprecated_int96_timestamps: Incomplete | None = ..., + coerce_timestamps: Incomplete | None = ..., + allow_truncated_timestamps: bool = ..., + data_page_size: Incomplete | None = ..., + flavor: Incomplete | None = ..., + filesystem: Incomplete | None = ..., + compression_level: Incomplete | None = ..., + use_byte_stream_split: bool = ..., + column_encoding: Incomplete | None = ..., + data_page_version: str = ..., + use_compliant_nested_type: bool = ..., + encryption_properties: Incomplete | None = ..., + write_batch_size: Incomplete | None = ..., + dictionary_pagesize_limit: Incomplete | None = ..., + **kwargs, +) -> None: ... 
+def write_to_dataset( + table, + root_path, + partition_cols: Incomplete | None = ..., + partition_filename_cb: Incomplete | None = ..., + filesystem: Incomplete | None = ..., + use_legacy_dataset: Incomplete | None = ..., + schema: Incomplete | None = ..., + partitioning: Incomplete | None = ..., + basename_template: Incomplete | None = ..., + use_threads: Incomplete | None = ..., + file_visitor: Incomplete | None = ..., + existing_data_behavior: Incomplete | None = ..., + **kwargs, +) -> None: ... +def write_metadata( + schema, where, metadata_collector: Incomplete | None = ..., **kwargs +) -> None: ... +def read_metadata( + where, + memory_map: bool = ..., + decryption_properties: Incomplete | None = ..., + filesystem: Incomplete | None = ..., +): ... +def read_schema( + where, + memory_map: bool = ..., + decryption_properties: Incomplete | None = ..., + filesystem: Incomplete | None = ..., +): ... + +# Names in __all__ with no definition: +# _filters_to_expression diff --git a/pyarrow-stubs/parquet/encryption.pyi b/pyarrow-stubs/parquet/encryption.pyi new file mode 100644 index 00000000000..53918ce5927 --- /dev/null +++ b/pyarrow-stubs/parquet/encryption.pyi @@ -0,0 +1,7 @@ +from pyarrow._parquet_encryption import ( + CryptoFactory as CryptoFactory, + DecryptionConfiguration as DecryptionConfiguration, + EncryptionConfiguration as EncryptionConfiguration, + KmsClient as KmsClient, + KmsConnectionConfig as KmsConnectionConfig, +) diff --git a/pyarrow-stubs/plasma.pyi b/pyarrow-stubs/plasma.pyi new file mode 100644 index 00000000000..c3407adf769 --- /dev/null +++ b/pyarrow-stubs/plasma.pyi @@ -0,0 +1,28 @@ +from collections.abc import Generator +from subprocess import Popen +from types import ModuleType + +from pyarrow._plasma import ( + ObjectID as ObjectID, + ObjectNotAvailable as ObjectNotAvailable, + PlasmaBuffer as PlasmaBuffer, + PlasmaClient as PlasmaClient, + PlasmaObjectExists as PlasmaObjectExists, + PlasmaObjectNotFound as PlasmaObjectNotFound, + PlasmaStoreFull as PlasmaStoreFull, + connect as connect, +) + +TF_PLASMA_OP_PATH: str +tf_plasma_op: ModuleType | None + +def load_plasma_tensorflow_op() -> None: ... +def build_plasma_tensorflow_op() -> None: ... +def start_plasma_store( + plasma_store_memory: int, + use_valgrind: bool = ..., + use_profiler: bool = ..., + plasma_directory: str | None = ..., + use_hugepages: bool = ..., + external_store: str | None = ..., +) -> Generator[tuple[str, Popen[str]], None, None]: ... diff --git a/pyarrow-stubs/py.typed b/pyarrow-stubs/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pyarrow-stubs/serialization.pyi b/pyarrow-stubs/serialization.pyi new file mode 100644 index 00000000000..1b992aebdd7 --- /dev/null +++ b/pyarrow-stubs/serialization.pyi @@ -0,0 +1,18 @@ +from pyarrow.lib import ( + SerializationContext as SerializationContext, + builtin_pickle as builtin_pickle, + py_buffer as py_buffer, +) + +try: + import cloudpickle +except ImportError: + cloudpickle = builtin_pickle + +def register_torch_serialization_handlers( + serialization_context: SerializationContext, +): ... +def register_default_serialization_handlers( + serialization_context: SerializationContext, +) -> None: ... +def default_serialization_context() -> SerializationContext: ... 
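Similarly, a brief sketch of the kind of caller code the parquet stubs above describe (illustrative only; the paths are placeholders and assume pyarrow with Parquet support):

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"id": [1, 2], "name": ["a", "b"]})
    pq.write_table(table, "example.parquet", compression="snappy")

    # ParquetFile exposes metadata/schema properties and typed readers.
    pf = pq.ParquetFile("example.parquet")
    first_group: pa.Table = pf.read_row_group(0, columns=["id"])

    # read_table accepts row filters, the input filters_to_expression handles.
    filtered = pq.read_table("example.parquet", filters=[("id", ">", 1)])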
diff --git a/pyarrow-stubs/substrait.pyi b/pyarrow-stubs/substrait.pyi new file mode 100644 index 00000000000..c4b612d38f4 --- /dev/null +++ b/pyarrow-stubs/substrait.pyi @@ -0,0 +1,4 @@ +from pyarrow._substrait import ( + get_supported_functions as get_supported_functions, + run_query as run_query, +) diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi new file mode 100644 index 00000000000..9a545bd12c6 --- /dev/null +++ b/pyarrow-stubs/types.pyi @@ -0,0 +1,53 @@ +from pyarrow.lib import ( + DataType, + is_boolean_value as is_boolean_value, + is_float_value as is_float_value, + is_integer_value as is_integer_value, +) + +def is_null(t: DataType) -> bool: ... +def is_boolean(t: DataType) -> bool: ... +def is_integer(t: DataType) -> bool: ... +def is_signed_integer(t: DataType) -> bool: ... +def is_unsigned_integer(t: DataType) -> bool: ... +def is_int8(t: DataType) -> bool: ... +def is_int16(t: DataType) -> bool: ... +def is_int32(t: DataType) -> bool: ... +def is_int64(t: DataType) -> bool: ... +def is_uint8(t: DataType) -> bool: ... +def is_uint16(t: DataType) -> bool: ... +def is_uint32(t: DataType) -> bool: ... +def is_uint64(t: DataType) -> bool: ... +def is_floating(t: DataType) -> bool: ... +def is_float16(t: DataType) -> bool: ... +def is_float32(t: DataType) -> bool: ... +def is_float64(t: DataType) -> bool: ... +def is_list(t: DataType) -> bool: ... +def is_large_list(t: DataType) -> bool: ... +def is_fixed_size_list(t: DataType) -> bool: ... +def is_struct(t: DataType) -> bool: ... +def is_union(t: DataType) -> bool: ... +def is_nested(t: DataType) -> bool: ... +def is_temporal(t: DataType) -> bool: ... +def is_timestamp(t: DataType) -> bool: ... +def is_duration(t: DataType) -> bool: ... +def is_time(t: DataType) -> bool: ... +def is_time32(t: DataType) -> bool: ... +def is_time64(t: DataType) -> bool: ... +def is_binary(t: DataType) -> bool: ... +def is_large_binary(t: DataType) -> bool: ... +def is_unicode(t: DataType) -> bool: ... +def is_string(t: DataType) -> bool: ... +def is_large_unicode(t: DataType) -> bool: ... +def is_large_string(t: DataType) -> bool: ... +def is_fixed_size_binary(t: DataType) -> bool: ... +def is_date(t: DataType) -> bool: ... +def is_date32(t: DataType) -> bool: ... +def is_date64(t: DataType) -> bool: ... +def is_map(t: DataType) -> bool: ... +def is_decimal(t: DataType) -> bool: ... +def is_decimal128(t: DataType) -> bool: ... +def is_decimal256(t: DataType) -> bool: ... +def is_dictionary(t: DataType) -> bool: ... +def is_interval(t: DataType) -> bool: ... +def is_primitive(t: DataType) -> bool: ... diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi new file mode 100644 index 00000000000..72f919ea328 --- /dev/null +++ b/pyarrow-stubs/util.pyi @@ -0,0 +1,12 @@ +from collections.abc import Sequence +from typing import TypeVar + +_T = TypeVar("_T") + +def implements(f: function): ... +def product(seq: Sequence[_T]) -> _T: ... +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... 
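And a short sketch for the type-introspection predicates stubbed in types.pyi above (illustrative only; each helper takes a DataType and returns a bool):

    import pyarrow as pa
    import pyarrow.types as pa_types

    assert pa_types.is_integer(pa.int64())
    assert pa_types.is_string(pa.string())
    assert not pa_types.is_floating(pa.bool_())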
diff --git a/pyproject.toml b/pyproject.toml index 47eb1ecca26..22960f41162 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,9 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.0a1" +version = "10.0.1.1" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" -readme = "README.md" homepage = "https://github.com/zen-xu/pyarrow-stubs" classifiers = [ "Development Status :: 3 - Alpha", From 49f64d3b9a4f8faf05f62852dce175408dd810c0 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 26 Dec 2022 14:56:34 +0800 Subject: [PATCH 004/231] fix FixedSizeBufferWriter init annotation --- pyarrow-stubs/lib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 40ab550d4c4..8053a2c61d3 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -804,6 +804,7 @@ class FixedSizeBinaryType(DataType[_T]): byte_width: int class FixedSizeBufferWriter(NativeFile): + def __init__(self, buffer: Buffer) -> None: ... def set_memcopy_blocksize(self, blocksize: int) -> None: ... def set_memcopy_threads(self, num_threads: int) -> None: ... def set_memcopy_threshold(self, threshold: int) -> None: ... From e432726a0ebef7796ec97918a3665c9461c6d41b Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 26 Dec 2022 14:57:08 +0800 Subject: [PATCH 005/231] bump 10.0.1.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 22960f41162..26185f6554e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.1" +version = "10.0.1.2" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From 1b330f0cc3e06329eaf1c701dd8adc02eea9a95a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 29 Dec 2022 15:12:34 +0800 Subject: [PATCH 006/231] complete parquet core annotations --- pyarrow-stubs/parquet/core.pyi | 265 +++++++++++++++++---------------- 1 file changed, 135 insertions(+), 130 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index d00ef07f6f9..784a9d9eb5c 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -1,12 +1,22 @@ from io import IOBase from os import PathLike +import pathlib from typing import ( Generator, + Generic, Literal, + TypeVar, ) from _typeshed import Incomplete import pyarrow +from pyarrow import ( + Array, + NativeFile, + RecordBatch, + Schema, + Table, +) from pyarrow._parquet import ( ColumnChunkMetaData as ColumnChunkMetaData, ColumnSchema as ColumnSchema, @@ -20,10 +30,11 @@ from pyarrow._parquet import ( Statistics as Statistics, ) from pyarrow.compute import Expression +from pyarrow.dataset import Partitioning from pyarrow.fs import FileSystem def filters_to_expression( - filters: list[tuple[str, ...]] | list[tuple[tuple[str, ...], ...]] + filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] ) -> Expression: ... class ParquetFile: @@ -122,57 +133,65 @@ class ParquetWriter: **options, ) -> None: ... def __del__(self) -> None: ... - def __enter__(self): ... + def __enter__(self) -> ParquetWriter: ... def __exit__(self, *args, **kwargs): ... def write( - self, table_or_batch, row_group_size: Incomplete | None = ... + self, + table_or_batch: Table | RecordBatch, + row_group_size: int | None = ..., + ) -> None: ... + def write_batch( + self, batch: RecordBatch, row_group_size: int | None = ... ) -> None: ... 
- def write_batch(self, batch, row_group_size: Incomplete | None = ...) -> None: ... - def write_table(self, table, row_group_size: Incomplete | None = ...) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = ...) -> None: ... def close(self) -> None: ... class ParquetDatasetPiece: def __init__( self, - path, - open_file_func=..., - file_options: Incomplete | None = ..., - row_group: Incomplete | None = ..., - partition_keys: Incomplete | None = ..., + path: str | pathlib.Path, + open_file_func: function = ..., + file_options: dict | None = ..., + row_group: int | None = ..., + partition_keys: list[tuple[str, str]] | None = ..., ) -> None: ... - def __eq__(self, other): ... - def get_metadata(self): ... - def open(self): ... + def __eq__(self, other) -> bool: ... + def get_metadata(self) -> FileMetaData: ... + def open(self) -> ParquetFile: ... def read( self, - columns: Incomplete | None = ..., + columns: list[str] | None = ..., use_threads: bool = ..., - partitions: Incomplete | None = ..., - file: Incomplete | None = ..., + partitions: ParquetPartitions | None = ..., + file: IOBase | None = ..., use_pandas_metadata: bool = ..., - ): ... + ) -> Table: ... + +_K = TypeVar("_K") -class PartitionSet: - name: Incomplete - keys: Incomplete - key_indices: Incomplete - def __init__(self, name, keys: Incomplete | None = ...) -> None: ... - def get_index(self, key): ... +class PartitionSet(Generic[_K]): + name: str + keys: list[_K] + key_indices: dict[_K, int] + def __init__(self, name: str, keys: list[_K] | None = ...) -> None: ... + def get_index(self, key: _K) -> int: ... @property - def dictionary(self): ... + def dictionary(self) -> Array: ... @property - def is_sorted(self): ... + def is_sorted(self) -> bool: ... -class ParquetPartitions: - levels: Incomplete - partition_names: Incomplete +_PPK = TypeVar("_PPK", str, int) + +class ParquetPartitions(Generic[_PPK]): + levels: list[PartitionSet[_PPK]] + partition_names: set[str] def __init__(self) -> None: ... def __len__(self) -> int: ... def __getitem__(self, i): ... - def equals(self, other): ... - def __eq__(self, other): ... - def get_index(self, level, name, key): ... - def filter_accepts_partition(self, part_key, filter, level): ... + def equals(self, other: ParquetPartitions) -> bool: ... + def __eq__(self, other) -> bool: ... + def get_index(self, level: int, name: str, key: _PPK) -> int: ... + def filter_accepts_partition(self, part_key, filter, level: int) -> bool: ... class ParquetManifest: filesystem: Incomplete @@ -197,59 +216,39 @@ class ParquetManifest: class _ParquetDatasetMetadata: ... 
class ParquetDataset: - __doc__: Incomplete + paths: list[str] + split_row_groups: bool + def __new__( cls, - path_or_paths: Incomplete | None = ..., - filesystem: Incomplete | None = ..., - schema: Incomplete | None = ..., - metadata: Incomplete | None = ..., + path_or_paths: str | list[str] | None = ..., + filesystem: FileSystem | None = ..., + schema: Schema | None = ..., + metadata: FileMetaData | None = ..., split_row_groups: bool = ..., validate_schema: bool = ..., - filters: Incomplete | None = ..., - metadata_nthreads: Incomplete | None = ..., - read_dictionary: Incomplete | None = ..., + filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., + metadata_nthreads: int | None = ..., + read_dictionary: list[str] | None = ..., memory_map: bool = ..., buffer_size: int = ..., partitioning: str = ..., - use_legacy_dataset: Incomplete | None = ..., + use_legacy_dataset: bool | None = ..., pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Incomplete | None = ..., - thrift_string_size_limit: Incomplete | None = ..., - thrift_container_size_limit: Incomplete | None = ..., + coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., + thrift_string_size_limit: int | None = ..., + thrift_container_size_limit: int | None = ..., ): ... - paths: Incomplete - split_row_groups: Incomplete - def __init__( - self, - path_or_paths, - filesystem: Incomplete | None = ..., - schema: Incomplete | None = ..., - metadata: Incomplete | None = ..., - split_row_groups: bool = ..., - validate_schema: bool = ..., - filters: Incomplete | None = ..., - metadata_nthreads: Incomplete | None = ..., - read_dictionary: Incomplete | None = ..., - memory_map: bool = ..., - buffer_size: int = ..., - partitioning: str = ..., - use_legacy_dataset: bool = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Incomplete | None = ..., - thrift_string_size_limit: Incomplete | None = ..., - thrift_container_size_limit: Incomplete | None = ..., - ) -> None: ... - def equals(self, other): ... - def __eq__(self, other): ... + def equals(self, other) -> bool: ... + def __eq__(self, other) -> bool: ... def validate_schemas(self) -> None: ... def read( self, - columns: Incomplete | None = ..., + columns: list[str] | None = ..., use_threads: bool = ..., use_pandas_metadata: bool = ..., - ): ... - def read_pandas(self, **kwargs): ... + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... @property def pieces(self): ... 
@property @@ -284,32 +283,32 @@ class ParquetDataset: class _ParquetDatasetV2: def __init__( self, - path_or_paths, - filesystem: Incomplete | None = ..., + path_or_paths: str | list[str], + filesystem: FileSystem | None = ..., *, - filters: Incomplete | None = ..., + filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., partitioning: str = ..., - read_dictionary: Incomplete | None = ..., - buffer_size: Incomplete | None = ..., + read_dictionary: list[str] | None = ..., + buffer_size: int | None = ..., memory_map: bool = ..., - ignore_prefixes: Incomplete | None = ..., + ignore_prefixes: list[str] | None = ..., pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Incomplete | None = ..., - schema: Incomplete | None = ..., - decryption_properties: Incomplete | None = ..., + coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., + schema: Schema | None = ..., + decryption_properties: FileDecryptionProperties | None = ..., thrift_string_size_limit: Incomplete | None = ..., thrift_container_size_limit: Incomplete | None = ..., **kwargs, ) -> None: ... @property - def schema(self): ... + def schema(self) -> Schema: ... def read( self, - columns: Incomplete | None = ..., + columns: list[str] | None = ..., use_threads: bool = ..., use_pandas_metadata: bool = ..., - ): ... - def read_pandas(self, **kwargs): ... + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... @property def pieces(self): ... @property @@ -322,82 +321,88 @@ class _ParquetDatasetV2: def partitioning(self): ... def read_table( - source, + source: str | NativeFile | IOBase, *, - columns: Incomplete | None = ..., + columns: list[str] | None = ..., use_threads: bool = ..., - metadata: Incomplete | None = ..., - schema: Incomplete | None = ..., + metadata: FileMetaData | None = ..., + schema: Schema | None = ..., use_pandas_metadata: bool = ..., memory_map: bool = ..., - read_dictionary: Incomplete | None = ..., - filesystem: Incomplete | None = ..., - filters: Incomplete | None = ..., + read_dictionary: list[str] | None = ..., + filesystem: FileSystem | None = ..., + filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., buffer_size: int = ..., partitioning: str = ..., use_legacy_dataset: bool = ..., - ignore_prefixes: Incomplete | None = ..., + ignore_prefixes: list[str] | None = ..., pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Incomplete | None = ..., - decryption_properties: Incomplete | None = ..., - thrift_string_size_limit: Incomplete | None = ..., - thrift_container_size_limit: Incomplete | None = ..., -): ... -def read_pandas(source, columns: Incomplete | None = ..., **kwargs): ... + coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., + decryption_properties: FileDecryptionProperties | None = ..., + thrift_string_size_limit: int | None = ..., + thrift_container_size_limit: int | None = ..., +) -> Table: ... +def read_pandas( + source: str | NativeFile | IOBase, columns: list[str] | None = ..., **kwargs +) -> Table: ... 
def write_table( - table, - where, - row_group_size: Incomplete | None = ..., + table: Table, + where: str | NativeFile, + row_group_size: int | None = ..., version: str = ..., - use_dictionary: bool = ..., + use_dictionary: bool | list[str] = ..., compression: str = ..., write_statistics: bool = ..., - use_deprecated_int96_timestamps: Incomplete | None = ..., - coerce_timestamps: Incomplete | None = ..., + use_deprecated_int96_timestamps: bool | None = ..., + coerce_timestamps: str | None = ..., allow_truncated_timestamps: bool = ..., - data_page_size: Incomplete | None = ..., - flavor: Incomplete | None = ..., - filesystem: Incomplete | None = ..., - compression_level: Incomplete | None = ..., + data_page_size: int | None = ..., + flavor: Literal["spark"] | None = ..., + filesystem: FileSystem | None = ..., + compression_level: int | dict[str, int] | None = ..., use_byte_stream_split: bool = ..., - column_encoding: Incomplete | None = ..., + column_encoding: str | dict[str, str] | None = ..., data_page_version: str = ..., use_compliant_nested_type: bool = ..., - encryption_properties: Incomplete | None = ..., - write_batch_size: Incomplete | None = ..., - dictionary_pagesize_limit: Incomplete | None = ..., + encryption_properties: FileEncryptionProperties | None = ..., + write_batch_size: int | None = ..., + dictionary_pagesize_limit: int | None = ..., **kwargs, ) -> None: ... def write_to_dataset( - table, - root_path, - partition_cols: Incomplete | None = ..., - partition_filename_cb: Incomplete | None = ..., - filesystem: Incomplete | None = ..., - use_legacy_dataset: Incomplete | None = ..., - schema: Incomplete | None = ..., - partitioning: Incomplete | None = ..., - basename_template: Incomplete | None = ..., - use_threads: Incomplete | None = ..., - file_visitor: Incomplete | None = ..., - existing_data_behavior: Incomplete | None = ..., + table: Table, + root_path: str | pathlib.Path, + partition_cols: list[str] | None = ..., + partition_filename_cb: function | None = ..., + filesystem: FileSystem | None = ..., + use_legacy_dataset: bool | None = ..., + schema: Schema | None = ..., + partitioning: list[str] | Partitioning | None = ..., + basename_template: str | None = ..., + use_threads: bool | None = ..., + file_visitor: function | None = ..., + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = ..., **kwargs, ) -> None: ... def write_metadata( - schema, where, metadata_collector: Incomplete | None = ..., **kwargs + schema: Schema, + where: str | NativeFile, + metadata_collector: list | None = ..., + **kwargs, ) -> None: ... def read_metadata( where, memory_map: bool = ..., - decryption_properties: Incomplete | None = ..., + decryption_properties: FileDecryptionProperties | None = ..., filesystem: Incomplete | None = ..., ): ... def read_schema( - where, + where: str | IOBase, memory_map: bool = ..., - decryption_properties: Incomplete | None = ..., - filesystem: Incomplete | None = ..., -): ... + decryption_properties: FileDecryptionProperties | None = ..., + filesystem: FileSystem | None = ..., +) -> FileMetaData: ... 
# Names in __all__ with no definition: # _filters_to_expression From eca9a4b3d2ec6e70ec62b10575787025d475ce69 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 29 Dec 2022 15:13:26 +0800 Subject: [PATCH 007/231] bump 10.0.1.3 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 26185f6554e..977cb6fac90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.2" +version = "10.0.1.3" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From f5d26a47e6feae7d6f779d3ff99057cbe1be0c22 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 10:26:04 +0800 Subject: [PATCH 008/231] re-export modules --- pyarrow-stubs/__init__.pyi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index 07069877eec..ab5f8527f6f 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -1,5 +1,13 @@ from typing import Any +from pyarrow import ( + filesystem as filesystem, + hdfs as hdfs, + ipc as ipc, + serialization as serialization, + types as types, + util as util, +) from pyarrow._hdfsio import ( HdfsFile as HdfsFile, have_libhdfs as have_libhdfs, From 697b875daeb0abeb494049a4f3b0608c351df3e7 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 10:26:31 +0800 Subject: [PATCH 009/231] fix: add return type for foreign_buffer --- pyarrow-stubs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 8053a2c61d3..dcb83d347ef 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -2081,7 +2081,7 @@ def field( def float16() -> DataType[float]: ... def float32() -> DataType[float]: ... def float64() -> DataType[float]: ... -def foreign_buffer(address: int, size: int, base: object | None = ...) -> None: ... +def foreign_buffer(address: int, size: int, base: object | None = ...) -> Buffer: ... def from_numpy_dtype(dtype: DTypeLike) -> DataType: ... def frombytes(o: bytes, *, safe: bool = ...) -> str: ... def get_record_batch_size(batch: RecordBatch) -> int: ... From bf0edc06d31d7398533e851c10d18bfa548d1820 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 11:24:10 +0800 Subject: [PATCH 010/231] fix output_stream and read_message annotations --- pyarrow-stubs/lib.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index dcb83d347ef..3025a66565b 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -2125,13 +2125,13 @@ def nulls( size: int, type: DataType[_T] = ..., memory_pool: MemoryPool | None = ... ) -> Array[_T, Scalar[_T]]: ... def output_stream( - source: str | PathLike | Buffer | IOBase, + source: str | PathLike | Buffer | IOBase | memoryview, compression: str | None = ..., buffer_size: int = ..., ) -> NativeFile: ... def proxy_memory_pool(parent: MemoryPool) -> MemoryPool: ... def py_buffer(obj: bytes | memoryview) -> Buffer: ... -def read_message(source: NativeFile | IOBase | Buffer) -> Message: ... +def read_message(source: NativeFile | IOBase | memoryview | Buffer) -> Message: ... 
def read_record_batch( obj: Message | Buffer | memoryview, schema: Schema, From ba659b38b2fc8fe3d7ba22ceb76caa7849cc189b Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 11:32:19 +0800 Subject: [PATCH 011/231] ci: add release job --- .github/workflows/release.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/release.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 00000000000..a095f5b46ff --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,23 @@ +name: Release VSCode Server Bin + +on: + push: + tags: + - "*.*.*" + +jobs: + release: + name: "release ${{github.ref_name}}" + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.7" + - uses: abatilo/actions-poetry@v2 + with: + poetry-version: "1.1.15" + - name: publish + run: | + poetry build -f wheel + poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }} From f6f53f60fbd8d3f422d37939a51113c506e702e2 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 11:36:23 +0800 Subject: [PATCH 012/231] pre-commit specify flake8 version to 5.0.4 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dd6ee48335e..2e00dd97334 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: types: [text] # overwrite types: [python] args: [--py38-plus] - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 5.0.4 hooks: - id: flake8 name: flake8 (py) From fecc6ef67924bbbb8b414ec956c7f2795a9ac9fc Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 11:55:08 +0800 Subject: [PATCH 013/231] flake8 ignore F821 for private files --- .pre-commit-config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e00dd97334..ce680dca3b0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: pyupgrade types_or: [python, pyi] types: [text] # overwrite types: [python] - args: [--py38-plus] + args: [--py37-plus] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: @@ -33,4 +33,6 @@ repos: --ignore=E301 E302 E305 E402 E501 E701 E704 F401 F811 W503 Y019 Y027 Y034 Y037 Y041 Y042, # TypeVars in private files are already private --per-file-ignores=_*.pyi:Y001, + # ignore private stub files + --per-file-ignores=_*.pyi:F821, ] From 3215f5bf33b60b7467e0067593ad0133ed5d1e36 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 11:55:51 +0800 Subject: [PATCH 014/231] optimize annotations --- pyarrow-stubs/_parquet.pyi | 12 ++++++------ pyarrow-stubs/dataset.pyi | 2 +- pyarrow-stubs/filesystem.pyi | 5 +---- pyarrow-stubs/hdfs.pyi | 4 ---- pyarrow-stubs/lib.pyi | 24 +++++++++++++----------- pyarrow-stubs/pandas_compat.pyi | 2 +- pyarrow-stubs/parquet/__init__.pyi | 2 +- pyarrow-stubs/parquet/core.pyi | 14 +++++++++----- pyarrow-stubs/util.pyi | 7 +++++-- 9 files changed, 37 insertions(+), 35 deletions(-) diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 1325a57b5c3..993da5f7cfb 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -187,16 +187,16 @@ class ParquetWriter(pyarrow.lib._Weakrefable): write_statistics: bool | list[str] | None = ..., memory_pool: pyarrow.lib.MemoryPool = ..., use_deprecated_int96_timestamps: bool = ..., - 
coerce_timestamps: Literal["ms", "us"] | None = None, - data_page_size: int | None = None, + coerce_timestamps: Literal["ms", "us"] | None = ..., + data_page_size: int | None = ..., allow_truncated_timestamps: bool = ..., - compression_level: int | dict[str, int] | None = None, + compression_level: int | dict[str, int] | None = ..., use_byte_stream_split: bool | list[str] = ..., column_encoding: str | dict[str, str] | None = ..., - writer_engine_version: Literal["V1", "V2"] | None = None, - data_page_version: Literal["1.0", "2.0"] | None = None, + writer_engine_version: Literal["V1", "V2"] | None = ..., + data_page_version: Literal["1.0", "2.0"] | None = ..., use_compliant_nested_type: bool = ..., - encryption_properties: FileDecryptionProperties | None = None, + encryption_properties: FileDecryptionProperties | None = ..., write_batch_size: int | None = ..., dictionary_pagesize_limit: int | None = ..., ) -> None: ... diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 68ec9657870..8b40492c7ec 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -2,7 +2,6 @@ from os import PathLike from typing import ( Callable, Iterable, - Literal, ) from pyarrow._dataset import ( @@ -58,6 +57,7 @@ from pyarrow.lib import ( Schema, Table, ) +from typing_extensions import Literal def __getattr__(name: str) -> None: ... def partitioning( diff --git a/pyarrow-stubs/filesystem.pyi b/pyarrow-stubs/filesystem.pyi index 72aa06229c6..ba4295305b9 100644 --- a/pyarrow-stubs/filesystem.pyi +++ b/pyarrow-stubs/filesystem.pyi @@ -1,8 +1,5 @@ from os import PathLike -from typing import ( - Any, - Generator, -) +from typing import Generator from pyarrow import ( Table, diff --git a/pyarrow-stubs/hdfs.pyi b/pyarrow-stubs/hdfs.pyi index a51797ffb46..72481ac3e0b 100644 --- a/pyarrow-stubs/hdfs.pyi +++ b/pyarrow-stubs/hdfs.pyi @@ -1,8 +1,4 @@ from collections.abc import Generator -from typing import ( - Literal, - overload, -) from _typeshed import Incomplete import pyarrow._hdfsio as _hdfsio diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 3025a66565b..f9f8a6a19bd 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -15,10 +15,7 @@ from typing import ( ItemsView, Iterable, KeysView, - Literal, NamedTuple, - TypeAlias, - TypeGuard, TypeVar, ValuesView, overload, @@ -36,6 +33,11 @@ from pyarrow.compute import ( CastOptions, FunctionOptions, ) +from typing_extensions import ( + Literal, + TypeAlias, + TypeGuard, +) _ArrowType: TypeAlias = int | DataType _builtin_slice = slice @@ -85,26 +87,26 @@ V4: importlib._bootstrap.MetadataVersion V5: importlib._bootstrap.MetadataVersion _NULL: NullScalar __pc: ModuleType | None -_break_traceback_cycle_from_frame: function +_break_traceback_cycle_from_frame: Callable _default_context_initialized: bool _default_serialization_context: SerializationContext -_is_path_like: function +_is_path_like: Callable _pandas_api: _PandasAPIShim _python_extension_types_registry: list _registry_nanny: _ExtensionRegistryNanny -_stringify_path: function -contextmanager: function +_stringify_path: Callable +contextmanager: Callable cpp_build_info: importlib._bootstrap.BuildInfo cpp_version: str cpp_version_info: importlib._bootstrap.VersionInfo have_signal_refcycle: bool -namedtuple: function +namedtuple: Callable class PyCapsule: ... 
_Self = TypeVar("_Self") -_Array = TypeVar("_Array", bound="Array") +_Array = TypeVar("_Array", bound=Array) _ChunkedArray = TypeVar("_ChunkedArray", bound=ChunkedArray) _T = TypeVar("_T") @@ -613,7 +615,7 @@ class ChunkedArray(_PandasConvertible, Generic[_T, _Scalar]): def __len__(self) -> int: ... def __sizeof__(self) -> int: ... -_COMPRESSION = Literal[ +_COMPRESSION: TypeAlias = Literal[ "gzip", "bz2", "brotli", "lz4" "lz4_frame", "lz4_raw", "zstd", "snappy" ] @@ -1914,7 +1916,7 @@ class _WriteStats(NamedTuple): class ordered_dict: def __init__(self, *args, **kwargs) -> None: ... def clear(self, *args, **kwargs) -> Any: ... - def copy(self) -> ashallowcopyofD: ... + def copy(self) -> ashallowcopyofD: ... # noqa @classmethod def fromkeys(cls, *args, **kwargs) -> Any: ... def get(self, *args, **kwargs) -> Any: ... diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi index b7de1fe6b9f..d3567ffbc27 100644 --- a/pyarrow-stubs/pandas_compat.pyi +++ b/pyarrow-stubs/pandas_compat.pyi @@ -1,7 +1,6 @@ from typing import ( Any, Callable, - TypedDict, ) import numpy as np @@ -15,6 +14,7 @@ from pyarrow.lib import ( _ArrowType, frombytes as frombytes, ) +from typing_extensions import TypedDict class _SerializedDict(TypedDict): blocks: list[Any] diff --git a/pyarrow-stubs/parquet/__init__.pyi b/pyarrow-stubs/parquet/__init__.pyi index bb67a43fa4e..151ee188f84 100644 --- a/pyarrow-stubs/parquet/__init__.pyi +++ b/pyarrow-stubs/parquet/__init__.pyi @@ -1 +1 @@ -from .core import * +from .core import * # noqa diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 784a9d9eb5c..b52203ef8e8 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -2,9 +2,9 @@ from io import IOBase from os import PathLike import pathlib from typing import ( + Callable, Generator, Generic, - Literal, TypeVar, ) @@ -32,6 +32,10 @@ from pyarrow._parquet import ( from pyarrow.compute import Expression from pyarrow.dataset import Partitioning from pyarrow.fs import FileSystem +from typing_extensions import ( + Literal, + TypeAlias, +) def filters_to_expression( filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] @@ -100,7 +104,7 @@ class ParquetFile: self, columns: list[int] | None = ..., batch_size: int = ... ) -> int: ... 
-_COMPRESSION = Literal["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"] +_COMPRESSION: TypeAlias = Literal["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"] class ParquetWriter: flavor: Literal["spark"] | None @@ -150,7 +154,7 @@ class ParquetDatasetPiece: def __init__( self, path: str | pathlib.Path, - open_file_func: function = ..., + open_file_func: Callable = ..., file_options: dict | None = ..., row_group: int | None = ..., partition_keys: list[tuple[str, str]] | None = ..., @@ -373,14 +377,14 @@ def write_to_dataset( table: Table, root_path: str | pathlib.Path, partition_cols: list[str] | None = ..., - partition_filename_cb: function | None = ..., + partition_filename_cb: Callable | None = ..., filesystem: FileSystem | None = ..., use_legacy_dataset: bool | None = ..., schema: Schema | None = ..., partitioning: list[str] | Partitioning | None = ..., basename_template: str | None = ..., use_threads: bool | None = ..., - file_visitor: function | None = ..., + file_visitor: Callable | None = ..., existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] | None = ..., **kwargs, diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi index 72f919ea328..b117b78c62f 100644 --- a/pyarrow-stubs/util.pyi +++ b/pyarrow-stubs/util.pyi @@ -1,9 +1,12 @@ from collections.abc import Sequence -from typing import TypeVar +from typing import ( + Callable, + TypeVar, +) _T = TypeVar("_T") -def implements(f: function): ... +def implements(f: Callable): ... def product(seq: Sequence[_T]) -> _T: ... def get_contiguous_span( shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int From 73235b21779b9a8c4806a828f19be46bcf6c1b0b Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 12:59:26 +0800 Subject: [PATCH 015/231] bump 10.0.1.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 977cb6fac90..c35cb02d267 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.3" +version = "10.0.1.4" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From 3d5857d23c6242f42154f3507534d8cc036ccd89 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 13:42:33 +0800 Subject: [PATCH 016/231] if param supports IOBase, it should also support NativeFile --- pyarrow-stubs/lib.pyi | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index f9f8a6a19bd..bed60d912af 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1061,7 +1061,7 @@ class NativeFile(_Weakrefable): mode: Literal["rb", "wb", "rb+"] def close(self) -> None: ... def download( - self, stream_or_path: str | IOBase, buffer_size: int | None = ... + self, stream_or_path: str | IOBase | NativeFile, buffer_size: int | None = ... ) -> None: ... def fileno(self) -> int: ... def flush(self) -> None: ... @@ -1082,7 +1082,7 @@ class NativeFile(_Weakrefable): def size(self) -> int: ... def tell(self) -> int: ... def truncate(self) -> None: ... - def upload(self, stream: IOBase, buffer_size: int = ...) -> None: ... + def upload(self, stream: IOBase | NativeFile, buffer_size: int = ...) -> None: ... def writable(self) -> bool: ... def write(self, data: bytes | memoryview | Buffer) -> int: ... def writelines(self, lines: list[bytes]) -> None: ... 
@@ -2092,7 +2092,7 @@ def infer_type( values: Iterable, mask: list[bool] | BooleanArray = ..., from_pandas: bool = ... ) -> DataType: ... def input_stream( - source: str | PathLike | Buffer | IOBase, + source: str | PathLike | Buffer | IOBase | NativeFile, compression: str | None = ..., buffer_size: int | None = ..., ) -> NativeFile: ... @@ -2127,13 +2127,15 @@ def nulls( size: int, type: DataType[_T] = ..., memory_pool: MemoryPool | None = ... ) -> Array[_T, Scalar[_T]]: ... def output_stream( - source: str | PathLike | Buffer | IOBase | memoryview, + source: str | PathLike | Buffer | IOBase | memoryview | NativeFile, compression: str | None = ..., buffer_size: int = ..., ) -> NativeFile: ... def proxy_memory_pool(parent: MemoryPool) -> MemoryPool: ... def py_buffer(obj: bytes | memoryview) -> Buffer: ... -def read_message(source: NativeFile | IOBase | memoryview | Buffer) -> Message: ... +def read_message( + source: NativeFile | IOBase | memoryview | Buffer, +) -> Message: ... def read_record_batch( obj: Message | Buffer | memoryview, schema: Schema, From 108bbd18dea086804ad766f62c075e4b0b59737a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 13:42:47 +0800 Subject: [PATCH 017/231] bump 10.0.1.5 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c35cb02d267..e5129cd0a72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.4" +version = "10.0.1.5" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From 4c9e6c91eb43fe33af5a83589b9d9cb5621ca506 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 17:40:46 +0800 Subject: [PATCH 018/231] pre-commit adds mypy lint --- .pre-commit-config.yaml | 15 ++ check-mypy.sh | 2 + pyarrow-stubs/__init__.pyi | 17 +- pyarrow-stubs/_compute.pyi | 246 ++++++++------------------ pyarrow-stubs/_compute_docstrings.pyi | 2 +- pyarrow-stubs/_csv.pyi | 36 +--- pyarrow-stubs/_dataset.pyi | 53 +----- pyarrow-stubs/_dataset_orc.pyi | 1 - pyarrow-stubs/_dataset_parquet.pyi | 23 +-- pyarrow-stubs/_exec_plan.pyi | 1 - pyarrow-stubs/_feather.pyi | 2 +- pyarrow-stubs/_flight.pyi | 197 +++++---------------- pyarrow-stubs/_fs.pyi | 49 +---- pyarrow-stubs/_gcsfs.pyi | 63 +------ pyarrow-stubs/_hdfs.pyi | 5 +- pyarrow-stubs/_hdfsio.pyi | 9 +- pyarrow-stubs/_orc.pyi | 4 +- pyarrow-stubs/_parquet.pyi | 26 +-- pyarrow-stubs/_parquet_encryption.pyi | 7 +- pyarrow-stubs/_plasma.pyi | 8 +- pyarrow-stubs/_s3fs.pyi | 2 +- pyarrow-stubs/feather.pyi | 6 +- pyarrow-stubs/filesystem.pyi | 4 +- pyarrow-stubs/hdfs.pyi | 2 +- pyarrow-stubs/lib.pyi | 17 +- pyarrow-stubs/orc.pyi | 12 +- pyarrow-stubs/serialization.pyi | 2 +- pyproject.toml | 3 + 28 files changed, 231 insertions(+), 583 deletions(-) create mode 100755 check-mypy.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce680dca3b0..e0e0eac750c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,22 @@ minimum_pre_commit_version: 2.15.0 +default_language_version: + python: python3.7 ci: autofix_prs: false repos: + - repo: local + hooks: + - id: mypy + name: mypy + entry: ./check-mypy.sh + language: python + types_or: [python, pyi] + require_serial: true + additional_dependencies: + - mypy + - types-cffi + - numpy + - pandas-stubs - repo: https://github.com/python/black rev: 22.12.0 hooks: diff --git a/check-mypy.sh b/check-mypy.sh new file mode 100755 index 
00000000000..74ba4bb1e9d --- /dev/null +++ b/check-mypy.sh @@ -0,0 +1,2 @@ +#! /bin/bash +mypy pyarrow-stubs \ No newline at end of file diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index ab5f8527f6f..b36f56bda03 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -1,13 +1,5 @@ from typing import Any -from pyarrow import ( - filesystem as filesystem, - hdfs as hdfs, - ipc as ipc, - serialization as serialization, - types as types, - util as util, -) from pyarrow._hdfsio import ( HdfsFile as HdfsFile, have_libhdfs as have_libhdfs, @@ -264,6 +256,15 @@ from pyarrow.serialization import ( register_torch_serialization_handlers as register_torch_serialization_handlers, ) +from . import ( + filesystem as filesystem, + hdfs as hdfs, + ipc as ipc, + serialization as serialization, + types as types, + util as util, +) + def show_versions() -> None: ... def show_info() -> None: ... def __getattr__(name: str) -> Any: ... diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index 7f5f9847d05..bfa582b79c2 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -1,12 +1,13 @@ from typing import ( Any, + Callable, ClassVar, - Literal, ) import pyarrow.lib +from typing_extensions import Literal -namedtuple: function +namedtuple: Callable class ArraySortOptions(_ArraySortOptions): def __init__( @@ -106,14 +107,13 @@ class Expression(pyarrow.lib._Weakrefable): def __truediv__(self, other) -> Any: ... class ExtractRegexOptions(_ExtractRegexOptions): - def __init__(self, pattern) -> Any: ... + def __init__(self, pattern) -> None: ... class FilterOptions(_FilterOptions): - def __init__(self, null_selection_behavior=...) -> Any: ... + def __init__(self, null_selection_behavior=...) -> None: ... class Function(pyarrow.lib._Weakrefable): _kind_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... _doc: Any arity: Any kind: Any @@ -143,11 +143,9 @@ class FunctionDoc(tuple): def _make(cls, *args, **kwargs) -> Any: ... class FunctionOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore __slots__: ClassVar[tuple] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def deserialize(self, buf) -> Any: ... def serialize(self) -> Any: ... def __eq__(self, other) -> Any: ... @@ -167,23 +165,20 @@ class FunctionRegistry(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class HashAggregateFunction(Function): - __pyx_vtable__: ClassVar[PyCapsule] = ... kernels: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class HashAggregateKernel(Kernel): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class IndexOptions(_IndexOptions): - def __init__(self, value) -> Any: ... + def __init__(self, value) -> None: ... class JoinOptions(_JoinOptions): - def __init__(self, null_handling=..., null_replacement=...) -> Any: ... + def __init__(self, null_handling=..., null_replacement=...) -> None: ... class Kernel(pyarrow.lib._Weakrefable): def __init__(self, *args, **kwargs) -> None: ... @@ -194,16 +189,15 @@ class MakeStructOptions(_MakeStructOptions): def __init__(self, *args, **kwargs) -> None: ... 
class MapLookupOptions(_MapLookupOptions): - def __init__(self, query_key, occurrence) -> Any: ... + def __init__(self, query_key, occurrence) -> None: ... class MatchSubstringOptions(_MatchSubstringOptions): def __init__(self, *args, **kwargs) -> None: ... class MetaFunction(Function): - __pyx_vtable__: ClassVar[PyCapsule] = ... kernels: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class ModeOptions(_ModeOptions): def __init__(self, *args, **kwargs) -> None: ... @@ -212,7 +206,7 @@ class NullOptions(_NullOptions): def __init__(self, *args, **kwargs) -> None: ... class PadOptions(_PadOptions): - def __init__(self, width, padding=...) -> Any: ... + def __init__(self, width, padding=...) -> None: ... class PartitionNthOptions(_PartitionNthOptions): def __init__(self, *args, **kwargs) -> None: ... @@ -227,30 +221,26 @@ class RankOptions(_RankOptions): def __init__(self, *args, **kwargs) -> None: ... class ReplaceSliceOptions(_ReplaceSliceOptions): - def __init__(self, start, stop, replacement) -> Any: ... + def __init__(self, start, stop, replacement) -> None: ... class ReplaceSubstringOptions(_ReplaceSubstringOptions): def __init__(self, *args, **kwargs) -> None: ... class RoundOptions(_RoundOptions): - def __init__(self, ndigits=..., round_mode=...) -> Any: ... + def __init__(self, ndigits=..., round_mode=...) -> None: ... class RoundTemporalOptions(_RoundTemporalOptions): def __init__(self, *args, **kwargs) -> None: ... class RoundToMultipleOptions(_RoundToMultipleOptions): - def __init__(self, multiple=..., round_mode=...) -> Any: ... + def __init__(self, multiple=..., round_mode=...) -> None: ... class ScalarAggregateFunction(Function): - __pyx_vtable__: ClassVar[PyCapsule] = ... kernels: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class ScalarAggregateKernel(Kernel): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -258,20 +248,16 @@ class ScalarAggregateOptions(_ScalarAggregateOptions): def __init__(self, *args, **kwargs) -> None: ... class ScalarFunction(Function): - __pyx_vtable__: ClassVar[PyCapsule] = ... kernels: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class ScalarKernel(Kernel): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ScalarUdfContext(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... batch_length: Any memory_pool: Any def __init__(self, *args, **kwargs) -> None: ... @@ -279,13 +265,13 @@ class ScalarUdfContext(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class SelectKOptions(_SelectKOptions): - def __init__(self, k, sort_keys) -> Any: ... + def __init__(self, k, sort_keys) -> None: ... class SetLookupOptions(_SetLookupOptions): def __init__(self, *args, **kwargs) -> None: ... class SliceOptions(_SliceOptions): - def __init__(self, start, stop=..., step=...) -> Any: ... + def __init__(self, start, stop=..., step=...) -> None: ... class SortOptions(_SortOptions): def __init__(self, *args, **kwargs) -> None: ... 
@@ -297,13 +283,13 @@ class SplitPatternOptions(_SplitPatternOptions): def __init__(self, *args, **kwargs) -> None: ... class StrftimeOptions(_StrftimeOptions): - def __init__(self, format=..., locale=...) -> Any: ... + def __init__(self, format=..., locale=...) -> None: ... class StrptimeOptions(_StrptimeOptions): - def __init__(self, format, unit, error_is_null=...) -> Any: ... + def __init__(self, format, unit, error_is_null=...) -> None: ... class StructFieldOptions(_StructFieldOptions): - def __init__(self, indices) -> Any: ... + def __init__(self, indices) -> None: ... class TDigestOptions(_TDigestOptions): def __init__(self, *args, **kwargs) -> None: ... @@ -312,24 +298,22 @@ class TakeOptions(_TakeOptions): def __init__(self, *args, **kwargs) -> None: ... class TrimOptions(_TrimOptions): - def __init__(self, characters) -> Any: ... + def __init__(self, characters) -> None: ... class Utf8NormalizeOptions(_Utf8NormalizeOptions): - def __init__(self, form) -> Any: ... + def __init__(self, form) -> None: ... class VarianceOptions(_VarianceOptions): def __init__(self, *args, **kwargs) -> None: ... class VectorFunction(Function): - __pyx_vtable__: ClassVar[PyCapsule] = ... kernels: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class VectorKernel(Kernel): - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -337,9 +321,8 @@ class WeekOptions(_WeekOptions): def __init__(self, *args, **kwargs) -> None: ... class _ArraySortOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, order, null_placement) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -347,15 +330,13 @@ class _ArraySortOptions(FunctionOptions): class _AssumeTimezoneOptions(FunctionOptions): _ambiguous_map: ClassVar[dict] = ... _nonexistent_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, timezone, ambiguous, nonexistent) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _CastOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... allow_decimal_truncate: Any allow_float_truncate: Any allow_int_overflow: Any @@ -363,7 +344,7 @@ class _CastOptions(FunctionOptions): allow_time_overflow: Any allow_time_truncate: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options( self, DataTypetarget_type, @@ -383,191 +364,148 @@ class _CastOptions(FunctionOptions): class _CountOptions(FunctionOptions): _mode_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, mode) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _CumulativeSumOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... 
def _set_options(self, start, skip_nulls) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _DayOfWeekOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, count_from_zero, week_start) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _DictionaryEncodeOptions(FunctionOptions): _null_encoding_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, null_encoding) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ElementWiseAggregateOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, skip_nulls) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ExtractRegexOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, pattern) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _FilterOptions(FunctionOptions): _null_selection_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, null_selection_behavior) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _IndexOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, scalar) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _JoinOptions(FunctionOptions): _null_handling_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, null_handling, null_replacement) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _MakeStructOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, field_names, field_nullability, field_metadata) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _MapLookupOptions(FunctionOptions): _occurrence_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, query_key, occurrence) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _MatchSubstringOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... 
def _set_options(self, pattern, ignore_case) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ModeOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, n, skip_nulls, min_count) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _NullOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, nan_is_null) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _PadOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, width, padding) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _PartitionNthOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, pivot, null_placement) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _QuantileOptions(FunctionOptions): _interp_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, quantiles, interp, skip_nulls, min_count) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _RandomOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, initializer) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _RankOptions(FunctionOptions): _tiebreaker_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, sort_keys, null_placement, tiebreaker) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ReplaceSliceOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, start, stop, replacement) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ReplaceSubstringOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, pattern, replacement, max_replacements) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _RoundOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, ndigits, round_mode) -> Any: ... def __reduce__(self) -> Any: ... 
def __setstate__(self, state) -> Any: ... class _RoundTemporalOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options( self, multiple, @@ -580,98 +518,74 @@ class _RoundTemporalOptions(FunctionOptions): def __setstate__(self, state) -> Any: ... class _RoundToMultipleOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, multiple, round_mode) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _ScalarAggregateOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, skip_nulls, min_count) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SelectKOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, k, sort_keys) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SetLookupOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, value_set, boolskip_nulls) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SliceOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, start, stop, step) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SortOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, sort_keys, null_placement) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SplitOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, max_splits, reverse) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _SplitPatternOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, pattern, max_splits, reverse) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _StrftimeOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, format, locale) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _StrptimeOptions(FunctionOptions): _unit_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... 
- @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, format, unit, error_is_null) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _StructFieldOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, indices) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _TDigestOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options( self, quantiles, delta, buffer_size, skip_nulls, min_count ) -> Any: ... @@ -679,42 +593,32 @@ class _TDigestOptions(FunctionOptions): def __setstate__(self, state) -> Any: ... class _TakeOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, boundscheck) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _TrimOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, characters) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _Utf8NormalizeOptions(FunctionOptions): _form_map: ClassVar[dict] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, form) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _VarianceOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options(self, ddof, skip_nulls, min_count) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _WeekOptions(FunctionOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_options( self, week_starts_monday, count_from_zero, first_week_is_fully_in_year ) -> Any: ... @@ -722,10 +626,10 @@ class _WeekOptions(FunctionOptions): def __setstate__(self, state) -> Any: ... class ordered_dict: - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore def __init__(self, *args, **kwargs) -> None: ... def clear(self, *args, **kwargs) -> Any: ... - def copy(self) -> ashallowcopyofD: ... + def copy(self) -> dict: ... @classmethod def fromkeys(cls, *args, **kwargs) -> Any: ... def get(self, *args, **kwargs) -> Any: ... 
diff --git a/pyarrow-stubs/_compute_docstrings.pyi b/pyarrow-stubs/_compute_docstrings.pyi index d7c52c46cb1..393ad543dc9 100644 --- a/pyarrow-stubs/_compute_docstrings.pyi +++ b/pyarrow-stubs/_compute_docstrings.pyi @@ -1,4 +1,4 @@ -from typing import TypedDict +from typing_extensions import TypedDict class _FunctionDocAdditions(TypedDict): filter: str diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 4f05ae2f86a..950061fc1b9 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -1,11 +1,9 @@ -import collections.abc from typing import ( Any, ClassVar, overload, ) -import _abc import pyarrow.lib ISO8601: _ISO8601 @@ -13,7 +11,6 @@ _stringify_path: function namedtuple: function class CSVStreamingReader(pyarrow.lib.RecordBatchReader): - __pyx_vtable__: ClassVar[PyCapsule] = ... schema: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... @@ -25,8 +22,7 @@ class CSVWriter(pyarrow.lib._CRecordBatchWriter): def __setstate__(self, state) -> Any: ... class ConvertOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore __slots__: ClassVar[tuple] = ... auto_dict_encode: Any auto_dict_max_cardinality: Any @@ -58,23 +54,8 @@ class ConvertOptions(pyarrow.lib._Weakrefable): class InvalidRow(_InvalidRow): __slots__: ClassVar[tuple] = ... -class Mapping(collections.abc.Collection): - _abc_impl: ClassVar[_abc._abc_data] = ... - get: ClassVar[function] = ... - items: ClassVar[function] = ... - keys: ClassVar[function] = ... - values: ClassVar[function] = ... - __abstractmethods__: ClassVar[frozenset] = ... - __contains__: ClassVar[function] = ... - __eq__: ClassVar[function] = ... - __getitem__: ClassVar[function] = ... - __hash__: ClassVar[None] = ... - __reversed__: ClassVar[None] = ... - __slots__: ClassVar[tuple] = ... - class ParseOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore __slots__: ClassVar[tuple] = ... delimiter: Any double_quote: Any @@ -98,13 +79,12 @@ class ParseOptions(pyarrow.lib._Weakrefable): def __setstate_cython__(self, __pyx_state) -> Any: ... class ReadOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore __slots__: ClassVar[tuple] = ... autogenerate_column_names: Any block_size: Any column_names: Any - encoding: encoding + encoding: Any skip_rows: Any skip_rows_after_names: Any use_threads: Any @@ -124,8 +104,7 @@ class ReadOptions(pyarrow.lib._Weakrefable): class SignalStopHandler: stop_token: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _init_signals(self) -> Any: ... def __enter__(self) -> Any: ... def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... @@ -133,7 +112,6 @@ class SignalStopHandler: def __setstate__(self, state) -> Any: ... class WriteOptions(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... batch_size: Any delimiter: Any @@ -144,10 +122,10 @@ class WriteOptions(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class _ISO8601(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore __slots__: ClassVar[tuple] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... 
+ def __init__(self, *args, **kwargs) -> None: ... def __eq__(self, other) -> Any: ... def __ge__(self, other) -> Any: ... def __gt__(self, other) -> Any: ... diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index ae6f0664653..03f11c1a8ac 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -1,4 +1,4 @@ -import importlib._bootstrap +import importlib._bootstrap # type: ignore from typing import ( Any, ClassVar, @@ -20,9 +20,8 @@ _stringify_path: function class ArrowTypeError(TypeError, pyarrow.lib.ArrowException): ... class CsvFileFormat(FileFormat): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... - _read_options_py: _read_options_py + _read_options_py: Any parse_options: Any def __init__(self, *args, **kwargs) -> None: ... def equals(self, CsvFileFormatother) -> Any: ... @@ -30,14 +29,12 @@ class CsvFileFormat(FileFormat): def __reduce__(self) -> Any: ... class CsvFileWriteOptions(FileWriteOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... write_options: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class CsvFragmentScanOptions(FragmentScanOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... convert_options: Any read_options: Any @@ -46,7 +43,6 @@ class CsvFragmentScanOptions(FragmentScanOptions): def __reduce__(self) -> Any: ... class Dataset(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... partition_expression: Any schema: Any def __init__(self, *args, **kwargs) -> None: ... @@ -70,8 +66,6 @@ class Dataset(pyarrow.lib._Weakrefable): @overload def scanner(self, columns=...) -> Any: ... @overload - def scanner(self, columns=...) -> Any: ... - @overload def scanner(self, filter=...) -> Any: ... def take(self, indices, **kwargs) -> Any: ... def to_batches(self, **kwargs) -> Any: ... @@ -80,7 +74,6 @@ class Dataset(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class DatasetFactory(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... root_partition: Any def __init__(self, *args, **kwargs) -> None: ... def finish(self, Schemaschema=...) -> Any: ... @@ -90,7 +83,6 @@ class DatasetFactory(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class DirectoryPartitioning(KeyValuePartitioning): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def discover( self, @@ -104,14 +96,11 @@ class DirectoryPartitioning(KeyValuePartitioning): def __setstate__(self, state) -> Any: ... class FeatherFileFormat(IpcFileFormat): - __pyx_vtable__: ClassVar[PyCapsule] = ... default_extname: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class FileFormat(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore default_extname: Any default_fragment_scan_options: Any def __init__(self, *args, **kwargs) -> None: ... @@ -130,18 +119,15 @@ class FileFormat(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FileFragment(Fragment): - __pyx_vtable__: ClassVar[PyCapsule] = ... buffer: Any filesystem: Any format: Any path: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def open(self) -> Any: ... def __reduce__(self) -> Any: ... 
class FileSystemDataset(Dataset): - __pyx_vtable__: ClassVar[PyCapsule] = ... files: Any filesystem: Any format: Any @@ -161,13 +147,11 @@ class FileSystemDataset(Dataset): def __reduce__(self) -> Any: ... class FileSystemDatasetFactory(DatasetFactory): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FileSystemFactoryOptions(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... exclude_invalid_files: Any partition_base_dir: Any @@ -179,14 +163,12 @@ class FileSystemFactoryOptions(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FileWriteOptions(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... format: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FilenamePartitioning(KeyValuePartitioning): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def discover( self, field_names=..., infer_dictionary=..., schema=..., segment_encoding=... @@ -195,7 +177,6 @@ class FilenamePartitioning(KeyValuePartitioning): def __setstate__(self, state) -> Any: ... class Fragment(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... partition_expression: Any physical_schema: Any def __init__(self, *args, **kwargs) -> None: ... @@ -209,8 +190,7 @@ class Fragment(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FragmentScanOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore type_name: Any def __init__(self, *args, **kwargs) -> None: ... def __eq__(self, other) -> Any: ... @@ -223,7 +203,6 @@ class FragmentScanOptions(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class HivePartitioning(KeyValuePartitioning): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def discover( self, @@ -237,33 +216,28 @@ class HivePartitioning(KeyValuePartitioning): def __setstate__(self, state) -> Any: ... class InMemoryDataset(Dataset): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class IpcFileFormat(FileFormat): - __pyx_vtable__: ClassVar[PyCapsule] = ... default_extname: Any def __init__(self, *args, **kwargs) -> None: ... def equals(self, IpcFileFormatother) -> Any: ... def __reduce__(self) -> Any: ... class IpcFileWriteOptions(FileWriteOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class KeyValuePartitioning(Partitioning): - __pyx_vtable__: ClassVar[PyCapsule] = ... dictionaries: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class Partitioning(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... schema: Any def __init__(self, *args, **kwargs) -> None: ... def parse(self, path) -> Any: ... @@ -271,14 +245,12 @@ class Partitioning(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class PartitioningFactory(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... 
type_name: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class RecordBatchIterator(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __iter__(self) -> Any: ... def __next__(self) -> Any: ... @@ -286,7 +258,6 @@ class RecordBatchIterator(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class Scanner(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... dataset_schema: Any projected_schema: Any def __init__(self, *args, **kwargs) -> None: ... @@ -331,9 +302,6 @@ class Scanner(pyarrow.lib._Weakrefable): ) -> Any: ... def head(self, intnum_rows) -> Any: ... def scan_batches(self) -> Any: ... - @overload - def take(self, indices) -> Any: ... - @overload def take(self, indices) -> Any: ... def to_batches(self) -> Any: ... def to_reader(self) -> Any: ... @@ -344,7 +312,6 @@ class Scanner(pyarrow.lib._Weakrefable): class TaggedRecordBatch(importlib._bootstrap.TaggedRecordBatch): ... class TaggedRecordBatchIterator(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __iter__(self) -> Any: ... def __next__(self) -> Any: ... @@ -352,21 +319,19 @@ class TaggedRecordBatchIterator(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class UnionDataset(Dataset): - __pyx_vtable__: ClassVar[PyCapsule] = ... children: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... class UnionDatasetFactory(DatasetFactory): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class WrittenFile(pyarrow.lib._Weakrefable): - metadata: metadata - path: path - size: size + metadata: Any + path: Any + size: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi index d4b97475f12..8154e3fd998 100644 --- a/pyarrow-stubs/_dataset_orc.pyi +++ b/pyarrow-stubs/_dataset_orc.pyi @@ -6,7 +6,6 @@ from typing import ( import pyarrow._dataset class OrcFileFormat(pyarrow._dataset.FileFormat): - __pyx_vtable__: ClassVar[PyCapsule] = ... default_extname: Any def __init__(self, *args, **kwargs) -> None: ... def equals(self, OrcFileFormatother) -> Any: ... diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index 746680d4cc0..e9ca579d68c 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -10,13 +10,11 @@ _is_path_like: function _stringify_path: function class ParquetDatasetFactory(pyarrow._dataset.DatasetFactory): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ParquetFactoryOptions(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... partition_base_dir: Any partitioning: Any @@ -27,7 +25,6 @@ class ParquetFactoryOptions(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class ParquetFileFormat(pyarrow._dataset.FileFormat): - __pyx_vtable__: ClassVar[PyCapsule] = ... default_extname: Any read_options: Any def __init__(self, *args, **kwargs) -> None: ... 
@@ -39,12 +36,10 @@ class ParquetFileFormat(pyarrow._dataset.FileFormat): def __reduce__(self) -> Any: ... class ParquetFileFragment(pyarrow._dataset.FileFragment): - __pyx_vtable__: ClassVar[PyCapsule] = ... metadata: Any num_row_groups: Any row_groups: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def ensure_complete_metadata(self) -> Any: ... def split_by_row_group(self, Expressionfilter=..., Schemaschema=...) -> Any: ... def subset( @@ -53,9 +48,7 @@ class ParquetFileFragment(pyarrow._dataset.FileFragment): def __reduce__(self) -> Any: ... class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _set_arrow_properties(self) -> Any: ... def _set_properties(self) -> Any: ... def update(self, **kwargs) -> Any: ... @@ -63,7 +56,7 @@ class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): def __setstate__(self, state) -> Any: ... class ParquetFragmentScanOptions(pyarrow._dataset.FragmentScanOptions): - __pyx_vtable__: ClassVar[PyCapsule] = ... + __slots__: ClassVar[tuple] = ... buffer_size: Any pre_buffer: Any @@ -77,10 +70,10 @@ class ParquetFragmentScanOptions(pyarrow._dataset.FragmentScanOptions): def __reduce__(self) -> Any: ... class ParquetReadOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - _coerce_int96_timestamp_unit: _coerce_int96_timestamp_unit + __hash__: ClassVar[None] = ... # type: ignore + _coerce_int96_timestamp_unit: Any coerce_int96_timestamp_unit: Any - dictionary_columns: dictionary_columns + dictionary_columns: Any def __init__(self, *args, **kwargs) -> None: ... def equals(self, ParquetReadOptionsother) -> Any: ... def __eq__(self, other) -> Any: ... @@ -93,8 +86,8 @@ class ParquetReadOptions(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class RowGroupInfo: - __hash__: ClassVar[None] = ... - def __init__(self, id, metadata, schema) -> Any: ... + __hash__: ClassVar[None] = ... # type: ignore + def __init__(self, id, metadata, schema) -> None: ... def __eq__(self, other) -> Any: ... @property def num_rows(self) -> Any: ... diff --git a/pyarrow-stubs/_exec_plan.pyi b/pyarrow-stubs/_exec_plan.pyi index 035040ebd10..e2208ff02d5 100644 --- a/pyarrow-stubs/_exec_plan.pyi +++ b/pyarrow-stubs/_exec_plan.pyi @@ -6,7 +6,6 @@ from typing import ( import pyarrow._dataset class InMemoryDataset(pyarrow._dataset.Dataset): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... diff --git a/pyarrow-stubs/_feather.pyi b/pyarrow-stubs/_feather.pyi index 9428e1ff9fa..17f697dac72 100644 --- a/pyarrow-stubs/_feather.pyi +++ b/pyarrow-stubs/_feather.pyi @@ -7,7 +7,7 @@ class FeatherError(Exception): ... class FeatherReader(pyarrow.lib._Weakrefable): version: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def read(self) -> Any: ... def read_indices(self, indices) -> Any: ... def read_names(self, names) -> Any: ... 
diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 05aed6f47ff..e6a5d1e5027 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -1,5 +1,5 @@ import enum -import importlib._bootstrap +import importlib._bootstrap # type: ignore import re from typing import ( Any, @@ -7,13 +7,13 @@ from typing import ( ) import pyarrow.lib +from pyarrow.lib import Schema _FLIGHT_SERVER_ERROR_REGEX: re.Pattern _get_legacy_format_default: function class Action(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore body: Any type: Any def __init__(self, *args, **kwargs) -> None: ... @@ -33,13 +33,13 @@ class ActionType(importlib._bootstrap._ActionType): def make_action(self, buf) -> Any: ... class ArrowCancelled(pyarrow.lib.ArrowException): - def __init__(self, message, signum=...) -> Any: ... + def __init__(self, message, signum=...) -> None: ... class ArrowException(Exception): ... class ArrowInvalid(ValueError, pyarrow.lib.ArrowException): ... class BasicAuth(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore password: Any username: Any def __init__(self, *args, **kwargs) -> None: ... @@ -58,34 +58,26 @@ class CallInfo(importlib._bootstrap._CallInfo): ... class CertKeyPair(importlib._bootstrap._CertKeyPair): ... class ClientAuthHandler(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def authenticate(self, outgoing, incoming) -> Any: ... def get_token(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ClientAuthReader(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def read(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ClientAuthSender(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def write(self, message) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ClientMiddleware(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def call_completed(self, exception) -> Any: ... def received_headers(self, headers) -> Any: ... def sending_headers(self) -> Any: ... @@ -93,58 +85,26 @@ class ClientMiddleware(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class ClientMiddlewareFactory(pyarrow.lib._Weakrefable): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... def start_call(self, info) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class DescriptorType(enum.Enum): - class _member_type_: - __class__: Any - def __init__(self, *args, **kwargs) -> None: ... - def __delattr__(self, name) -> Any: ... - def __dir__(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __format__(self, *args, **kwargs) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... 
- @classmethod - def __init_subclass__(cls, *args, **kwargs) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __reduce_ex__(self, protocol) -> Any: ... - def __setattr__(self, name, value) -> Any: ... - def __sizeof__(self) -> Any: ... - @classmethod - def __subclasshook__(cls, *args, **kwargs) -> Any: ... - __new__: ClassVar[function] = ... CMD: ClassVar[DescriptorType] = ... PATH: ClassVar[DescriptorType] = ... UNKNOWN: ClassVar[DescriptorType] = ... - _generate_next_value_: ClassVar[function] = ... - _member_map_: ClassVar[dict] = ... - _member_names_: ClassVar[list] = ... - _value2member_map_: ClassVar[dict] = ... class FlightCallOptions(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightCancelledError(FlightError, pyarrow.lib.ArrowCancelled): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightClient(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def authenticate( self, auth_handler, FlightCallOptionsoptions: FlightCallOptions = ... @@ -207,15 +167,12 @@ class FlightClient(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FlightDataStream(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightDescriptor(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore command: Any descriptor_type: Any path: Any @@ -235,7 +192,7 @@ class FlightDescriptor(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FlightEndpoint(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore locations: Any ticket: Any def __init__(self, *args, **kwargs) -> None: ... @@ -252,7 +209,6 @@ class FlightEndpoint(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FlightError(Exception): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... @@ -271,49 +227,24 @@ class FlightInfo(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FlightInternalError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightMetadataReader(pyarrow.lib._Weakrefable): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def read(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... 
class FlightMetadataWriter(pyarrow.lib._Weakrefable): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def write(self, message) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightMethod(enum.Enum): - class _member_type_: - __class__: Any - def __init__(self, *args, **kwargs) -> None: ... - def __delattr__(self, name) -> Any: ... - def __dir__(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __format__(self, *args, **kwargs) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - @classmethod - def __init_subclass__(cls, *args, **kwargs) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __reduce_ex__(self, protocol) -> Any: ... - def __setattr__(self, name, value) -> Any: ... - def __sizeof__(self) -> Any: ... - @classmethod - def __subclasshook__(cls, *args, **kwargs) -> Any: ... - __new__: ClassVar[function] = ... DO_ACTION: ClassVar[FlightMethod] = ... DO_EXCHANGE: ClassVar[FlightMethod] = ... DO_GET: ClassVar[FlightMethod] = ... @@ -324,13 +255,8 @@ class FlightMethod(enum.Enum): INVALID: ClassVar[FlightMethod] = ... LIST_ACTIONS: ClassVar[FlightMethod] = ... LIST_FLIGHTS: ClassVar[FlightMethod] = ... - _generate_next_value_: ClassVar[function] = ... - _member_map_: ClassVar[dict] = ... - _member_names_: ClassVar[list] = ... - _value2member_map_: ClassVar[dict] = ... class FlightServerBase(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... port: Any def __init__(self, *args, **kwargs) -> None: ... def do_action(self, context, action) -> Any: ... @@ -357,77 +283,62 @@ class FlightServerBase(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FlightServerError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightStreamChunk(pyarrow.lib._Weakrefable): app_metadata: Any data: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + + def __init__(self, *args, **kwargs) -> None: ... def __iter__(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightStreamReader(MetadataRecordBatchReader): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def cancel(self) -> Any: ... def read_all(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightStreamWriter(MetadataRecordBatchWriter): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def done_writing(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class FlightTimedOutError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... 
class FlightUnauthenticatedError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightUnauthorizedError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightUnavailableError(FlightError, pyarrow.lib.ArrowException): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce_cython__(self) -> Any: ... def __setstate_cython__(self, __pyx_state) -> Any: ... class FlightWriteSizeExceededError(pyarrow.lib.ArrowInvalid): - def __init__(self, message, limit, actual) -> Any: ... + def __init__(self, message, limit, actual) -> None: ... class GeneratorStream(FlightDataStream): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class Location(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore uri: Any def __init__(self, *args, **kwargs) -> None: ... def equals(self, Locationother) -> Any: ... @@ -444,32 +355,26 @@ class Location(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class MetadataRecordBatchReader(_MetadataRecordBatchReader): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class MetadataRecordBatchWriter(pyarrow.lib._CRecordBatchWriter): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... - def begin(self, Schemaschema: Schema, options=...) -> Any: ... + def __init__(self, *args, **kwargs) -> None: ... + def begin(self, schema: Schema, options=...) -> Any: ... def close(self) -> Any: ... - def write_batch(self, RecordBatchbatch) -> Any: ... def write_metadata(self, buf) -> Any: ... - def write_table(self, Tabletable, max_chunksize=..., **kwargs) -> Any: ... def write_with_metadata(self, RecordBatchbatch, buf) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class RecordBatchStream(FlightDataStream): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class Result(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore body: Any def __init__(self, *args, **kwargs) -> None: ... @classmethod @@ -485,7 +390,7 @@ class Result(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class SchemaResult(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore schema: Any def __init__(self, *args, **kwargs) -> None: ... @classmethod @@ -501,34 +406,26 @@ class SchemaResult(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... 
class ServerAuthHandler(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def authenticate(self, outgoing, incoming) -> Any: ... def is_valid(self, token) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ServerAuthReader(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def read(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ServerAuthSender(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def write(self, message) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ServerCallContext(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def get_middleware(self, key) -> Any: ... def is_cancelled(self) -> Any: ... def peer(self) -> Any: ... @@ -537,25 +434,21 @@ class ServerCallContext(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class ServerMiddleware(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def call_completed(self, exception) -> Any: ... def sending_headers(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class ServerMiddlewareFactory(pyarrow.lib._Weakrefable): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def start_call(self, info, headers) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class SignalStopHandler: stop_token: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _init_signals(self) -> Any: ... def __enter__(self) -> Any: ... def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... @@ -563,7 +456,7 @@ class SignalStopHandler: def __setstate__(self, state) -> Any: ... class Ticket(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore ticket: Any def __init__(self, *args, **kwargs) -> None: ... @classmethod @@ -581,11 +474,10 @@ class Ticket(pyarrow.lib._Weakrefable): class TracingServerMiddleware(ServerMiddleware): __slots__: ClassVar[list] = ... trace_context: Any - def __init__(self, trace_context) -> Any: ... + def __init__(self, trace_context) -> None: ... class TracingServerMiddlewareFactory(ServerMiddlewareFactory): - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -632,7 +524,7 @@ class _CertKeyPair(tuple): class _FlightServerFinalizer(pyarrow.lib._Weakrefable): @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def finalize(self) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... 
@@ -642,7 +534,7 @@ class _MetadataRecordBatchReader( ): schema: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def read_all(self) -> Any: ... def read_chunk(self) -> Any: ... def to_reader(self) -> Any: ... @@ -660,7 +552,6 @@ class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): def __setstate__(self, state) -> Any: ... class _ServerMiddlewareWrapper(ServerMiddleware): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def call_completed(self, exception) -> Any: ... def sending_headers(self) -> Any: ... diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 9dfd706d6eb..66355dfa34c 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -1,14 +1,14 @@ import abc import datetime import enum -import importlib._bootstrap +import importlib._bootstrap # type: ignore from typing import ( Any, + Callable, ClassVar, - overload, ) -import _abc +import _abc # type: ignore import pyarrow.lib Directory: importlib._bootstrap.FileType @@ -24,7 +24,6 @@ class ABC: __slots__: ClassVar[tuple] = ... class FileInfo(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... base_name: Any extension: Any is_file: Any @@ -38,7 +37,6 @@ class FileInfo(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FileSelector(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... allow_not_found: Any base_dir: Any recursive: Any @@ -47,8 +45,7 @@ class FileSelector(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FileSystem(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore type_name: Any def __init__(self, *args, **kwargs) -> None: ... def _wrap_input_stream(self, stream, path, compression, buffer_size) -> Any: ... @@ -59,37 +56,18 @@ class FileSystem(pyarrow.lib._Weakrefable): def delete_dir_contents(self, *args, **kwargs) -> Any: ... def delete_file(self, path) -> Any: ... def equals(self, FileSystemother) -> Any: ... - @overload - def from_uri(self, uri) -> Any: ... - @overload def from_uri(self, uri) -> Any: ... def get_file_info(self, paths_or_selector) -> Any: ... def move(self, src, dest) -> Any: ... def normalize_path(self, path) -> Any: ... - @overload def open_append_stream( self, path, compression=..., buffer_size=..., metadata=... ) -> Any: ... - @overload - def open_append_stream(self, path) -> Any: ... - @overload - def open_input_file(self, path) -> Any: ... - @overload - def open_input_file(self) -> Any: ... - @overload def open_input_file(self, path) -> Any: ... - @overload def open_input_stream(self, path, compression=..., buffer_size=...) -> Any: ... - @overload - def open_input_stream(self) -> Any: ... - @overload - def open_input_stream(self, path) -> Any: ... - @overload def open_output_stream( self, path, compression=..., buffer_size=..., metadata=... ) -> Any: ... - @overload - def open_output_stream(self, path) -> Any: ... def __eq__(self, other) -> Any: ... def __ge__(self, other) -> Any: ... def __gt__(self, other) -> Any: ... @@ -122,13 +100,7 @@ class FileType(enum.IntEnum): imag: Any numerator: Any real: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... - @overload - def as_integer_ratio(self) -> Any: ... - @overload - def as_integer_ratio(self) -> Any: ... - @overload + def __init__(self, *args, **kwargs) -> None: ... def as_integer_ratio(self) -> Any: ... 
def bit_count(self) -> Any: ... def bit_length(self) -> Any: ... @@ -184,38 +156,34 @@ class FileType(enum.IntEnum): def __truediv__(self, other) -> Any: ... def __trunc__(self) -> Any: ... def __xor__(self, other) -> Any: ... - __new__: ClassVar[function] = ... + __new__: ClassVar[Callable] = ... Directory: ClassVar[importlib._bootstrap.FileType] = ... File: ClassVar[importlib._bootstrap.FileType] = ... NotFound: ClassVar[importlib._bootstrap.FileType] = ... Unknown: ClassVar[importlib._bootstrap.FileType] = ... - _generate_next_value_: ClassVar[function] = ... + _generate_next_value_: ClassVar[Callable] = ... _member_map_: ClassVar[dict] = ... _member_names_: ClassVar[list] = ... _value2member_map_: ClassVar[dict] = ... class LocalFileSystem(FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... @classmethod def _reconstruct(cls, typecls, kwargs) -> Any: ... def __reduce__(self) -> Any: ... class PyFileSystem(FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... handler: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... class SubTreeFileSystem(FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... base_fs: Any base_path: Any def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... class _MockFileSystem(FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -224,8 +192,7 @@ class timezone(datetime.tzinfo): max: ClassVar[datetime.timezone] = ... min: ClassVar[datetime.timezone] = ... utc: ClassVar[datetime.timezone] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def dst(self, *args, **kwargs) -> Any: ... def fromutc(self, *args, **kwargs) -> Any: ... def tzname(self, *args, **kwargs) -> Any: ... diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi index 8aa85b089bf..8f1d95ebd6f 100644 --- a/pyarrow-stubs/_gcsfs.pyi +++ b/pyarrow-stubs/_gcsfs.pyi @@ -9,7 +9,6 @@ import pyarrow._fs import pyarrow.lib class GcsFileSystem(pyarrow._fs.FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... default_bucket_location: Any def __init__(self, *args, **kwargs) -> None: ... def _expiration_datetime_from_options(self) -> Any: ... @@ -18,8 +17,7 @@ class GcsFileSystem(pyarrow._fs.FileSystem): def __reduce__(self) -> Any: ... class KeyValueMetadata(pyarrow.lib._Metadata, collections.abc.Mapping): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore def __init__(self, *args, **kwargs) -> None: ... def equals(self, KeyValueMetadataother) -> Any: ... def get_all(self, key) -> Any: ... @@ -41,59 +39,6 @@ class KeyValueMetadata(pyarrow.lib._Metadata, collections.abc.Mapping): def __ne__(self, other) -> Any: ... def __reduce__(self) -> Any: ... -class datetime(datetime.date): - max: ClassVar[datetime.datetime] = ... - min: ClassVar[datetime.datetime] = ... - resolution: ClassVar[datetime.timedelta] = ... - fold: Any - hour: Any - microsecond: Any - minute: Any - second: Any - tzinfo: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... - def astimezone(self, *args, **kwargs) -> Any: ... - @classmethod - def combine(cls, *args, **kwargs) -> Any: ... - def ctime(self) -> Any: ... - def date(self, *args, **kwargs) -> Any: ... - def dst(self) -> Any: ... 
- @classmethod - def fromisoformat(cls, *args, **kwargs) -> Any: ... - @classmethod - def fromtimestamp(cls, *args, **kwargs) -> Any: ... - def isoformat(self, *args, **kwargs) -> Any: ... - @classmethod - def now(cls, *args, **kwargs) -> Any: ... - def replace(self, *args, **kwargs) -> Any: ... - @classmethod - def strptime(cls, *args, **kwargs) -> Any: ... - def time(self, *args, **kwargs) -> Any: ... - def timestamp(self, *args, **kwargs) -> Any: ... - def timetuple(self, *args, **kwargs) -> Any: ... - def timetz(self, *args, **kwargs) -> Any: ... - def tzname(self) -> Any: ... - @classmethod - def utcfromtimestamp(cls, *args, **kwargs) -> Any: ... - @classmethod - def utcnow(cls, *args, **kwargs) -> Any: ... - def utcoffset(self) -> Any: ... - def utctimetuple(self, *args, **kwargs) -> Any: ... - def __add__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __radd__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __reduce_ex__(self, protocol) -> Any: ... - def __rsub__(self, other) -> Any: ... - def __sub__(self, other) -> Any: ... - class timedelta: max: ClassVar[datetime.timedelta] = ... min: ClassVar[datetime.timedelta] = ... @@ -101,8 +46,7 @@ class timedelta: days: Any microseconds: Any seconds: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def total_seconds(self, *args, **kwargs) -> Any: ... def __abs__(self) -> Any: ... def __add__(self, other) -> Any: ... @@ -135,8 +79,7 @@ class timezone(datetime.tzinfo): max: ClassVar[datetime.timezone] = ... min: ClassVar[datetime.timezone] = ... utc: ClassVar[datetime.timezone] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def dst(self, *args, **kwargs) -> Any: ... def fromutc(self, *args, **kwargs) -> Any: ... def tzname(self, *args, **kwargs) -> Any: ... diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index 1d6fe663269..2f18da13f63 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -1,14 +1,13 @@ from typing import ( Any, - ClassVar, + Callable, ) import pyarrow._fs -_stringify_path: function +_stringify_path: Callable class HadoopFileSystem(pyarrow._fs.FileSystem): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... @classmethod def _reconstruct(cls, typecls, kwargs) -> Any: ... diff --git a/pyarrow-stubs/_hdfsio.pyi b/pyarrow-stubs/_hdfsio.pyi index c42a083282f..bbaac6e66d2 100644 --- a/pyarrow-stubs/_hdfsio.pyi +++ b/pyarrow-stubs/_hdfsio.pyi @@ -1,11 +1,11 @@ import re from typing import ( Any, - Literal, overload, ) import pyarrow.lib +from typing_extensions import Literal _HDFS_PATH_RE: re.Pattern @@ -17,7 +17,7 @@ class HadoopFileSystem(pyarrow.lib._Weakrefable): port: int user: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def chmod(self, path: str, mode: int) -> Any: ... def chown(self, path: str, owner: str = ..., group: str = ...) -> Any: ... def close(self) -> Any: ... 
@@ -53,16 +53,15 @@ class HadoopFileSystem(pyarrow.lib._Weakrefable): class HdfsFile(pyarrow.lib.NativeFile): buffer_size: int - mode: str parent: _HdfsFileNanny | None @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... class _HdfsFileNanny(pyarrow.lib._Weakrefable): @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... diff --git a/pyarrow-stubs/_orc.pyi b/pyarrow-stubs/_orc.pyi index 272d2c1db40..13c41bdb1bd 100644 --- a/pyarrow-stubs/_orc.pyi +++ b/pyarrow-stubs/_orc.pyi @@ -6,7 +6,7 @@ _stringify_path: function class ORCReader(pyarrow.lib._Weakrefable): @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def compression(self) -> Any: ... def compression_size(self) -> Any: ... def content_length(self) -> Any: ... @@ -33,7 +33,7 @@ class ORCReader(pyarrow.lib._Weakrefable): class ORCWriter(pyarrow.lib._Weakrefable): @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def close(self) -> Any: ... def open(self, *args, **kwargs) -> Any: ... def write(self, Tabletable) -> Any: ... diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 993da5f7cfb..75a2120f14d 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -2,11 +2,10 @@ from typing import ( Any, ClassVar, Generator, - Literal, ) import pyarrow.lib -from pyarrow.parquet.core import FileDecryptionProperties +from typing_extensions import Literal _stringify_path: function indent: function @@ -206,12 +205,12 @@ class ParquetWriter(pyarrow.lib._Weakrefable): ) -> None: ... class RowGroupMetaData(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... + __hash__: ClassVar[None] = ... # type: ignore num_columns: Any num_rows: Any total_byte_size: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def column(self, inti) -> Any: ... def equals(self, RowGroupMetaDataother) -> Any: ... def to_dict(self) -> Any: ... @@ -224,8 +223,7 @@ class RowGroupMetaData(pyarrow.lib._Weakrefable): def __reduce__(self) -> Any: ... class Statistics(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... - __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore converted_type: Any distinct_count: Any has_distinct_count: Any @@ -240,7 +238,7 @@ class Statistics(pyarrow.lib._Weakrefable): num_values: Any physical_type: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def equals(self, Statisticsother) -> Any: ... def to_dict(self) -> Any: ... def __eq__(self, other) -> Any: ... @@ -253,18 +251,13 @@ class Statistics(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class FileDecryptionProperties: - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class FileEncryptionProperties: - __pyx_vtable__: ClassVar[PyCapsule] = ... - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... class FileMetaData(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... 
- __pyx_vtable__: ClassVar[PyCapsule] = ... + __hash__: ClassVar[None] = ... # type: ignore created_by: Any format_version: Any metadata: Any @@ -273,8 +266,7 @@ class FileMetaData(pyarrow.lib._Weakrefable): num_rows: Any schema: Any serialized_size: Any - @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def append_row_groups(self, FileMetaDataother) -> Any: ... def equals(self, FileMetaDataother) -> Any: ... def row_group(self, inti) -> Any: ... diff --git a/pyarrow-stubs/_parquet_encryption.pyi b/pyarrow-stubs/_parquet_encryption.pyi index 971365611f8..0c654725d77 100644 --- a/pyarrow-stubs/_parquet_encryption.pyi +++ b/pyarrow-stubs/_parquet_encryption.pyi @@ -9,7 +9,6 @@ import pyarrow.lib class ArrowException(Exception): ... class CryptoFactory(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... def __init__(self, *args, **kwargs) -> None: ... def file_decryption_properties( @@ -28,7 +27,6 @@ class CryptoFactory(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class DecryptionConfiguration(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... cache_lifetime: Any def __init__(self, *args, **kwargs) -> None: ... @@ -36,7 +34,6 @@ class DecryptionConfiguration(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class EncryptionConfiguration(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... cache_lifetime: Any column_keys: Any @@ -51,7 +48,6 @@ class EncryptionConfiguration(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class KmsClient(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... def __init__(self, *args, **kwargs) -> None: ... def unwrap_key(self, wrapped_key, master_key_identifier) -> Any: ... def wrap_key(self, key_bytes, master_key_identifier) -> Any: ... @@ -59,7 +55,6 @@ class KmsClient(pyarrow.lib._Weakrefable): def __setstate__(self, state) -> Any: ... class KmsConnectionConfig(pyarrow.lib._Weakrefable): - __pyx_vtable__: ClassVar[PyCapsule] = ... __slots__: ClassVar[tuple] = ... custom_kms_conf: Any key_access_token: Any @@ -78,7 +73,7 @@ class timedelta: microseconds: Any seconds: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def total_seconds(self, *args, **kwargs) -> Any: ... def __abs__(self) -> Any: ... def __add__(self, other) -> Any: ... 
diff --git a/pyarrow-stubs/_plasma.pyi b/pyarrow-stubs/_plasma.pyi index 1e519691a3e..a85a750cd52 100644 --- a/pyarrow-stubs/_plasma.pyi +++ b/pyarrow-stubs/_plasma.pyi @@ -1,14 +1,14 @@ -import collections.abc import socket from typing import ( Any, - ClassVar, - Literal, - TypedDict, overload, ) import pyarrow.lib +from typing_extensions import ( + Literal, + TypedDict, +) PLASMA_WAIT_TIMEOUT: int diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi index 1084d839aca..f9baf0fec8e 100644 --- a/pyarrow-stubs/_s3fs.pyi +++ b/pyarrow-stubs/_s3fs.pyi @@ -1,5 +1,5 @@ import enum -import importlib._bootstrap +import importlib._bootstrap # type: ignore from typing import ( Any, ClassVar, diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi index 7052f9bbac6..480162706aa 100644 --- a/pyarrow-stubs/feather.pyi +++ b/pyarrow-stubs/feather.pyi @@ -1,8 +1,5 @@ from io import IOBase -from typing import ( - Literal, - overload, -) +from typing import overload import pandas as pd from pyarrow._feather import FeatherError as FeatherError @@ -16,6 +13,7 @@ from pyarrow.lib import ( schema as schema, ) from pyarrow.vendored.version import Version as Version +from typing_extensions import Literal class FeatherDataset: paths: list[str] diff --git a/pyarrow-stubs/filesystem.pyi b/pyarrow-stubs/filesystem.pyi index ba4295305b9..41841ebec88 100644 --- a/pyarrow-stubs/filesystem.pyi +++ b/pyarrow-stubs/filesystem.pyi @@ -25,8 +25,8 @@ class FileSystem: self, path: str, columns: list[str] | None = ..., - metadata: parquet.FileMetaData | None = ..., - schema: parquet.ParquetSchema | None = ..., + metadata: parquet.FileMetaData | None = ..., # type: ignore + schema: parquet.ParquetSchema | None = ..., # type: ignore use_threads: bool = ..., use_pandas_metadata: bool = ..., ) -> Table: ... diff --git a/pyarrow-stubs/hdfs.pyi b/pyarrow-stubs/hdfs.pyi index 72481ac3e0b..b2800777f09 100644 --- a/pyarrow-stubs/hdfs.pyi +++ b/pyarrow-stubs/hdfs.pyi @@ -5,7 +5,7 @@ import pyarrow._hdfsio as _hdfsio from pyarrow.filesystem import FileSystem as FileSystem from pyarrow.util import implements as implements -class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): +class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): # type: ignore def __init__( self, host: str = ..., diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index bed60d912af..e2c5335a85c 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -2,7 +2,7 @@ import collections.abc import datetime as dt from decimal import Decimal import enum -import importlib._bootstrap +import importlib._bootstrap # type: ignore import io from os import PathLike from types import ModuleType @@ -101,6 +101,7 @@ cpp_version: str cpp_version_info: importlib._bootstrap.VersionInfo have_signal_refcycle: bool namedtuple: Callable +builtin_pickle: Callable class PyCapsule: ... @@ -472,7 +473,7 @@ class BaseExtensionType(DataType[_T]): ) -> ChunkedArray[_T, ExtensionScalar[_T]]: ... class BaseListArray(Array[list[_T], _Scalar]): - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def flatten(self) -> Array[_T, _Scalar]: ... def value_lengths(self) -> Int32Array: ... def value_parent_indices(self) -> Int64Array: ... @@ -709,7 +710,7 @@ class DictionaryArray(Array[dict, DictionaryScalar]): dictionary: Any indices: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def dictionary_decode(self: _Array) -> _Array: ... 
def dictionary_encode(self) -> DictionaryArray: ... # type: ignore @staticmethod @@ -1196,7 +1197,7 @@ class RecordBatch(_PandasConvertible): class RecordBatchReader(_Weakrefable): schema: Schema - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _export_to_c(self, out_ptr: int) -> None: ... @staticmethod def _import_from_c(in_ptr: int) -> RecordBatchReader: ... @@ -1865,7 +1866,7 @@ class _RecordBatchFileReader(_Weakrefable): schema: Any stats: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _open( self, source, @@ -1884,7 +1885,7 @@ class _RecordBatchFileReader(_Weakrefable): class _RecordBatchFileWriter(_RecordBatchStreamWriter): @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _open(self, sink, Schemaschema, IpcWriteOptionsoptions=...) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -1899,7 +1900,7 @@ class _RecordBatchStreamWriter(_CRecordBatchWriter): _metadata_version: Any _use_legacy_format: Any @classmethod - def __init__(cls, *args, **kwargs) -> None: ... + def __init__(self, *args, **kwargs) -> None: ... def _open(self, sink, Schemaschema, IpcWriteOptionsoptions=...) -> Any: ... def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... @@ -1916,7 +1917,7 @@ class _WriteStats(NamedTuple): class ordered_dict: def __init__(self, *args, **kwargs) -> None: ... def clear(self, *args, **kwargs) -> Any: ... - def copy(self) -> ashallowcopyofD: ... # noqa + def copy(self) -> dict: ... @classmethod def fromkeys(cls, *args, **kwargs) -> Any: ... def get(self, *args, **kwargs) -> Any: ... diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index b68e85ef179..472e318408a 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,7 +1,9 @@ from io import IOBase -from _fs import FileSystem -from pyarrow import _orc +from pyarrow._orc import ( + ORCReader as _ORCReader, + ORCWriter as _ORCWriter, +) from pyarrow.lib import ( KeyValueMetadata, NativeFile, @@ -10,8 +12,10 @@ from pyarrow.lib import ( Table, ) +from ._fs import FileSystem + class ORCFile: - reader: _orc.ORCReader + reader: _ORCReader def __init__(self, source: str | NativeFile | IOBase) -> None: ... @property def metadata(self) -> KeyValueMetadata: ... 
@@ -53,7 +57,7 @@ class ORCFile: class ORCWriter: __doc__: str is_open: bool - writer: _orc.ORCWriter + writer: _ORCWriter def __init__( self, where: str | NativeFile | IOBase, diff --git a/pyarrow-stubs/serialization.pyi b/pyarrow-stubs/serialization.pyi index 1b992aebdd7..677cb9f5249 100644 --- a/pyarrow-stubs/serialization.pyi +++ b/pyarrow-stubs/serialization.pyi @@ -5,7 +5,7 @@ from pyarrow.lib import ( ) try: - import cloudpickle + import cloudpickle # type: ignore except ImportError: cloudpickle = builtin_pickle diff --git a/pyproject.toml b/pyproject.toml index e5129cd0a72..462ae0aac46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,3 +49,6 @@ profile = "black" combine_as_imports = true force_grid_wrap = 2 force_sort_within_sections = true + +[tool.mypy] +python_version = 3.7 From a182e010eab2a7c972868990f0ef038a5115796c Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 17:55:10 +0800 Subject: [PATCH 019/231] bump 10.0.1.6 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 462ae0aac46..6a6fdec3907 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.5" +version = "10.0.1.6" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From 45909ac484a5280fca071f5e402eaf65fd1ac672 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Jan 2023 17:56:11 +0800 Subject: [PATCH 020/231] fix ci name --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a095f5b46ff..343fd557bfb 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,4 +1,4 @@ -name: Release VSCode Server Bin +name: Release on: push: From d2dce339ad270594cd14567344b8e55f516fe5c5 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 12 Dec 2023 13:43:11 +0800 Subject: [PATCH 021/231] Remove version restrictions for Python. 
--- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6a6fdec3907..d2b6069529f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,20 +19,19 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering", ] packages = [{ include = "pyarrow-stubs" }] [tool.poetry.dependencies] -python = "^3.7,<3.11" +python = "^3.7" [tool.poetry.dev-dependencies] black = ">=22.12.0" isort = ">=5.10.1" -numpy = "1.21.4" mypy = "^0.991" pre-commit = ">=2.19.0" -pyarrow = "10.0.1" pyright = ">=1.1.284" pytest = ">=7.1.2" typing-extensions = ">=4.2.0" From 736a6ea41084548e9e6cee540b970ec1f3b25b4e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 12 Dec 2023 13:45:40 +0800 Subject: [PATCH 022/231] release 10.0.1.7 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d2b6069529f..ddbd815528c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.6" +version = "10.0.1.7" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From ecfdb8c7c435f72a21adff04fd2033ce7eba2e97 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 12 Dec 2023 13:48:44 +0800 Subject: [PATCH 023/231] update poetry ci --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 343fd557bfb..de0d87b1833 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -16,7 +16,7 @@ jobs: python-version: "3.7" - uses: abatilo/actions-poetry@v2 with: - poetry-version: "1.1.15" + poetry-version: "1.4.2" - name: publish run: | poetry build -f wheel From c630066e9e513cde3c64c0d6b820848eaeeca2e0 Mon Sep 17 00:00:00 2001 From: Jim Bosch Date: Wed, 13 Dec 2023 18:38:59 -0500 Subject: [PATCH 024/231] Fix stubs for Table factory methods The main problem was that these were annotated as instance methods rather than static/class methods, but I've added some detail, too. --- pyarrow-stubs/lib.pyi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index e2c5335a85c..8f438f3e03c 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1585,8 +1585,10 @@ class Table(_PandasConvertible): null_selection_behavior: Literal["drop", "emit_null"] = ..., ) -> _Self: ... def flatten(self, memory_pool: MemoryPool | None = ...) -> Table: ... - def from_arrays(self, arrays, names=..., schema=..., metadata=...) -> Any: ... - def from_batches(self, batches, Schemaschema=...) -> Any: ... + @staticmethod + def from_arrays(arrays: list[Array], names: list[str] | None = None, schema: Schema | None = None, metadata: dict[str | bytes, str | bytes] | None = None) -> Table: ... + @staticmethod + def from_batches(batches: collections.abc.Iterable[RecordBatch], schema: Schema | None = None) -> Table: ... 
@classmethod def from_pandas( cls, From 8d5a1e0bfe2fc858fe71ac884031283ffee5e2a1 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 11:48:30 +0800 Subject: [PATCH 025/231] update pre-commit --- .gitignore | 2 +- .pre-commit-config.yaml | 64 ++++++++++++------------------ check-mypy.sh | 2 +- pyarrow-stubs/_dataset_orc.pyi | 1 - pyarrow-stubs/_dataset_parquet.pyi | 1 - pyarrow-stubs/_exec_plan.pyi | 1 - pyarrow-stubs/_fs.pyi | 1 + pyarrow-stubs/lib.pyi | 11 ++++- pyarrow-stubs/parquet/core.pyi | 2 +- pyproject.toml | 23 +++++------ 10 files changed, 48 insertions(+), 60 deletions(-) diff --git a/.gitignore b/.gitignore index f47c62c6efb..932deb9a12a 100644 --- a/.gitignore +++ b/.gitignore @@ -134,4 +134,4 @@ dmypy.json # Pyre type checker .pyre/ /poetry.lock -.idea/**/* \ No newline at end of file +.idea/**/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e0e0eac750c..80783980ce1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,31 @@ -minimum_pre_commit_version: 2.15.0 -default_language_version: - python: python3.7 ci: autofix_prs: false + +default_language_version: + python: python3.11 + repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-merge-conflict + - id: check-case-conflict + - id: check-toml + - id: check-yaml + - id: check-ast + - id: debug-statements + exclude: sunray/_internal/rpdb.py + - id: check-docstring-first + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.4 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + - repo: local hooks: - id: mypy @@ -13,41 +35,7 @@ repos: types_or: [python, pyi] require_serial: true additional_dependencies: - - mypy + - mypy==1.11.0 - types-cffi - numpy - pandas-stubs - - repo: https://github.com/python/black - rev: 22.12.0 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort - rev: 5.11.2 - hooks: - - id: isort - - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 - hooks: - - id: pyupgrade - types_or: [python, pyi] - types: [text] # overwrite types: [python] - args: [--py37-plus] - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - name: flake8 (py) - types: [python] - args: [--ignore=E501 F841] - - id: flake8 - name: flake8 (pyi) - additional_dependencies: - - flake8-pyi==22.11.0 - types: [pyi] - args: [ - --ignore=E301 E302 E305 E402 E501 E701 E704 F401 F811 W503 Y019 Y027 Y034 Y037 Y041 Y042, - # TypeVars in private files are already private - --per-file-ignores=_*.pyi:Y001, - # ignore private stub files - --per-file-ignores=_*.pyi:F821, - ] diff --git a/check-mypy.sh b/check-mypy.sh index 74ba4bb1e9d..25fae94bd73 100755 --- a/check-mypy.sh +++ b/check-mypy.sh @@ -1,2 +1,2 @@ #! /bin/bash -mypy pyarrow-stubs \ No newline at end of file +mypy pyarrow-stubs diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi index 8154e3fd998..5a4dcaec22e 100644 --- a/pyarrow-stubs/_dataset_orc.pyi +++ b/pyarrow-stubs/_dataset_orc.pyi @@ -1,6 +1,5 @@ from typing import ( Any, - ClassVar, ) import pyarrow._dataset diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index e9ca579d68c..e6b17d5b139 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -56,7 +56,6 @@ class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): def __setstate__(self, state) -> Any: ... 
class ParquetFragmentScanOptions(pyarrow._dataset.FragmentScanOptions): - __slots__: ClassVar[tuple] = ... buffer_size: Any pre_buffer: Any diff --git a/pyarrow-stubs/_exec_plan.pyi b/pyarrow-stubs/_exec_plan.pyi index e2208ff02d5..b06dadfab4b 100644 --- a/pyarrow-stubs/_exec_plan.pyi +++ b/pyarrow-stubs/_exec_plan.pyi @@ -1,6 +1,5 @@ from typing import ( Any, - ClassVar, ) import pyarrow._dataset diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 66355dfa34c..c07cb55e056 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -156,6 +156,7 @@ class FileType(enum.IntEnum): def __truediv__(self, other) -> Any: ... def __trunc__(self) -> Any: ... def __xor__(self, other) -> Any: ... + __new__: ClassVar[Callable] = ... Directory: ClassVar[importlib._bootstrap.FileType] = ... File: ClassVar[importlib._bootstrap.FileType] = ... diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 8f438f3e03c..74cb06ec443 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1586,9 +1586,16 @@ class Table(_PandasConvertible): ) -> _Self: ... def flatten(self, memory_pool: MemoryPool | None = ...) -> Table: ... @staticmethod - def from_arrays(arrays: list[Array], names: list[str] | None = None, schema: Schema | None = None, metadata: dict[str | bytes, str | bytes] | None = None) -> Table: ... + def from_arrays( + arrays: list[Array], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: dict[str | bytes, str | bytes] | None = None, + ) -> Table: ... @staticmethod - def from_batches(batches: collections.abc.Iterable[RecordBatch], schema: Schema | None = None) -> Table: ... + def from_batches( + batches: collections.abc.Iterable[RecordBatch], schema: Schema | None = None + ) -> Table: ... @classmethod def from_pandas( cls, diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index b52203ef8e8..7ba75582b90 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -38,7 +38,7 @@ from typing_extensions import ( ) def filters_to_expression( - filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] + filters: list[tuple[str, str, str] | list[tuple[str, str, str]]], ) -> Expression: ... 
class ParquetFile: diff --git a/pyproject.toml b/pyproject.toml index ddbd815528c..6e7ca02b8e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,26 +28,21 @@ packages = [{ include = "pyarrow-stubs" }] python = "^3.7" [tool.poetry.dev-dependencies] -black = ">=22.12.0" -isort = ">=5.10.1" mypy = "^0.991" pre-commit = ">=2.19.0" -pyright = ">=1.1.284" -pytest = ">=7.1.2" typing-extensions = ">=4.2.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" -[tool.black] -target-version = ['py37'] +[tool.ruff] +fix = true +line-length = 88 +target-version = "py37" -[tool.isort] -profile = "black" -combine_as_imports = true -force_grid_wrap = 2 -force_sort_within_sections = true - -[tool.mypy] -python_version = 3.7 +[tool.ruff.lint] +ignore = [ + "F811", # redefined-while-unused + "F821", # undefined-name +] From 13b887d7506b2d8fac32c33e6326cae3a97760bb Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 11:52:08 +0800 Subject: [PATCH 026/231] update --- .pre-commit-config.yaml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 80783980ce1..168c37a7250 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,16 +26,11 @@ repos: args: [--fix] - id: ruff-format - - repo: local + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.11.0 hooks: - id: mypy - name: mypy - entry: ./check-mypy.sh - language: python - types_or: [python, pyi] - require_serial: true additional_dependencies: - - mypy==1.11.0 - types-cffi - numpy - pandas-stubs From e6c7193e7d844c9afe9eb947bedfacc4cc1d45a2 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 11:58:31 +0800 Subject: [PATCH 027/231] fix: make fs.FileSystem.from_uri and hdfs.HadoopFileSystem.from_uri as classmethod --- pyarrow-stubs/_fs.pyi | 4 +++- pyarrow-stubs/_hdfs.pyi | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 66355dfa34c..6042c6641f3 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -56,7 +56,8 @@ class FileSystem(pyarrow.lib._Weakrefable): def delete_dir_contents(self, *args, **kwargs) -> Any: ... def delete_file(self, path) -> Any: ... def equals(self, FileSystemother) -> Any: ... - def from_uri(self, uri) -> Any: ... + @classmethod + def from_uri(cls, uri) -> FileSystem: ... def get_file_info(self, paths_or_selector) -> Any: ... def move(self, src, dest) -> Any: ... def normalize_path(self, path) -> Any: ... @@ -156,6 +157,7 @@ class FileType(enum.IntEnum): def __truediv__(self, other) -> Any: ... def __trunc__(self) -> Any: ... def __xor__(self, other) -> Any: ... + __new__: ClassVar[Callable] = ... Directory: ClassVar[importlib._bootstrap.FileType] = ... File: ClassVar[importlib._bootstrap.FileType] = ... diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index 2f18da13f63..39853b2f9ed 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -11,7 +11,8 @@ class HadoopFileSystem(pyarrow._fs.FileSystem): def __init__(self, *args, **kwargs) -> None: ... @classmethod def _reconstruct(cls, typecls, kwargs) -> Any: ... - def from_uri(self, uri) -> Any: ... + @classmethod + def from_uri(cls, uri) -> HadoopFileSystem: ... def __reduce__(self) -> Any: ... def frombytes(*args, **kwargs) -> Any: ... 
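As a quick sketch of the call style the corrected stubs above describe (not something the patch itself adds; the URI is a placeholder, and at runtime recent pyarrow releases return a (filesystem, path) pair from the base-class call even though the stub declares only the filesystem):

    from pyarrow import fs

    # from_uri is exposed on the class itself, so no instance is needed first.
    filesystem, path = fs.FileSystem.from_uri("s3://example-bucket/dataset")  # placeholder URI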
From cc84ec96a752d2990429c23dddfafd33ded8327a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 12:05:47 +0800 Subject: [PATCH 028/231] fix: fix read_metadata and read_schema wrong annotations (#11) --- pyarrow-stubs/parquet/core.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 7ba75582b90..d6f9cb071a1 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -400,13 +400,13 @@ def read_metadata( memory_map: bool = ..., decryption_properties: FileDecryptionProperties | None = ..., filesystem: Incomplete | None = ..., -): ... +) -> FileMetaData: ... def read_schema( where: str | IOBase, memory_map: bool = ..., decryption_properties: FileDecryptionProperties | None = ..., filesystem: FileSystem | None = ..., -) -> FileMetaData: ... +) -> Schema: ... # Names in __all__ with no definition: # _filters_to_expression From 0f5faf34c868ed5dc3aa0b2059e325648b6c5ff2 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 12:08:11 +0800 Subject: [PATCH 029/231] fix: typo S3FileSystem schema -> scheme (#12) --- pyarrow-stubs/_s3fs.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi index f9baf0fec8e..2dc1018a535 100644 --- a/pyarrow-stubs/_s3fs.pyi +++ b/pyarrow-stubs/_s3fs.pyi @@ -35,7 +35,7 @@ class S3FileSystem(pyarrow._fs.FileSystem): region: str = ..., request_timeout: float | None = ..., connect_timeout: float | None = ..., - schema: str = ..., + scheme: str = ..., endpoint_override: str | None = ..., background_writes: bool = ..., default_metadata: dict | pyarrow.lib.KeyValueMetadata = ..., From bf7ba5d086504752b76cb1ef32cd7d6dd7cb138a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 26 Jul 2024 12:10:05 +0800 Subject: [PATCH 030/231] bump version 10.0.1.8 (#13) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6e7ca02b8e5..4f00767876d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.7" +version = "10.0.1.8" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From 55eaf9a7fd4973d84d160b5cd6d33722f2369b82 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sat, 27 Jul 2024 19:00:49 +1000 Subject: [PATCH 031/231] . (#16) --- pyarrow-stubs/lib.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 74cb06ec443..0e9e55bcaf1 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -20,6 +20,7 @@ from typing import ( ValuesView, overload, ) +from typing_extensions import Buffer as _Buffer import _io # type: ignore import numpy as np @@ -491,7 +492,7 @@ class BooleanArray(Array[bool, BooleanScalar]): class BooleanScalar(Scalar[bool]): ... 
-class Buffer(_Weakrefable): +class Buffer(_Weakrefable, _Buffer): address: int is_cpu: bool is_mutable: bool From 75559585d34b6c9097717d64daef19a04927cd3c Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sat, 27 Jul 2024 21:22:44 +1000 Subject: [PATCH 032/231] make DataType hashable (#22) --- pyarrow-stubs/lib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 0e9e55bcaf1..b2450ad595d 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -683,6 +683,7 @@ class DataType(_Weakrefable, Generic[_T]): def field(self, i: int) -> Field: ... def to_pandas_dtype(self) -> DTypeLike: ... def __eq__(self, other) -> bool: ... + def __hash__(self) -> int: ... class Date32Array(NumericArray[dt.date, Date32Scalar]): ... class Date32Scalar(Scalar[dt.date]): ... From 83481e07f79b1952e170934c953d28efb3af3b53 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sat, 27 Jul 2024 21:22:58 +1000 Subject: [PATCH 033/231] pa.table support recordbatch (#20) --- pyarrow-stubs/lib.pyi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index b2450ad595d..d67803c4fc8 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -2207,6 +2207,10 @@ def table( df: pd.DataFrame, schema: Schema | None = ..., nthreads: int | None = ... ) -> Table: ... @overload +def table( + data: RecordBatch, schema: Schema | None = ..., nthreads: int | None = ... +) -> Table: ... +@overload def table( arrays: list[Array], schema: Schema, From 4bff9956952c6a4365d128a5f1c4690cfe9e9756 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sat, 27 Jul 2024 21:23:11 +1000 Subject: [PATCH 034/231] RecordBatchStreamReader supports next (#18) --- pyarrow-stubs/lib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index d67803c4fc8..d1e703a1ac8 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1906,6 +1906,7 @@ class _RecordBatchStreamReader(RecordBatchReader): def _open( self, source, IpcReadOptionsoptions=..., MemoryPoolmemory_pool=... ) -> Any: ... + def __next__(self) -> RecordBatch: ... class _RecordBatchStreamWriter(_CRecordBatchWriter): _metadata_version: Any From 9c44c01443b0eb48176524a2e5d7bbf85b9c0453 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sat, 27 Jul 2024 22:53:50 +1000 Subject: [PATCH 035/231] add RecordBatch.to_pylist (#23) --- pyarrow-stubs/lib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index d1e703a1ac8..e300dcb7940 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1187,6 +1187,7 @@ class RecordBatch(_PandasConvertible): | NDArray[np.signedinteger | np.unsignedinteger], ) -> _Self: ... def to_pydict(self) -> dict[str, list]: ... + def to_pylist(self) -> list[dict[str, Any]]: ... def to_string(self, show_metadata: bool = ...) -> str: ... def validate(self, *, full: bool = ...) -> None: ... def __eq__(self, other) -> bool: ... 
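Taken together, the four small stub additions above (hashable DataType, pa.table() accepting a RecordBatch, __next__ on the stream reader, and RecordBatch.to_pylist) describe behaviour pyarrow already has at runtime; a minimal, hedged sketch of the calls they cover (column name and values are made up):

    import pyarrow as pa

    batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})   # toy data
    table = pa.table(batch)                                 # pa.table() also accepts a RecordBatch
    rows = batch.to_pylist()                                # [{'x': 1}, {'x': 2}, {'x': 3}]
    by_type = {pa.int64(): "64-bit integers"}               # DataType can be used as a dict key

    # The stream reader yields RecordBatch objects via plain iteration / next():
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    reader = pa.ipc.open_stream(sink.getvalue())
    first = next(reader)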
From 44bd660e019a478470090b629ae357f9ff654fcd Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Sun, 28 Jul 2024 11:32:13 +1000 Subject: [PATCH 036/231] precise return types for to_pandas (#25) --- pyarrow-stubs/lib.pyi | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index e300dcb7940..60d0b53aee3 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -115,7 +115,7 @@ _T = TypeVar("_T") _T2 = TypeVar("_T2") _Scalar = TypeVar("_Scalar", bound=Scalar) -class Array(_PandasConvertible, Generic[_T, _Scalar]): +class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): _name: Any nbytes: int null_count: int @@ -540,7 +540,7 @@ class BuildInfo(NamedTuple): version: str version_info: str -class ChunkedArray(_PandasConvertible, Generic[_T, _Scalar]): +class ChunkedArray(_PandasConvertibleToSeries, Generic[_T, _Scalar]): _name: str | None chunks: list[Array[_T, _Scalar]] nbytes: int @@ -1110,7 +1110,7 @@ class PythonFile(NativeFile): class ReadStats(importlib._bootstrap.ReadStats): ... -class RecordBatch(_PandasConvertible): +class RecordBatch(_PandasConvertibleToDataFrame): columns: list[Array] nbytes: int num_columns: int @@ -1554,7 +1554,7 @@ class StructType(DataType): def __iter__(self) -> Generator[Field, None, None]: ... def __len__(self) -> int: ... -class Table(_PandasConvertible): +class Table(_PandasConvertibleToDataFrame): column_names: list[str] columns: list[Array] nbytes: int @@ -1843,7 +1843,7 @@ class _PandasAPIShim: def pandas_dtype(self, dtype: DTypeLike) -> DTypeLike: ... def series(self, *args, **kwargs) -> pd.Series: ... -class _PandasConvertible(_Weakrefable): +class _PandasConvertibleToDataFrame(_Weakrefable): def to_pandas( self, memory_pool: MemoryPool | None = ..., @@ -1861,7 +1861,27 @@ class _PandasConvertible(_Weakrefable): self_destruct: bool | None = ..., types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] | None = ..., - ) -> pd.Series | pd.DataFrame: ... + ) -> pd.DataFrame: ... + +class _PandasConvertibleToSeries(_Weakrefable): + def to_pandas( + self, + memory_pool: MemoryPool | None = ..., + categories: list[pd.Categorical] | None = ..., + strings_to_categorical: bool | None = ..., + zero_copy_only: bool | None = ..., + integer_object_nulls: bool | None = ..., + date_as_object: bool | None = ..., + timestamp_as_object: bool | None = ..., + use_threads: bool | None = ..., + deduplicate_objects: bool | None = ..., + ignore_metadata: bool | None = ..., + safe: bool | None = ..., + split_blocks: bool | None = ..., + self_destruct: bool | None = ..., + types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] + | None = ..., + ) -> pd.Series: ... class _ReadPandasMixin: def read_pandas(self, **options) -> Any: ... 
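The split into separate conversion base classes is easiest to read off a short, hedged example (assumes pandas is installed; the data is arbitrary):

    import pyarrow as pa

    table = pa.table({"x": [1, 2, 3]})
    df = table.to_pandas()            # Table / RecordBatch -> pandas.DataFrame
    series = table["x"].to_pandas()   # ChunkedArray / Array -> pandas.Series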
From b7a6c564a8ed70b91a4416e85f368b25b111f7fb Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 29 Jul 2024 10:15:27 +0800 Subject: [PATCH 037/231] bump version 10.0.1.9 (#26) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4f00767876d..1cc508c3b04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyarrow-stubs" -version = "10.0.1.8" +version = "10.0.1.9" description = "Type annotations for pyarrow" authors = ["ZhengYu, Xu "] license = "BSD-2-Clause" From d006ba766348ea75918f7f7df395c3ce521ee011 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:17:54 +0800 Subject: [PATCH 038/231] [pre-commit.ci] pre-commit autoupdate (#27) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 168c37a7250..c923058a1f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.5.5 hooks: - id: ruff args: [--fix] From d71693a20509d5a5fa96439fe597dc44719bea45 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:20:21 +0800 Subject: [PATCH 039/231] [pre-commit.ci] pre-commit autoupdate (#28) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c923058a1f8..195bfc79042 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,14 +20,14 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.6 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.0 + rev: v1.11.1 hooks: - id: mypy additional_dependencies: From 7e2078e901fadf990d893d9be316353441dd1655 Mon Sep 17 00:00:00 2001 From: Eugene Toder Date: Thu, 8 Aug 2024 22:33:06 -0400 Subject: [PATCH 040/231] Fix types in FlightDescriptor class (#29) * Fix types in FlightDescriptor class * Add argument types --- pyarrow-stubs/_flight.pyi | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index e6a5d1e5027..01bb7b23857 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -173,15 +173,17 @@ class FlightDataStream(pyarrow.lib._Weakrefable): class FlightDescriptor(pyarrow.lib._Weakrefable): __hash__: ClassVar[None] = ... # type: ignore - command: Any - descriptor_type: Any - path: Any + command: bytes | None + descriptor_type: DescriptorType + path: list[bytes] | None def __init__(self, *args, **kwargs) -> None: ... @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def for_command(self, command) -> Any: ... - def for_path(self, *path) -> Any: ... - def serialize(self) -> Any: ... + def deserialize(cls, serialized: str | bytes) -> FlightDescriptor: ... + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: ... + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: ... + def serialize(self) -> bytes: ... def __eq__(self, other) -> Any: ... def __ge__(self, other) -> Any: ... def __gt__(self, other) -> Any: ... 
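A minimal, hedged sketch of the usage the corrected FlightDescriptor annotations describe (the command bytes are a placeholder; nothing here is added by the patch itself):

    from pyarrow import flight

    desc = flight.FlightDescriptor.for_command(b"SELECT 1")   # static factory
    blob = desc.serialize()                                    # -> bytes
    restored = flight.FlightDescriptor.deserialize(blob)       # classmethod
    assert restored.command == b"SELECT 1"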
From b75064036e6bdfb9aea98caec41734320a5d8b23 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 Aug 2024 21:00:44 +0800 Subject: [PATCH 041/231] chore: update pre-commit config (#30) --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 195bfc79042..7d7dc94f285 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,6 @@ repos: - id: check-yaml - id: check-ast - id: debug-statements - exclude: sunray/_internal/rpdb.py - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit From 0be92647607de664f3b8608debfbfd7e0601b5e7 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 Aug 2024 22:36:24 +0800 Subject: [PATCH 042/231] build: use `pixi` to manage project (#31) --- .gitignore | 1 + pixi.lock | 1335 +++++++++++++++++++++++++ pyarrow-stubs/__init__.pyi | 512 +++++----- pyarrow-stubs/_compute.pyi | 25 +- pyarrow-stubs/_csv.pyi | 12 +- pyarrow-stubs/_dataset.pyi | 13 +- pyarrow-stubs/_dataset_orc.pyi | 4 +- pyarrow-stubs/_dataset_parquet.pyi | 14 +- pyarrow-stubs/_exec_plan.pyi | 4 +- pyarrow-stubs/_flight.pyi | 72 +- pyarrow-stubs/_fs.pyi | 19 +- pyarrow-stubs/_gcsfs.pyi | 7 +- pyarrow-stubs/_hdfs.pyi | 6 +- pyarrow-stubs/_hdfsio.pyi | 8 +- pyarrow-stubs/_json.pyi | 6 +- pyarrow-stubs/_parquet.pyi | 17 +- pyarrow-stubs/_parquet_encryption.pyi | 7 +- pyarrow-stubs/_plasma.pyi | 26 +- pyarrow-stubs/_s3fs.pyi | 7 +- pyarrow-stubs/_substrait.pyi | 14 +- pyarrow-stubs/compute.pyi | 142 ++- pyarrow-stubs/csv.pyi | 24 +- pyarrow-stubs/cuda.pyi | 22 +- pyarrow-stubs/dataset.pyi | 108 +- pyarrow-stubs/feather.pyi | 21 +- pyarrow-stubs/filesystem.pyi | 10 +- pyarrow-stubs/flight.pyi | 92 +- pyarrow-stubs/fs.pyi | 36 +- pyarrow-stubs/hdfs.pyi | 7 +- pyarrow-stubs/ipc.pyi | 47 +- pyarrow-stubs/json.pyi | 8 +- pyarrow-stubs/jvm.pyi | 12 +- pyarrow-stubs/lib.pyi | 221 ++-- pyarrow-stubs/orc.pyi | 18 +- pyarrow-stubs/pandas_compat.pyi | 21 +- pyarrow-stubs/parquet/core.pyi | 64 +- pyarrow-stubs/parquet/encryption.pyi | 12 +- pyarrow-stubs/plasma.pyi | 18 +- pyarrow-stubs/serialization.pyi | 8 +- pyarrow-stubs/substrait.pyi | 6 +- pyarrow-stubs/types.pyi | 10 +- pyarrow-stubs/util.pyi | 6 +- pyproject.toml | 66 +- 43 files changed, 2094 insertions(+), 994 deletions(-) create mode 100644 pixi.lock diff --git a/.gitignore b/.gitignore index 932deb9a12a..e3f1b4ea3b2 100644 --- a/.gitignore +++ b/.gitignore @@ -109,6 +109,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.pixi/ # Spyder project settings .spyderproject diff --git a/pixi.lock b/pixi.lock new file mode 100644 index 00000000000..1d360d255cf --- /dev/null +++ b/pixi.lock @@ -0,0 +1,1335 @@ +version: 5 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-hf3520f5_7.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.2-h59595ed_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.1.0-h77fa898_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.0-hde9e2c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.5-h2ad013b_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1c/21/a6b46c91b4c9d1918ee59c305f46850cde7cbea748635a352e7c3c8ed204/mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: . 
+ osx-64: + - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.7.4-h8857fd0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.2-h73e2aa4_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.0-h1b8f9f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-h87427d6_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-h5846eda_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.1-h87427d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.12.5-h37a9e06_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3a/34/69638cee2e87303f19a0c35e80d42757e14d9aba328f272fdcdc0bf3c9b8/mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/a4/10/1be32aeaab8728f78f673e7a47dd813222364479b2d6573dbcf0085e83ea/ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: . 
+ osx-arm64: + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.7.4-hf0a4a13_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.6.2-hebf3989_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.0-hfb93653_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-hfb2fe0b_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-hb89a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.1-hfb2fe0b_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.5-h30c5eda_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c4/3c/3e0611348fc53a4a7c80485959478b4f6eae706baf3b7c03cafa22639216/mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/3d/1d/c218ce83beb4394ba04d05e9aa2ae6ce9fba8405688fe878b0fdb40ce855/ruff-0.5.7-py3-none-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: . 
+ win-64: + - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.7.4-h56e8100_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.6.2-h63175ca_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.0-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.1-h2466b09_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.12.5-h889d299_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h8a93ad2_20.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1e/b7/3a50f318979c8c541428c2f1ee973cda813bcc89614de982dafdd0df2b3e/mypy-1.11.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/67/1c/4520c98bfc06b9c73cd1457686d4d3935d40046b1ddea08403e5a6deff51/ruff-0.5.7-py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: . 
+packages: +- kind: conda + name: _libgcc_mutex + version: '0.1' + build: conda_forge + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 + md5: d7c89558ba9fa0495403155b64376d81 + license: None + purls: [] + size: 2562 + timestamp: 1578324546067 +- kind: conda + name: _openmp_mutex + version: '4.5' + build: 2_gnu + build_number: 16 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 + md5: 73aaf86a425cc6e73fcf236a5a46396d + depends: + - _libgcc_mutex 0.1 conda_forge + - libgomp >=7.5.0 + constrains: + - openmp_impl 9999 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 23621 + timestamp: 1650670423406 +- kind: conda + name: bzip2 + version: 1.0.8 + build: h2466b09_7 + build_number: 7 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda + sha256: 35a5dad92e88fdd7fc405e864ec239486f4f31eec229e31686e61a140a8e573b + md5: 276e7ffe9ffe39688abc665ef0f45596 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 54927 + timestamp: 1720974860185 +- kind: conda + name: bzip2 + version: 1.0.8 + build: h4bc722e_7 + build_number: 7 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + sha256: 5ced96500d945fb286c9c838e54fa759aa04a7129c59800f0846b4335cee770d + md5: 62ee74e96c5ebb0af99386de58cf9553 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc-ng >=12 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 252783 + timestamp: 1720974456583 +- kind: conda + name: bzip2 + version: 1.0.8 + build: h99b78c6_7 + build_number: 7 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda + sha256: adfa71f158cbd872a36394c56c3568e6034aa55c623634b37a4836bd036e6b91 + md5: fc6948412dbbbe9a4c9ddbbcfe0a79ab + depends: + - __osx >=11.0 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 122909 + timestamp: 1720974522888 +- kind: conda + name: bzip2 + version: 1.0.8 + build: hfdf4475_7 + build_number: 7 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda + sha256: cad153608b81fb24fc8c509357daa9ae4e49dfc535b2cb49b91e23dbd68fc3c5 + md5: 7ed4301d437b59045be7e051a0308211 + depends: + - __osx >=10.13 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 134188 + timestamp: 1720974491916 +- kind: conda + name: ca-certificates + version: 2024.7.4 + build: h56e8100_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.7.4-h56e8100_0.conda + sha256: 7f37bb33c7954de1b4d19ad622859feb4f6c58f751c38b895524cad4e44af72e + md5: 9caa97c9504072cd060cf0a3142cc0ed + license: ISC + purls: [] + size: 154943 + timestamp: 1720077592592 +- kind: conda + name: ca-certificates + version: 2024.7.4 + build: h8857fd0_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.7.4-h8857fd0_0.conda + sha256: d16f46c489cb3192305c7d25b795333c5fc17bb0986de20598ed519f8c9cc9e4 + md5: 7df874a4b05b2d2b82826190170eaa0f + license: ISC + purls: [] + size: 154473 + timestamp: 1720077510541 +- kind: conda + name: ca-certificates + version: 2024.7.4 + build: hbcca054_0 + subdir: linux-64 + url: 
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda + sha256: c1548a3235376f464f9931850b64b02492f379b2f2bb98bc786055329b080446 + md5: 23ab7665c5f63cfb9f1f6195256daac6 + license: ISC + purls: [] + size: 154853 + timestamp: 1720077432978 +- kind: conda + name: ca-certificates + version: 2024.7.4 + build: hf0a4a13_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.7.4-hf0a4a13_0.conda + sha256: 33a61116dae7f369b6ce92a7f2a1ff361ae737c675a493b11feb5570b89e0e3b + md5: 21f9a33e5fe996189e470c19c5354dbe + license: ISC + purls: [] + size: 154517 + timestamp: 1720077468981 +- kind: pypi + name: cfgv + version: 3.4.0 + url: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + sha256: b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 + requires_python: '>=3.8' +- kind: pypi + name: distlib + version: 0.3.8 + url: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + sha256: 034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 +- kind: pypi + name: filelock + version: 3.15.4 + url: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + sha256: 6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 + requires_dist: + - furo>=2023.9.10 ; extra == 'docs' + - sphinx-autodoc-typehints!=1.23.4,>=1.25.2 ; extra == 'docs' + - sphinx>=7.2.6 ; extra == 'docs' + - covdefaults>=2.3 ; extra == 'testing' + - coverage>=7.3.2 ; extra == 'testing' + - diff-cover>=8.0.1 ; extra == 'testing' + - pytest-asyncio>=0.21 ; extra == 'testing' + - pytest-cov>=4.1 ; extra == 'testing' + - pytest-mock>=3.12 ; extra == 'testing' + - pytest-timeout>=2.2 ; extra == 'testing' + - pytest>=7.4.3 ; extra == 'testing' + - virtualenv>=20.26.2 ; extra == 'testing' + - typing-extensions>=4.8 ; python_version < '3.11' and extra == 'typing' + requires_python: '>=3.8' +- kind: pypi + name: identify + version: 2.6.0 + url: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + sha256: e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0 + requires_dist: + - ukkonen ; extra == 'license' + requires_python: '>=3.8' +- kind: conda + name: ld_impl_linux-64 + version: '2.40' + build: hf3520f5_7 + build_number: 7 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-hf3520f5_7.conda + sha256: 764b6950aceaaad0c67ef925417594dd14cd2e22fff864aeef455ac259263d15 + md5: b80f2f396ca2c28b8c14c437a4ed1e74 + constrains: + - binutils_impl_linux-64 2.40 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 707602 + timestamp: 1718625640445 +- kind: conda + name: libexpat + version: 2.6.2 + build: h59595ed_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.2-h59595ed_0.conda + sha256: 331bb7c7c05025343ebd79f86ae612b9e1e74d2687b8f3179faec234f986ce19 + md5: e7ba12deb7020dd080c6c70e7b6f6a3d + depends: + - libgcc-ng >=12 + constrains: + - expat 2.6.2.* + license: MIT + license_family: MIT + purls: [] + size: 73730 + timestamp: 1710362120304 +- kind: conda + name: libexpat + version: 2.6.2 + build: h63175ca_0 + subdir: win-64 + url: 
https://conda.anaconda.org/conda-forge/win-64/libexpat-2.6.2-h63175ca_0.conda + sha256: 79f612f75108f3e16bbdc127d4885bb74729cf66a8702fca0373dad89d40c4b7 + md5: bc592d03f62779511d392c175dcece64 + constrains: + - expat 2.6.2.* + license: MIT + license_family: MIT + purls: [] + size: 139224 + timestamp: 1710362609641 +- kind: conda + name: libexpat + version: 2.6.2 + build: h73e2aa4_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.2-h73e2aa4_0.conda + sha256: a188a77b275d61159a32ab547f7d17892226e7dac4518d2c6ac3ac8fc8dfde92 + md5: 3d1d51c8f716d97c864d12f7af329526 + constrains: + - expat 2.6.2.* + license: MIT + license_family: MIT + purls: [] + size: 69246 + timestamp: 1710362566073 +- kind: conda + name: libexpat + version: 2.6.2 + build: hebf3989_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.6.2-hebf3989_0.conda + sha256: ba7173ac30064ea901a4c9fb5a51846dcc25512ceb565759be7d18cbf3e5415e + md5: e3cde7cfa87f82f7cb13d482d5e0ad09 + constrains: + - expat 2.6.2.* + license: MIT + license_family: MIT + purls: [] + size: 63655 + timestamp: 1710362424980 +- kind: conda + name: libffi + version: 3.4.2 + build: h0d85af4_5 + build_number: 5 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 + sha256: 7a2d27a936ceee6942ea4d397f9c7d136f12549d86f7617e8b6bad51e01a941f + md5: ccb34fb14960ad8b125962d3d79b31a9 + license: MIT + license_family: MIT + purls: [] + size: 51348 + timestamp: 1636488394370 +- kind: conda + name: libffi + version: 3.4.2 + build: h3422bc3_5 + build_number: 5 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 + sha256: 41b3d13efb775e340e4dba549ab5c029611ea6918703096b2eaa9c015c0750ca + md5: 086914b672be056eb70fd4285b6783b6 + license: MIT + license_family: MIT + purls: [] + size: 39020 + timestamp: 1636488587153 +- kind: conda + name: libffi + version: 3.4.2 + build: h7f98852_5 + build_number: 5 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 + sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e + md5: d645c6d2ac96843a2bfaccd2d62b3ac3 + depends: + - libgcc-ng >=9.4.0 + license: MIT + license_family: MIT + purls: [] + size: 58292 + timestamp: 1636488182923 +- kind: conda + name: libffi + version: 3.4.2 + build: h8ffe710_5 + build_number: 5 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 + sha256: 1951ab740f80660e9bc07d2ed3aefb874d78c107264fd810f24a1a6211d4b1a5 + md5: 2c96d1b6915b408893f9472569dee135 + depends: + - vc >=14.1,<15.0a0 + - vs2015_runtime >=14.16.27012 + license: MIT + license_family: MIT + purls: [] + size: 42063 + timestamp: 1636489106777 +- kind: conda + name: libgcc-ng + version: 14.1.0 + build: h77fa898_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda + sha256: b8e869ac96591cda2704bf7e77a301025e405227791a0bddf14a3dac65125538 + md5: ca0fad6a41ddaef54a153b78eccb5037 + depends: + - _libgcc_mutex 0.1 conda_forge + - _openmp_mutex >=4.5 + constrains: + - libgomp 14.1.0 h77fa898_0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 842109 + timestamp: 1719538896937 +- kind: conda + name: libgomp + version: 14.1.0 + build: h77fa898_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.1.0-h77fa898_0.conda + sha256: 
7699df61a1f6c644b3576a40f54791561f2845983120477a16116b951c9cdb05 + md5: ae061a5ed5f05818acdf9adab72c146d + depends: + - _libgcc_mutex 0.1 conda_forge + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 456925 + timestamp: 1719538796073 +- kind: conda + name: libnsl + version: 2.0.1 + build: hd590300_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda + sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 + md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 + depends: + - libgcc-ng >=12 + license: LGPL-2.1-only + license_family: GPL + purls: [] + size: 33408 + timestamp: 1697359010159 +- kind: conda + name: libsqlite + version: 3.46.0 + build: h1b8f9f3_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.0-h1b8f9f3_0.conda + sha256: 63af1a9e3284c7e4952364bafe7267e41e2d9d8bcc0e85a4ea4b0ec02d3693f6 + md5: 5dadfbc1a567fe6e475df4ce3148be09 + depends: + - __osx >=10.13 + - libzlib >=1.2.13,<2.0a0 + license: Unlicense + purls: [] + size: 908643 + timestamp: 1718050720117 +- kind: conda + name: libsqlite + version: 3.46.0 + build: h2466b09_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.0-h2466b09_0.conda + sha256: 662bd7e0d63c5b8c31cca19b91649e798319b93568a2ba8d1375efb91eeb251b + md5: 951b0a3a463932e17414cd9f047fa03d + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: Unlicense + purls: [] + size: 876677 + timestamp: 1718051113874 +- kind: conda + name: libsqlite + version: 3.46.0 + build: hde9e2c9_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.0-hde9e2c9_0.conda + sha256: daee3f68786231dad457d0dfde3f7f1f9a7f2018adabdbb864226775101341a8 + md5: 18aa975d2094c34aef978060ae7da7d8 + depends: + - libgcc-ng >=12 + - libzlib >=1.2.13,<2.0a0 + license: Unlicense + purls: [] + size: 865346 + timestamp: 1718050628718 +- kind: conda + name: libsqlite + version: 3.46.0 + build: hfb93653_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.0-hfb93653_0.conda + sha256: 73048f9cb8647d3d3bfe6021c0b7d663e12cffbe9b4f31bd081e713b0a9ad8f9 + md5: 12300188028c9bc02da965128b91b517 + depends: + - __osx >=11.0 + - libzlib >=1.2.13,<2.0a0 + license: Unlicense + purls: [] + size: 830198 + timestamp: 1718050644825 +- kind: conda + name: libuuid + version: 2.38.1 + build: h0b41bf4_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 + md5: 40b61aab5c7ba9ff276c41cfffe6b80b + depends: + - libgcc-ng >=12 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 33601 + timestamp: 1680112270483 +- kind: conda + name: libxcrypt + version: 4.4.36 + build: hd590300_1 + build_number: 1 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c + md5: 5aa797f8787fe7a17d1b0821485b5adc + depends: + - libgcc-ng >=12 + license: LGPL-2.1-or-later + purls: [] + size: 100393 + timestamp: 1702724383534 +- kind: conda + name: libzlib + version: 1.3.1 + build: h2466b09_1 + build_number: 1 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_1.conda + sha256: b13846a54a15243e15f96fec06b526d8155adc6a1ac2b6ed47a88f6a71a94b68 + md5: 
d4483ca8afc57ddf1f6dded53b36c17f + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + constrains: + - zlib 1.3.1 *_1 + license: Zlib + license_family: Other + purls: [] + size: 56186 + timestamp: 1716874730539 +- kind: conda + name: libzlib + version: 1.3.1 + build: h4ab18f5_1 + build_number: 1 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda + sha256: adf6096f98b537a11ae3729eaa642b0811478f0ea0402ca67b5108fe2cb0010d + md5: 57d7dc60e9325e3de37ff8dffd18e814 + depends: + - libgcc-ng >=12 + constrains: + - zlib 1.3.1 *_1 + license: Zlib + license_family: Other + purls: [] + size: 61574 + timestamp: 1716874187109 +- kind: conda + name: libzlib + version: 1.3.1 + build: h87427d6_1 + build_number: 1 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-h87427d6_1.conda + sha256: 80a62db652b1da0ccc100812a1d86e94f75028968991bfb17f9536f3aa72d91d + md5: b7575b5aa92108dcc9aaab0f05f2dbce + depends: + - __osx >=10.13 + constrains: + - zlib 1.3.1 *_1 + license: Zlib + license_family: Other + purls: [] + size: 57372 + timestamp: 1716874211519 +- kind: conda + name: libzlib + version: 1.3.1 + build: hfb2fe0b_1 + build_number: 1 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-hfb2fe0b_1.conda + sha256: c34365dd37b0eab27b9693af32a1f7f284955517c2cc91f1b88a7ef4738ff03e + md5: 636077128927cf79fd933276dc3aed47 + depends: + - __osx >=11.0 + constrains: + - zlib 1.3.1 *_1 + license: Zlib + license_family: Other + purls: [] + size: 46921 + timestamp: 1716874262512 +- kind: pypi + name: mypy + version: 1.11.1 + url: https://files.pythonhosted.org/packages/1c/21/a6b46c91b4c9d1918ee59c305f46850cde7cbea748635a352e7c3c8ed204/mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl + sha256: b868d3bcff720dd7217c383474008ddabaf048fad8d78ed948bb4b624870a417 + requires_dist: + - typing-extensions>=4.6.0 + - mypy-extensions>=1.0.0 + - tomli>=1.1.0 ; python_version < '3.11' + - psutil>=4.0 ; extra == 'dmypy' + - pip ; extra == 'install-types' + - setuptools>=50 ; extra == 'mypyc' + - lxml ; extra == 'reports' + requires_python: '>=3.8' +- kind: pypi + name: mypy + version: 1.11.1 + url: https://files.pythonhosted.org/packages/1e/b7/3a50f318979c8c541428c2f1ee973cda813bcc89614de982dafdd0df2b3e/mypy-1.11.1-cp312-cp312-win_amd64.whl + sha256: 64f4a90e3ea07f590c5bcf9029035cf0efeae5ba8be511a8caada1a4893f5525 + requires_dist: + - typing-extensions>=4.6.0 + - mypy-extensions>=1.0.0 + - tomli>=1.1.0 ; python_version < '3.11' + - psutil>=4.0 ; extra == 'dmypy' + - pip ; extra == 'install-types' + - setuptools>=50 ; extra == 'mypyc' + - lxml ; extra == 'reports' + requires_python: '>=3.8' +- kind: pypi + name: mypy + version: 1.11.1 + url: https://files.pythonhosted.org/packages/3a/34/69638cee2e87303f19a0c35e80d42757e14d9aba328f272fdcdc0bf3c9b8/mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl + sha256: f39918a50f74dc5969807dcfaecafa804fa7f90c9d60506835036cc1bc891dc8 + requires_dist: + - typing-extensions>=4.6.0 + - mypy-extensions>=1.0.0 + - tomli>=1.1.0 ; python_version < '3.11' + - psutil>=4.0 ; extra == 'dmypy' + - pip ; extra == 'install-types' + - setuptools>=50 ; extra == 'mypyc' + - lxml ; extra == 'reports' + requires_python: '>=3.8' +- kind: pypi + name: mypy + version: 1.11.1 + url: https://files.pythonhosted.org/packages/c4/3c/3e0611348fc53a4a7c80485959478b4f6eae706baf3b7c03cafa22639216/mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl + 
sha256: 0bc71d1fb27a428139dd78621953effe0d208aed9857cb08d002280b0422003a + requires_dist: + - typing-extensions>=4.6.0 + - mypy-extensions>=1.0.0 + - tomli>=1.1.0 ; python_version < '3.11' + - psutil>=4.0 ; extra == 'dmypy' + - pip ; extra == 'install-types' + - setuptools>=50 ; extra == 'mypyc' + - lxml ; extra == 'reports' + requires_python: '>=3.8' +- kind: pypi + name: mypy-extensions + version: 1.0.0 + url: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl + sha256: 4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d + requires_python: '>=3.5' +- kind: conda + name: ncurses + version: '6.5' + build: h5846eda_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-h5846eda_0.conda + sha256: 6ecc73db0e49143092c0934355ac41583a5d5a48c6914c5f6ca48e562d3a4b79 + md5: 02a888433d165c99bf09784a7b14d900 + license: X11 AND BSD-3-Clause + purls: [] + size: 823601 + timestamp: 1715195267791 +- kind: conda + name: ncurses + version: '6.5' + build: h59595ed_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda + sha256: 4fc3b384f4072b68853a0013ea83bdfd3d66b0126e2238e1d6e1560747aa7586 + md5: fcea371545eda051b6deafb24889fc69 + depends: + - libgcc-ng >=12 + license: X11 AND BSD-3-Clause + purls: [] + size: 887465 + timestamp: 1715194722503 +- kind: conda + name: ncurses + version: '6.5' + build: hb89a1cb_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-hb89a1cb_0.conda + sha256: 87d7cf716d9d930dab682cb57b3b8d3a61940b47d6703f3529a155c938a6990a + md5: b13ad5724ac9ae98b6b4fd87e4500ba4 + license: X11 AND BSD-3-Clause + purls: [] + size: 795131 + timestamp: 1715194898402 +- kind: pypi + name: nodeenv + version: 1.9.1 + url: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + sha256: ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' +- kind: conda + name: openssl + version: 3.3.1 + build: h2466b09_2 + build_number: 2 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.1-h2466b09_2.conda + sha256: d86c4fa31294ad9068717788197e97e5637e056c82745ffb6d0e88fd1fef1a9d + md5: 375dbc2a4d5a2e4c738703207e8e368b + depends: + - ca-certificates + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + constrains: + - pyopenssl >=22.1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 8385012 + timestamp: 1721197465883 +- kind: conda + name: openssl + version: 3.3.1 + build: h4bc722e_2 + build_number: 2 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4bc722e_2.conda + sha256: b294b3cc706ad1048cdb514f0db3da9f37ae3fcc0c53a7104083dd0918adb200 + md5: e1b454497f9f7c1147fdde4b53f1b512 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc-ng >=12 + constrains: + - pyopenssl >=22.1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 2895213 + timestamp: 1721194688955 +- kind: conda + name: openssl + version: 3.3.1 + build: h87427d6_2 + build_number: 2 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.1-h87427d6_2.conda + sha256: 3cb0c05fbfd8cdb9b767396fc0e0af2d78eb4d68592855481254104330d4a4eb + md5: 3f3dbeedbee31e257866407d9dea1ff5 + depends: + - __osx 
>=10.13 + - ca-certificates + constrains: + - pyopenssl >=22.1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 2552939 + timestamp: 1721194674491 +- kind: conda + name: openssl + version: 3.3.1 + build: hfb2fe0b_2 + build_number: 2 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.1-hfb2fe0b_2.conda + sha256: dd7d988636f74473ebdfe15e05c5aabdb53a1d2a846c839d62289b0c37f81548 + md5: 9b551a504c1cc8f8b7b22c01814da8ba + depends: + - __osx >=11.0 + - ca-certificates + constrains: + - pyopenssl >=22.1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 2899682 + timestamp: 1721194599446 +- kind: pypi + name: platformdirs + version: 4.2.2 + url: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + sha256: 2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee + requires_dist: + - furo>=2023.9.10 ; extra == 'docs' + - proselint>=0.13 ; extra == 'docs' + - sphinx-autodoc-typehints>=1.25.2 ; extra == 'docs' + - sphinx>=7.2.6 ; extra == 'docs' + - appdirs==1.4.4 ; extra == 'test' + - covdefaults>=2.3 ; extra == 'test' + - pytest-cov>=4.1 ; extra == 'test' + - pytest-mock>=3.12 ; extra == 'test' + - pytest>=7.4.3 ; extra == 'test' + - mypy>=1.8 ; extra == 'type' + requires_python: '>=3.8' +- kind: pypi + name: pre-commit + version: 3.8.0 + url: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + sha256: 9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f + requires_dist: + - cfgv>=2.0.0 + - identify>=1.0.0 + - nodeenv>=0.11.1 + - pyyaml>=5.1 + - virtualenv>=20.10.0 + requires_python: '>=3.9' +- kind: pypi + name: pyarrow-stubs + version: 10.0.1.9 + path: . 
+ sha256: c0cea94bf2145eb3466967cc8adefb3ffa864d34a3a63ea52aaecab16efea076 + requires_python: '>=3.8,<4' + editable: true +- kind: conda + name: python + version: 3.12.5 + build: h2ad013b_0_cpython + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.5-h2ad013b_0_cpython.conda + sha256: e2aad83838988725d4ffba4e9717b9328054fd18a668cff3377e0c50f109e8bd + md5: 9c56c4df45f6571b13111d8df2448692 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.6.2,<3.0a0 + - libffi >=3.4,<4.0a0 + - libgcc-ng >=12 + - libnsl >=2.0.1,<2.1.0a0 + - libsqlite >=3.46.0,<4.0a0 + - libuuid >=2.38.1,<3.0a0 + - libxcrypt >=4.4.36 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.3.1,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - xz >=5.2.6,<6.0a0 + constrains: + - python_abi 3.12.* *_cp312 + license: Python-2.0 + purls: [] + size: 31663253 + timestamp: 1723143721353 +- kind: conda + name: python + version: 3.12.5 + build: h30c5eda_0_cpython + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.5-h30c5eda_0_cpython.conda + sha256: 1319e918fb54c9491832a9731cad00235a76f61c6f9b23fc0f70cdfb74c950ea + md5: 5e315581e2948dfe3bcac306540e9803 + depends: + - __osx >=11.0 + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.6.2,<3.0a0 + - libffi >=3.4,<4.0a0 + - libsqlite >=3.46.0,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.3.1,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - xz >=5.2.6,<6.0a0 + constrains: + - python_abi 3.12.* *_cp312 + license: Python-2.0 + purls: [] + size: 12926356 + timestamp: 1723142203193 +- kind: conda + name: python + version: 3.12.5 + build: h37a9e06_0_cpython + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/python-3.12.5-h37a9e06_0_cpython.conda + sha256: c0f39e625b2fd65f70a9cc086fe4b25cc72228453dbbcd92cd5d140d080e38c5 + md5: 517cb4e16466f8d96ba2a72897d14c48 + depends: + - __osx >=10.13 + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.6.2,<3.0a0 + - libffi >=3.4,<4.0a0 + - libsqlite >=3.46.0,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.3.1,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - xz >=5.2.6,<6.0a0 + constrains: + - python_abi 3.12.* *_cp312 + license: Python-2.0 + purls: [] + size: 12173272 + timestamp: 1723142761765 +- kind: conda + name: python + version: 3.12.5 + build: h889d299_0_cpython + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/python-3.12.5-h889d299_0_cpython.conda + sha256: 4cef304eb8877fd3094c14b57097ccc1b817b4afbf2223dd45d2b61e44064740 + md5: db056d8b140ab2edd56a2f9bdb203dcd + depends: + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.6.2,<3.0a0 + - libffi >=3.4,<4.0a0 + - libsqlite >=3.46.0,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.3.1,<4.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + - xz >=5.2.6,<6.0a0 + constrains: + - python_abi 3.12.* *_cp312 + license: Python-2.0 + purls: [] + size: 15897752 + timestamp: 1723141830317 +- kind: pypi + name: pyyaml + version: 6.0.2 + url: https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl + sha256: 7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 + requires_python: '>=3.8' +- kind: pypi + name: pyyaml + version: 6.0.2 + url: 
https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl + sha256: c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab + requires_python: '>=3.8' +- kind: pypi + name: pyyaml + version: 6.0.2 + url: https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl + sha256: ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 + requires_python: '>=3.8' +- kind: pypi + name: pyyaml + version: 6.0.2 + url: https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 + requires_python: '>=3.8' +- kind: conda + name: readline + version: '8.2' + build: h8228510_1 + build_number: 1 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda + sha256: 5435cf39d039387fbdc977b0a762357ea909a7694d9528ab40f005e9208744d7 + md5: 47d31b792659ce70f470b5c82fdfb7a4 + depends: + - libgcc-ng >=12 + - ncurses >=6.3,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 281456 + timestamp: 1679532220005 +- kind: conda + name: readline + version: '8.2' + build: h92ec313_1 + build_number: 1 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda + sha256: a1dfa679ac3f6007362386576a704ad2d0d7a02e98f5d0b115f207a2da63e884 + md5: 8cbb776a2f641b943d413b3e19df71f4 + depends: + - ncurses >=6.3,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 250351 + timestamp: 1679532511311 +- kind: conda + name: readline + version: '8.2' + build: h9e318b2_1 + build_number: 1 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda + sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568 + md5: f17f77f2acf4d344734bda76829ce14e + depends: + - ncurses >=6.3,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 255870 + timestamp: 1679532707590 +- kind: pypi + name: ruff + version: 0.5.7 + url: https://files.pythonhosted.org/packages/3d/1d/c218ce83beb4394ba04d05e9aa2ae6ce9fba8405688fe878b0fdb40ce855/ruff-0.5.7-py3-none-macosx_11_0_arm64.whl + sha256: eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e + requires_python: '>=3.7' +- kind: pypi + name: ruff + version: 0.5.7 + url: https://files.pythonhosted.org/packages/67/1c/4520c98bfc06b9c73cd1457686d4d3935d40046b1ddea08403e5a6deff51/ruff-0.5.7-py3-none-win_amd64.whl + sha256: 083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3 + requires_python: '>=3.7' +- kind: pypi + name: ruff + version: 0.5.7 + url: https://files.pythonhosted.org/packages/a4/10/1be32aeaab8728f78f673e7a47dd813222364479b2d6573dbcf0085e83ea/ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl + sha256: 00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be + requires_python: '>=3.7' +- kind: pypi + name: ruff + version: 0.5.7 + url: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e + requires_python: '>=3.7' +- kind: conda + name: tk + version: 8.6.13 + build: h1abcd95_1 + build_number: 1 + 
subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda + sha256: 30412b2e9de4ff82d8c2a7e5d06a15f4f4fef1809a72138b6ccb53a33b26faf5 + md5: bf830ba5afc507c6232d4ef0fb1a882d + depends: + - libzlib >=1.2.13,<2.0.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3270220 + timestamp: 1699202389792 +- kind: conda + name: tk + version: 8.6.13 + build: h5083fa2_1 + build_number: 1 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda + sha256: 72457ad031b4c048e5891f3f6cb27a53cb479db68a52d965f796910e71a403a8 + md5: b50a57ba89c32b62428b71a875291c9b + depends: + - libzlib >=1.2.13,<2.0.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3145523 + timestamp: 1699202432999 +- kind: conda + name: tk + version: 8.6.13 + build: h5226925_1 + build_number: 1 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda + sha256: 2c4e914f521ccb2718946645108c9bd3fc3216ba69aea20c2c3cedbd8db32bb1 + md5: fc048363eb8f03cd1737600a5d08aafe + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: TCL + license_family: BSD + purls: [] + size: 3503410 + timestamp: 1699202577803 +- kind: conda + name: tk + version: 8.6.13 + build: noxft_h4845f30_101 + build_number: 101 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda + sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e + md5: d453b98d9c83e71da0741bb0ff4d76bc + depends: + - libgcc-ng >=12 + - libzlib >=1.2.13,<2.0.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3318875 + timestamp: 1699202167581 +- kind: pypi + name: typing-extensions + version: 4.12.2 + url: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl + sha256: 04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d + requires_python: '>=3.8' +- kind: conda + name: tzdata + version: 2024a + build: h0c530f3_0 + subdir: noarch + noarch: generic + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + sha256: 7b2b69c54ec62a243eb6fba2391b5e443421608c3ae5dbff938ad33ca8db5122 + md5: 161081fc7cec0bfda0d86d7cb595f8d8 + license: LicenseRef-Public-Domain + purls: [] + size: 119815 + timestamp: 1706886945727 +- kind: conda + name: ucrt + version: 10.0.22621.0 + build: h57928b3_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2 + sha256: f29cdaf8712008f6b419b8b1a403923b00ab2504bfe0fb2ba8eb60e72d4f14c6 + md5: 72608f6cd3e5898229c3ea16deb1ac43 + constrains: + - vs2015_runtime >=14.29.30037 + license: LicenseRef-Proprietary + license_family: PROPRIETARY + purls: [] + size: 1283972 + timestamp: 1666630199266 +- kind: conda + name: vc + version: '14.3' + build: h8a93ad2_20 + build_number: 20 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h8a93ad2_20.conda + sha256: 23ac5feb15a9adf3ab2b8c4dcd63650f8b7ae860c5ceb073e49cf71d203eddef + md5: 8558f367e1d7700554f7cdb823c46faf + depends: + - vc14_runtime >=14.40.33810 + track_features: + - vc14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 17391 + timestamp: 1717709040616 +- kind: conda + name: vc14_runtime + version: 14.40.33810 + build: ha82c5b3_20 + build_number: 20 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda 
+ sha256: af3cfa347e3d7c1277e9b964b0849a9a9f095bff61836cb3c3a89862fbc32e17 + md5: e39cc4c34c53654ec939558993d9dc5b + depends: + - ucrt >=10.0.20348.0 + constrains: + - vs2015_runtime 14.40.33810.* *_20 + license: LicenseRef-ProprietaryMicrosoft + license_family: Proprietary + purls: [] + size: 751934 + timestamp: 1717709031266 +- kind: pypi + name: virtualenv + version: 20.26.3 + url: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + sha256: 8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 + requires_dist: + - distlib<1,>=0.3.7 + - filelock<4,>=3.12.2 + - importlib-metadata>=6.6 ; python_version < '3.8' + - platformdirs<5,>=3.9.1 + - furo>=2023.7.26 ; extra == 'docs' + - proselint>=0.13 ; extra == 'docs' + - sphinx!=7.3,>=7.1.2 ; extra == 'docs' + - sphinx-argparse>=0.4 ; extra == 'docs' + - sphinxcontrib-towncrier>=0.2.1a0 ; extra == 'docs' + - towncrier>=23.6 ; extra == 'docs' + - covdefaults>=2.3 ; extra == 'test' + - coverage-enable-subprocess>=1 ; extra == 'test' + - coverage>=7.2.7 ; extra == 'test' + - flaky>=3.7 ; extra == 'test' + - packaging>=23.1 ; extra == 'test' + - pytest-env>=0.8.2 ; extra == 'test' + - pytest-freezer>=0.4.8 ; (platform_python_implementation == 'PyPy' or (platform_python_implementation == 'CPython' and sys_platform == 'win32' and python_version >= '3.13')) and extra == 'test' + - pytest-mock>=3.11.1 ; extra == 'test' + - pytest-randomly>=3.12 ; extra == 'test' + - pytest-timeout>=2.1 ; extra == 'test' + - pytest>=7.4 ; extra == 'test' + - setuptools>=68 ; extra == 'test' + - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' + requires_python: '>=3.7' +- kind: conda + name: vs2015_runtime + version: 14.40.33810 + build: h3bf8584_20 + build_number: 20 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda + sha256: 0c2803f7a788c51f28235a7228dc2ab3f107b4b16ab0845a3e595c8c51e50a7a + md5: c21f1b4a3a30bbc3ef35a50957578e0e + depends: + - vc14_runtime >=14.40.33810 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 17395 + timestamp: 1717709043353 +- kind: conda + name: xz + version: 5.2.6 + build: h166bdaf_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 + sha256: 03a6d28ded42af8a347345f82f3eebdd6807a08526d47899a42d62d319609162 + md5: 2161070d867d1b1204ea749c8eec4ef0 + depends: + - libgcc-ng >=12 + license: LGPL-2.1 and GPL-2.0 + purls: [] + size: 418368 + timestamp: 1660346797927 +- kind: conda + name: xz + version: 5.2.6 + build: h57fd34a_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 + sha256: 59d78af0c3e071021cfe82dc40134c19dab8cdf804324b62940f5c8cd71803ec + md5: 39c6b54e94014701dd157f4f576ed211 + license: LGPL-2.1 and GPL-2.0 + purls: [] + size: 235693 + timestamp: 1660346961024 +- kind: conda + name: xz + version: 5.2.6 + build: h775f41a_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 + sha256: eb09823f34cc2dd663c0ec4ab13f246f45dcd52e5b8c47b9864361de5204a1c8 + md5: a72f9d4ea13d55d745ff1ed594747f10 + license: LGPL-2.1 and GPL-2.0 + purls: [] + size: 238119 + timestamp: 1660346964847 +- kind: conda + name: xz + version: 5.2.6 + build: h8d14728_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 + sha256: 
54d9778f75a02723784dc63aff4126ff6e6749ba21d11a6d03c1f4775f269fe0 + md5: 515d77642eaa3639413c6b1bc3f94219 + depends: + - vc >=14.1,<15 + - vs2015_runtime >=14.16.27033 + license: LGPL-2.1 and GPL-2.0 + purls: [] + size: 217804 + timestamp: 1660346976440 diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index b36f56bda03..bdd73b09522 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -1,269 +1,263 @@ from typing import Any -from pyarrow._hdfsio import ( - HdfsFile as HdfsFile, - have_libhdfs as have_libhdfs, -) -from pyarrow.ipc import ( - Message as Message, - MessageReader as MessageReader, - MetadataVersion as MetadataVersion, - RecordBatchFileReader as RecordBatchFileReader, - RecordBatchFileWriter as RecordBatchFileWriter, - RecordBatchStreamReader as RecordBatchStreamReader, - RecordBatchStreamWriter as RecordBatchStreamWriter, - deserialize_pandas as deserialize_pandas, - serialize_pandas as serialize_pandas, -) -from pyarrow.lib import ( - NA as NA, - Array as Array, - ArrowCancelled as ArrowCancelled, - ArrowCapacityError as ArrowCapacityError, - ArrowException as ArrowException, - ArrowIndexError as ArrowIndexError, - ArrowInvalid as ArrowInvalid, - ArrowIOError as ArrowIOError, - ArrowKeyError as ArrowKeyError, - ArrowMemoryError as ArrowMemoryError, - ArrowNotImplementedError as ArrowNotImplementedError, - ArrowSerializationError as ArrowSerializationError, - ArrowTypeError as ArrowTypeError, - BaseExtensionType as BaseExtensionType, - BinaryArray as BinaryArray, - BinaryScalar as BinaryScalar, - BooleanArray as BooleanArray, - BooleanScalar as BooleanScalar, - Buffer as Buffer, - BufferedInputStream as BufferedInputStream, - BufferedOutputStream as BufferedOutputStream, - BufferOutputStream as BufferOutputStream, - BufferReader as BufferReader, - BuildInfo as BuildInfo, - ChunkedArray as ChunkedArray, - Codec as Codec, - CompressedInputStream as CompressedInputStream, - CompressedOutputStream as CompressedOutputStream, - DataType as DataType, - Date32Array as Date32Array, - Date32Scalar as Date32Scalar, - Date64Array as Date64Array, - Date64Scalar as Date64Scalar, - Decimal128Array as Decimal128Array, - Decimal128Scalar as Decimal128Scalar, - Decimal128Type as Decimal128Type, - Decimal256Array as Decimal256Array, - Decimal256Scalar as Decimal256Scalar, - Decimal256Type as Decimal256Type, - DenseUnionType as DenseUnionType, - DeserializationCallbackError as DeserializationCallbackError, - DictionaryArray as DictionaryArray, - DictionaryMemo as DictionaryMemo, - DictionaryScalar as DictionaryScalar, - DictionaryType as DictionaryType, - DoubleScalar as DoubleScalar, - DurationArray as DurationArray, - DurationScalar as DurationScalar, - DurationType as DurationType, - ExtensionArray as ExtensionArray, - ExtensionScalar as ExtensionScalar, - ExtensionType as ExtensionType, - Field as Field, - FixedSizeBinaryArray as FixedSizeBinaryArray, - FixedSizeBinaryScalar as FixedSizeBinaryScalar, - FixedSizeBinaryType as FixedSizeBinaryType, - FixedSizeBufferWriter as FixedSizeBufferWriter, - FixedSizeListArray as FixedSizeListArray, - FixedSizeListScalar as FixedSizeListScalar, - FixedSizeListType as FixedSizeListType, - FloatingPointArray as FloatingPointArray, - FloatScalar as FloatScalar, - HalfFloatScalar as HalfFloatScalar, - Int8Array as Int8Array, - Int8Scalar as Int8Scalar, - Int16Array as Int16Array, - Int16Scalar as Int16Scalar, - Int32Array as Int32Array, - Int32Scalar as Int32Scalar, - Int64Array as Int64Array, - Int64Scalar as 
Int64Scalar, - IntegerArray as IntegerArray, - KeyValueMetadata as KeyValueMetadata, - LargeBinaryArray as LargeBinaryArray, - LargeBinaryScalar as LargeBinaryScalar, - LargeListArray as LargeListArray, - LargeListScalar as LargeListScalar, - LargeListType as LargeListType, - LargeStringArray as LargeStringArray, - LargeStringScalar as LargeStringScalar, - ListArray as ListArray, - ListScalar as ListScalar, - ListType as ListType, - LoggingMemoryPool as LoggingMemoryPool, - MapArray as MapArray, - MapScalar as MapScalar, - MapType as MapType, - MemoryMappedFile as MemoryMappedFile, - MemoryPool as MemoryPool, - MockOutputStream as MockOutputStream, - MonthDayNano as MonthDayNano, - MonthDayNanoIntervalArray as MonthDayNanoIntervalArray, - MonthDayNanoIntervalScalar as MonthDayNanoIntervalScalar, - NativeFile as NativeFile, - NullArray as NullArray, - NullScalar as NullScalar, - NumericArray as NumericArray, - OSFile as OSFile, - ProxyMemoryPool as ProxyMemoryPool, - PyExtensionType as PyExtensionType, - PythonFile as PythonFile, - RecordBatch as RecordBatch, - RecordBatchReader as RecordBatchReader, - ResizableBuffer as ResizableBuffer, - RuntimeInfo as RuntimeInfo, - Scalar as Scalar, - Schema as Schema, - SerializationCallbackError as SerializationCallbackError, - SparseCOOTensor as SparseCOOTensor, - SparseCSCMatrix as SparseCSCMatrix, - SparseCSFTensor as SparseCSFTensor, - SparseCSRMatrix as SparseCSRMatrix, - SparseUnionType as SparseUnionType, - StringArray as StringArray, - StringScalar as StringScalar, - StructArray as StructArray, - StructScalar as StructScalar, - StructType as StructType, - Table as Table, - TableGroupBy as TableGroupBy, - Tensor as Tensor, - Time32Array as Time32Array, - Time32Scalar as Time32Scalar, - Time32Type as Time32Type, - Time64Array as Time64Array, - Time64Scalar as Time64Scalar, - Time64Type as Time64Type, - TimestampArray as TimestampArray, - TimestampScalar as TimestampScalar, - TimestampType as TimestampType, - TransformInputStream as TransformInputStream, - UInt8Array as UInt8Array, - UInt8Scalar as UInt8Scalar, - UInt16Array as UInt16Array, - UInt16Scalar as UInt16Scalar, - UInt32Array as UInt32Array, - UInt32Scalar as UInt32Scalar, - UInt64Array as UInt64Array, - UInt64Scalar as UInt64Scalar, - UnionArray as UnionArray, - UnionScalar as UnionScalar, - UnionType as UnionType, - UnknownExtensionType as UnknownExtensionType, - VersionInfo as VersionInfo, - allocate_buffer as allocate_buffer, - array as array, - binary as binary, - bool_ as bool_, - chunked_array as chunked_array, - compress as compress, - concat_arrays as concat_arrays, - concat_tables as concat_tables, - cpp_build_info as cpp_build_info, - cpp_version as cpp_version, - cpp_version_info as cpp_version_info, - cpu_count as cpu_count, - create_memory_map as create_memory_map, - date32 as date32, - date64 as date64, - decimal128 as decimal128, - decimal256 as decimal256, - decompress as decompress, - default_memory_pool as default_memory_pool, - dense_union as dense_union, - deserialize as deserialize, - deserialize_components as deserialize_components, - deserialize_from as deserialize_from, - dictionary as dictionary, - duration as duration, - enable_signal_handlers as enable_signal_handlers, - field as field, - float16 as float16, - float32 as float32, - float64 as float64, - foreign_buffer as foreign_buffer, - from_numpy_dtype as from_numpy_dtype, - infer_type as infer_type, - input_stream as input_stream, - int8 as int8, - int16 as int16, - int32 as int32, - int64 as int64, - 
io_thread_count as io_thread_count, - jemalloc_memory_pool as jemalloc_memory_pool, - jemalloc_set_decay_ms as jemalloc_set_decay_ms, - large_binary as large_binary, - large_list as large_list, - large_string as large_string, - large_utf8 as large_utf8, - list_ as list_, - log_memory_allocations as log_memory_allocations, - logging_memory_pool as logging_memory_pool, - map_ as map_, - memory_map as memory_map, - mimalloc_memory_pool as mimalloc_memory_pool, - month_day_nano_interval as month_day_nano_interval, - null as null, - nulls as nulls, - output_stream as output_stream, - proxy_memory_pool as proxy_memory_pool, - py_buffer as py_buffer, - read_serialized as read_serialized, - record_batch as record_batch, - register_extension_type as register_extension_type, - repeat as repeat, - runtime_info as runtime_info, - scalar as scalar, - schema as schema, - serialize as serialize, - serialize_to as serialize_to, - set_cpu_count as set_cpu_count, - set_io_thread_count as set_io_thread_count, - set_memory_pool as set_memory_pool, - sparse_union as sparse_union, - string as string, - struct as struct, - supported_memory_backends as supported_memory_backends, - system_memory_pool as system_memory_pool, - table as table, - time32 as time32, - time64 as time64, - timestamp as timestamp, - total_allocated_bytes as total_allocated_bytes, - transcoding_input_stream as transcoding_input_stream, - type_for_alias as type_for_alias, - uint8 as uint8, - uint16 as uint16, - uint32 as uint32, - uint64 as uint64, - unify_schemas as unify_schemas, - union as union, - unregister_extension_type as unregister_extension_type, - utf8 as utf8, -) +from pyarrow._hdfsio import HdfsFile as HdfsFile +from pyarrow._hdfsio import have_libhdfs as have_libhdfs +from pyarrow.ipc import Message as Message +from pyarrow.ipc import MessageReader as MessageReader +from pyarrow.ipc import MetadataVersion as MetadataVersion +from pyarrow.ipc import RecordBatchFileReader as RecordBatchFileReader +from pyarrow.ipc import RecordBatchFileWriter as RecordBatchFileWriter +from pyarrow.ipc import RecordBatchStreamReader as RecordBatchStreamReader +from pyarrow.ipc import RecordBatchStreamWriter as RecordBatchStreamWriter +from pyarrow.ipc import deserialize_pandas as deserialize_pandas +from pyarrow.ipc import serialize_pandas as serialize_pandas +from pyarrow.lib import NA as NA +from pyarrow.lib import Array as Array +from pyarrow.lib import ArrowCancelled as ArrowCancelled +from pyarrow.lib import ArrowCapacityError as ArrowCapacityError +from pyarrow.lib import ArrowException as ArrowException +from pyarrow.lib import ArrowIndexError as ArrowIndexError +from pyarrow.lib import ArrowInvalid as ArrowInvalid +from pyarrow.lib import ArrowIOError as ArrowIOError +from pyarrow.lib import ArrowKeyError as ArrowKeyError +from pyarrow.lib import ArrowMemoryError as ArrowMemoryError +from pyarrow.lib import ArrowNotImplementedError as ArrowNotImplementedError +from pyarrow.lib import ArrowSerializationError as ArrowSerializationError +from pyarrow.lib import ArrowTypeError as ArrowTypeError +from pyarrow.lib import BaseExtensionType as BaseExtensionType +from pyarrow.lib import BinaryArray as BinaryArray +from pyarrow.lib import BinaryScalar as BinaryScalar +from pyarrow.lib import BooleanArray as BooleanArray +from pyarrow.lib import BooleanScalar as BooleanScalar +from pyarrow.lib import Buffer as Buffer +from pyarrow.lib import BufferedInputStream as BufferedInputStream +from pyarrow.lib import BufferedOutputStream as 
BufferedOutputStream +from pyarrow.lib import BufferOutputStream as BufferOutputStream +from pyarrow.lib import BufferReader as BufferReader +from pyarrow.lib import BuildInfo as BuildInfo +from pyarrow.lib import ChunkedArray as ChunkedArray +from pyarrow.lib import Codec as Codec +from pyarrow.lib import CompressedInputStream as CompressedInputStream +from pyarrow.lib import CompressedOutputStream as CompressedOutputStream +from pyarrow.lib import DataType as DataType +from pyarrow.lib import Date32Array as Date32Array +from pyarrow.lib import Date32Scalar as Date32Scalar +from pyarrow.lib import Date64Array as Date64Array +from pyarrow.lib import Date64Scalar as Date64Scalar +from pyarrow.lib import Decimal128Array as Decimal128Array +from pyarrow.lib import Decimal128Scalar as Decimal128Scalar +from pyarrow.lib import Decimal128Type as Decimal128Type +from pyarrow.lib import Decimal256Array as Decimal256Array +from pyarrow.lib import Decimal256Scalar as Decimal256Scalar +from pyarrow.lib import Decimal256Type as Decimal256Type +from pyarrow.lib import DenseUnionType as DenseUnionType +from pyarrow.lib import DeserializationCallbackError as DeserializationCallbackError +from pyarrow.lib import DictionaryArray as DictionaryArray +from pyarrow.lib import DictionaryMemo as DictionaryMemo +from pyarrow.lib import DictionaryScalar as DictionaryScalar +from pyarrow.lib import DictionaryType as DictionaryType +from pyarrow.lib import DoubleScalar as DoubleScalar +from pyarrow.lib import DurationArray as DurationArray +from pyarrow.lib import DurationScalar as DurationScalar +from pyarrow.lib import DurationType as DurationType +from pyarrow.lib import ExtensionArray as ExtensionArray +from pyarrow.lib import ExtensionScalar as ExtensionScalar +from pyarrow.lib import ExtensionType as ExtensionType +from pyarrow.lib import Field as Field +from pyarrow.lib import FixedSizeBinaryArray as FixedSizeBinaryArray +from pyarrow.lib import FixedSizeBinaryScalar as FixedSizeBinaryScalar +from pyarrow.lib import FixedSizeBinaryType as FixedSizeBinaryType +from pyarrow.lib import FixedSizeBufferWriter as FixedSizeBufferWriter +from pyarrow.lib import FixedSizeListArray as FixedSizeListArray +from pyarrow.lib import FixedSizeListScalar as FixedSizeListScalar +from pyarrow.lib import FixedSizeListType as FixedSizeListType +from pyarrow.lib import FloatingPointArray as FloatingPointArray +from pyarrow.lib import FloatScalar as FloatScalar +from pyarrow.lib import HalfFloatScalar as HalfFloatScalar +from pyarrow.lib import Int8Array as Int8Array +from pyarrow.lib import Int8Scalar as Int8Scalar +from pyarrow.lib import Int16Array as Int16Array +from pyarrow.lib import Int16Scalar as Int16Scalar +from pyarrow.lib import Int32Array as Int32Array +from pyarrow.lib import Int32Scalar as Int32Scalar +from pyarrow.lib import Int64Array as Int64Array +from pyarrow.lib import Int64Scalar as Int64Scalar +from pyarrow.lib import IntegerArray as IntegerArray +from pyarrow.lib import KeyValueMetadata as KeyValueMetadata +from pyarrow.lib import LargeBinaryArray as LargeBinaryArray +from pyarrow.lib import LargeBinaryScalar as LargeBinaryScalar +from pyarrow.lib import LargeListArray as LargeListArray +from pyarrow.lib import LargeListScalar as LargeListScalar +from pyarrow.lib import LargeListType as LargeListType +from pyarrow.lib import LargeStringArray as LargeStringArray +from pyarrow.lib import LargeStringScalar as LargeStringScalar +from pyarrow.lib import ListArray as ListArray +from pyarrow.lib import ListScalar 
as ListScalar +from pyarrow.lib import ListType as ListType +from pyarrow.lib import LoggingMemoryPool as LoggingMemoryPool +from pyarrow.lib import MapArray as MapArray +from pyarrow.lib import MapScalar as MapScalar +from pyarrow.lib import MapType as MapType +from pyarrow.lib import MemoryMappedFile as MemoryMappedFile +from pyarrow.lib import MemoryPool as MemoryPool +from pyarrow.lib import MockOutputStream as MockOutputStream +from pyarrow.lib import MonthDayNano as MonthDayNano +from pyarrow.lib import MonthDayNanoIntervalArray as MonthDayNanoIntervalArray +from pyarrow.lib import MonthDayNanoIntervalScalar as MonthDayNanoIntervalScalar +from pyarrow.lib import NativeFile as NativeFile +from pyarrow.lib import NullArray as NullArray +from pyarrow.lib import NullScalar as NullScalar +from pyarrow.lib import NumericArray as NumericArray +from pyarrow.lib import OSFile as OSFile +from pyarrow.lib import ProxyMemoryPool as ProxyMemoryPool +from pyarrow.lib import PyExtensionType as PyExtensionType +from pyarrow.lib import PythonFile as PythonFile +from pyarrow.lib import RecordBatch as RecordBatch +from pyarrow.lib import RecordBatchReader as RecordBatchReader +from pyarrow.lib import ResizableBuffer as ResizableBuffer +from pyarrow.lib import RuntimeInfo as RuntimeInfo +from pyarrow.lib import Scalar as Scalar +from pyarrow.lib import Schema as Schema +from pyarrow.lib import SerializationCallbackError as SerializationCallbackError +from pyarrow.lib import SparseCOOTensor as SparseCOOTensor +from pyarrow.lib import SparseCSCMatrix as SparseCSCMatrix +from pyarrow.lib import SparseCSFTensor as SparseCSFTensor +from pyarrow.lib import SparseCSRMatrix as SparseCSRMatrix +from pyarrow.lib import SparseUnionType as SparseUnionType +from pyarrow.lib import StringArray as StringArray +from pyarrow.lib import StringScalar as StringScalar +from pyarrow.lib import StructArray as StructArray +from pyarrow.lib import StructScalar as StructScalar +from pyarrow.lib import StructType as StructType +from pyarrow.lib import Table as Table +from pyarrow.lib import TableGroupBy as TableGroupBy +from pyarrow.lib import Tensor as Tensor +from pyarrow.lib import Time32Array as Time32Array +from pyarrow.lib import Time32Scalar as Time32Scalar +from pyarrow.lib import Time32Type as Time32Type +from pyarrow.lib import Time64Array as Time64Array +from pyarrow.lib import Time64Scalar as Time64Scalar +from pyarrow.lib import Time64Type as Time64Type +from pyarrow.lib import TimestampArray as TimestampArray +from pyarrow.lib import TimestampScalar as TimestampScalar +from pyarrow.lib import TimestampType as TimestampType +from pyarrow.lib import TransformInputStream as TransformInputStream +from pyarrow.lib import UInt8Array as UInt8Array +from pyarrow.lib import UInt8Scalar as UInt8Scalar +from pyarrow.lib import UInt16Array as UInt16Array +from pyarrow.lib import UInt16Scalar as UInt16Scalar +from pyarrow.lib import UInt32Array as UInt32Array +from pyarrow.lib import UInt32Scalar as UInt32Scalar +from pyarrow.lib import UInt64Array as UInt64Array +from pyarrow.lib import UInt64Scalar as UInt64Scalar +from pyarrow.lib import UnionArray as UnionArray +from pyarrow.lib import UnionScalar as UnionScalar +from pyarrow.lib import UnionType as UnionType +from pyarrow.lib import UnknownExtensionType as UnknownExtensionType +from pyarrow.lib import VersionInfo as VersionInfo +from pyarrow.lib import allocate_buffer as allocate_buffer +from pyarrow.lib import array as array +from pyarrow.lib import binary as binary +from 
pyarrow.lib import bool_ as bool_ +from pyarrow.lib import chunked_array as chunked_array +from pyarrow.lib import compress as compress +from pyarrow.lib import concat_arrays as concat_arrays +from pyarrow.lib import concat_tables as concat_tables +from pyarrow.lib import cpp_build_info as cpp_build_info +from pyarrow.lib import cpp_version as cpp_version +from pyarrow.lib import cpp_version_info as cpp_version_info +from pyarrow.lib import cpu_count as cpu_count +from pyarrow.lib import create_memory_map as create_memory_map +from pyarrow.lib import date32 as date32 +from pyarrow.lib import date64 as date64 +from pyarrow.lib import decimal128 as decimal128 +from pyarrow.lib import decimal256 as decimal256 +from pyarrow.lib import decompress as decompress +from pyarrow.lib import default_memory_pool as default_memory_pool +from pyarrow.lib import dense_union as dense_union +from pyarrow.lib import deserialize as deserialize +from pyarrow.lib import deserialize_components as deserialize_components +from pyarrow.lib import deserialize_from as deserialize_from +from pyarrow.lib import dictionary as dictionary +from pyarrow.lib import duration as duration +from pyarrow.lib import enable_signal_handlers as enable_signal_handlers +from pyarrow.lib import field as field +from pyarrow.lib import float16 as float16 +from pyarrow.lib import float32 as float32 +from pyarrow.lib import float64 as float64 +from pyarrow.lib import foreign_buffer as foreign_buffer +from pyarrow.lib import from_numpy_dtype as from_numpy_dtype +from pyarrow.lib import infer_type as infer_type +from pyarrow.lib import input_stream as input_stream +from pyarrow.lib import int8 as int8 +from pyarrow.lib import int16 as int16 +from pyarrow.lib import int32 as int32 +from pyarrow.lib import int64 as int64 +from pyarrow.lib import io_thread_count as io_thread_count +from pyarrow.lib import jemalloc_memory_pool as jemalloc_memory_pool +from pyarrow.lib import jemalloc_set_decay_ms as jemalloc_set_decay_ms +from pyarrow.lib import large_binary as large_binary +from pyarrow.lib import large_list as large_list +from pyarrow.lib import large_string as large_string +from pyarrow.lib import large_utf8 as large_utf8 +from pyarrow.lib import list_ as list_ +from pyarrow.lib import log_memory_allocations as log_memory_allocations +from pyarrow.lib import logging_memory_pool as logging_memory_pool +from pyarrow.lib import map_ as map_ +from pyarrow.lib import memory_map as memory_map +from pyarrow.lib import mimalloc_memory_pool as mimalloc_memory_pool +from pyarrow.lib import month_day_nano_interval as month_day_nano_interval +from pyarrow.lib import null as null +from pyarrow.lib import nulls as nulls +from pyarrow.lib import output_stream as output_stream +from pyarrow.lib import proxy_memory_pool as proxy_memory_pool +from pyarrow.lib import py_buffer as py_buffer +from pyarrow.lib import read_serialized as read_serialized +from pyarrow.lib import record_batch as record_batch +from pyarrow.lib import register_extension_type as register_extension_type +from pyarrow.lib import repeat as repeat +from pyarrow.lib import runtime_info as runtime_info +from pyarrow.lib import scalar as scalar +from pyarrow.lib import schema as schema +from pyarrow.lib import serialize as serialize +from pyarrow.lib import serialize_to as serialize_to +from pyarrow.lib import set_cpu_count as set_cpu_count +from pyarrow.lib import set_io_thread_count as set_io_thread_count +from pyarrow.lib import set_memory_pool as set_memory_pool +from pyarrow.lib import 
sparse_union as sparse_union +from pyarrow.lib import string as string +from pyarrow.lib import struct as struct +from pyarrow.lib import supported_memory_backends as supported_memory_backends +from pyarrow.lib import system_memory_pool as system_memory_pool +from pyarrow.lib import table as table +from pyarrow.lib import time32 as time32 +from pyarrow.lib import time64 as time64 +from pyarrow.lib import timestamp as timestamp +from pyarrow.lib import total_allocated_bytes as total_allocated_bytes +from pyarrow.lib import transcoding_input_stream as transcoding_input_stream +from pyarrow.lib import type_for_alias as type_for_alias +from pyarrow.lib import uint8 as uint8 +from pyarrow.lib import uint16 as uint16 +from pyarrow.lib import uint32 as uint32 +from pyarrow.lib import uint64 as uint64 +from pyarrow.lib import unify_schemas as unify_schemas +from pyarrow.lib import union as union +from pyarrow.lib import unregister_extension_type as unregister_extension_type +from pyarrow.lib import utf8 as utf8 +from pyarrow.serialization import default_serialization_context as default_serialization_context from pyarrow.serialization import ( - default_serialization_context as default_serialization_context, register_default_serialization_handlers as register_default_serialization_handlers, +) +from pyarrow.serialization import ( register_torch_serialization_handlers as register_torch_serialization_handlers, ) -from . import ( - filesystem as filesystem, - hdfs as hdfs, - ipc as ipc, - serialization as serialization, - types as types, - util as util, -) +from . import filesystem as filesystem +from . import hdfs as hdfs +from . import ipc as ipc +from . import serialization as serialization +from . import types as types +from . import util as util def show_versions() -> None: ... def show_info() -> None: ... diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index bfa582b79c2..0c957bea400 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -1,10 +1,9 @@ -from typing import ( - Any, - Callable, - ClassVar, -) +from typing import Any +from typing import Callable +from typing import ClassVar import pyarrow.lib + from typing_extensions import Literal namedtuple: Callable @@ -46,9 +45,7 @@ class CastOptions(_CastOptions): def unsafe(target_type: pyarrow.lib.DataType | None = ...) -> CastOptions: ... class CountOptions(_CountOptions): - def __init__( - self, mode: Literal["only_valid", "only_null", "all"] = ... - ) -> None: ... + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = ...) -> None: ... class CumulativeSumOptions(_CumulativeSumOptions): def __init__(self, start: float, *, skip_nulls: bool = ...) -> None: ... @@ -69,9 +66,7 @@ class ElementWiseAggregateOptions(_ElementWiseAggregateOptions): class Expression(pyarrow.lib._Weakrefable): def __init__(self) -> None: ... - def _call( - self, unicodefunction_name, listarguments, FunctionOptionsoptions=... - ) -> Any: ... + def _call(self, unicodefunction_name, listarguments, FunctionOptionsoptions=...) -> Any: ... @staticmethod def _deserialize(buffer: pyarrow.lib.Buffer) -> Expression: ... @staticmethod @@ -586,9 +581,7 @@ class _StructFieldOptions(FunctionOptions): class _TDigestOptions(FunctionOptions): def __init__(self, *args, **kwargs) -> None: ... - def _set_options( - self, quantiles, delta, buffer_size, skip_nulls, min_count - ) -> Any: ... + def _set_options(self, quantiles, delta, buffer_size, skip_nulls, min_count) -> Any: ... def __reduce__(self) -> Any: ... 
def __setstate__(self, state) -> Any: ... @@ -672,7 +665,5 @@ def frombytes(*args, **kwargs) -> Any: ... def function_registry() -> Any: ... def get_function(name) -> Any: ... def list_functions() -> Any: ... -def register_scalar_function( - func, function_name, function_doc, in_types, out_type -) -> Any: ... +def register_scalar_function(func, function_name, function_doc, in_types, out_type) -> Any: ... def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 950061fc1b9..25faaa0ddf9 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -1,8 +1,6 @@ -from typing import ( - Any, - ClassVar, - overload, -) +from typing import Any +from typing import ClassVar +from typing import overload import pyarrow.lib @@ -171,6 +169,4 @@ def read_csv( @overload def read_csv(source) -> Any: ... def tobytes(o) -> Any: ... -def write_csv( - data, output_file, write_options=..., MemoryPoolmemory_pool=... -) -> Any: ... +def write_csv(data, output_file, write_options=..., MemoryPoolmemory_pool=...) -> Any: ... diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 03f11c1a8ac..4780a3457d7 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -1,9 +1,8 @@ import importlib._bootstrap # type: ignore -from typing import ( - Any, - ClassVar, - overload, -) + +from typing import Any +from typing import ClassVar +from typing import overload import pyarrow.lib @@ -105,9 +104,7 @@ class FileFormat(pyarrow.lib._Weakrefable): default_fragment_scan_options: Any def __init__(self, *args, **kwargs) -> None: ... def inspect(self, file, filesystem=...) -> Any: ... - def make_fragment( - self, file, filesystem=..., Expressionpartition_expression=... - ) -> Any: ... + def make_fragment(self, file, filesystem=..., Expressionpartition_expression=...) -> Any: ... def make_write_options(self) -> Any: ... def __eq__(self, other) -> Any: ... def __ge__(self, other) -> Any: ... diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi index 5a4dcaec22e..a22e63d6208 100644 --- a/pyarrow-stubs/_dataset_orc.pyi +++ b/pyarrow-stubs/_dataset_orc.pyi @@ -1,6 +1,4 @@ -from typing import ( - Any, -) +from typing import Any import pyarrow._dataset diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index e6b17d5b139..481bad265b6 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -1,7 +1,5 @@ -from typing import ( - Any, - ClassVar, -) +from typing import Any +from typing import ClassVar import pyarrow._dataset import pyarrow.lib @@ -42,9 +40,7 @@ class ParquetFileFragment(pyarrow._dataset.FileFragment): def __init__(self, *args, **kwargs) -> None: ... def ensure_complete_metadata(self) -> Any: ... def split_by_row_group(self, Expressionfilter=..., Schemaschema=...) -> Any: ... - def subset( - self, Expressionfilter=..., Schemaschema=..., row_group_ids=... - ) -> Any: ... + def subset(self, Expressionfilter=..., Schemaschema=..., row_group_ids=...) -> Any: ... def __reduce__(self) -> Any: ... class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): @@ -95,8 +91,6 @@ class RowGroupInfo: @property def total_byte_size(self) -> Any: ... -def __pyx_unpickle_ParquetReadOptions( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... +def __pyx_unpickle_ParquetReadOptions(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def frombytes(*args, **kwargs) -> Any: ... def tobytes(o) -> Any: ... 
diff --git a/pyarrow-stubs/_exec_plan.pyi b/pyarrow-stubs/_exec_plan.pyi index b06dadfab4b..f20551c1ecb 100644 --- a/pyarrow-stubs/_exec_plan.pyi +++ b/pyarrow-stubs/_exec_plan.pyi @@ -1,6 +1,4 @@ -from typing import ( - Any, -) +from typing import Any import pyarrow._dataset diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 01bb7b23857..f2e4c20944e 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -1,12 +1,12 @@ import enum import importlib._bootstrap # type: ignore import re -from typing import ( - Any, - ClassVar, -) + +from typing import Any +from typing import ClassVar import pyarrow.lib + from pyarrow.lib import Schema _FLIGHT_SERVER_ERROR_REGEX: re.Pattern @@ -124,9 +124,7 @@ class FlightClient(pyarrow.lib._Weakrefable): override_hostname=..., disable_server_verification=..., ) -> Any: ... - def do_action( - self, action, FlightCallOptionsoptions: FlightCallOptions = ... - ) -> Any: ... + def do_action(self, action, FlightCallOptionsoptions: FlightCallOptions = ...) -> Any: ... def do_exchange( self, FlightDescriptordescriptor: FlightDescriptor, @@ -151,9 +149,7 @@ class FlightClient(pyarrow.lib._Weakrefable): FlightDescriptordescriptor: FlightDescriptor, FlightCallOptionsoptions: FlightCallOptions = ..., ) -> Any: ... - def list_actions( - self, FlightCallOptionsoptions: FlightCallOptions = ... - ) -> Any: ... + def list_actions(self, FlightCallOptionsoptions: FlightCallOptions = ...) -> Any: ... def list_flights( self, bytescriteria: bytes = ..., @@ -531,9 +527,7 @@ class _FlightServerFinalizer(pyarrow.lib._Weakrefable): def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... -class _MetadataRecordBatchReader( - pyarrow.lib._Weakrefable, pyarrow.lib._ReadPandasMixin -): +class _MetadataRecordBatchReader(pyarrow.lib._Weakrefable, pyarrow.lib._ReadPandasMixin): schema: Any @classmethod def __init__(self, *args, **kwargs) -> None: ... @@ -560,49 +554,23 @@ class _ServerMiddlewareWrapper(ServerMiddleware): def __reduce__(self) -> Any: ... def __setstate__(self, state) -> Any: ... -def __pyx_unpickle_ClientAuthHandler( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_ClientMiddleware( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_ClientMiddlewareFactory( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightCancelledError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightDataStream( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... +def __pyx_unpickle_ClientAuthHandler(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_ClientMiddleware(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_ClientMiddlewareFactory(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightCancelledError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightDataStream(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def __pyx_unpickle_FlightError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightInternalError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightServerError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightTimedOutError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... 
+def __pyx_unpickle_FlightInternalError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightServerError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightTimedOutError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def __pyx_unpickle_FlightUnauthenticatedError( __pyx_type, long__pyx_checksum, __pyx_state ) -> Any: ... -def __pyx_unpickle_FlightUnauthorizedError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightUnavailableError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_ServerAuthHandler( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_ServerMiddleware( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_ServerMiddlewareFactory( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... +def __pyx_unpickle_FlightUnauthorizedError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_FlightUnavailableError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_ServerAuthHandler(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_ServerMiddleware(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle_ServerMiddlewareFactory(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def __pyx_unpickle_TracingServerMiddlewareFactory( __pyx_type, long__pyx_checksum, __pyx_state ) -> Any: ... diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 6042c6641f3..9f165e4121c 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -1,14 +1,13 @@ +import _abc # type: ignore import abc import datetime import enum import importlib._bootstrap # type: ignore -from typing import ( - Any, - Callable, - ClassVar, -) -import _abc # type: ignore +from typing import Any +from typing import Callable +from typing import ClassVar + import pyarrow.lib Directory: importlib._bootstrap.FileType @@ -61,14 +60,10 @@ class FileSystem(pyarrow.lib._Weakrefable): def get_file_info(self, paths_or_selector) -> Any: ... def move(self, src, dest) -> Any: ... def normalize_path(self, path) -> Any: ... - def open_append_stream( - self, path, compression=..., buffer_size=..., metadata=... - ) -> Any: ... + def open_append_stream(self, path, compression=..., buffer_size=..., metadata=...) -> Any: ... def open_input_file(self, path) -> Any: ... def open_input_stream(self, path, compression=..., buffer_size=...) -> Any: ... - def open_output_stream( - self, path, compression=..., buffer_size=..., metadata=... - ) -> Any: ... + def open_output_stream(self, path, compression=..., buffer_size=..., metadata=...) -> Any: ... def __eq__(self, other) -> Any: ... def __ge__(self, other) -> Any: ... def __gt__(self, other) -> Any: ... 
diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi index 8f1d95ebd6f..682f7e59c9b 100644 --- a/pyarrow-stubs/_gcsfs.pyi +++ b/pyarrow-stubs/_gcsfs.pyi @@ -1,9 +1,8 @@ import collections.abc import datetime -from typing import ( - Any, - ClassVar, -) + +from typing import Any +from typing import ClassVar import pyarrow._fs import pyarrow.lib diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index 39853b2f9ed..584285128eb 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -1,7 +1,5 @@ -from typing import ( - Any, - Callable, -) +from typing import Any +from typing import Callable import pyarrow._fs diff --git a/pyarrow-stubs/_hdfsio.pyi b/pyarrow-stubs/_hdfsio.pyi index bbaac6e66d2..367c727bf34 100644 --- a/pyarrow-stubs/_hdfsio.pyi +++ b/pyarrow-stubs/_hdfsio.pyi @@ -1,10 +1,10 @@ import re -from typing import ( - Any, - overload, -) + +from typing import Any +from typing import overload import pyarrow.lib + from typing_extensions import Literal _HDFS_PATH_RE: re.Pattern diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index 80a7ca77e18..9a11b8f3f15 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -1,7 +1,5 @@ -from typing import ( - Any, - ClassVar, -) +from typing import Any +from typing import ClassVar import pyarrow.lib diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 75a2120f14d..d1568982746 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -1,10 +1,9 @@ -from typing import ( - Any, - ClassVar, - Generator, -) +from typing import Any +from typing import ClassVar +from typing import Generator import pyarrow.lib + from typing_extensions import Literal _stringify_path: function @@ -34,9 +33,7 @@ class ColumnChunkMetaData(pyarrow.lib._Weakrefable): @property def compression( self, - ) -> Literal[ - "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO", "BROTLI", "LZ4", "ZSTD", "UNKNOWN" - ]: ... + ) -> Literal["UNCOMPRESSED", "SNAPPY", "GZIP", "LZO", "BROTLI", "LZ4", "ZSTD", "UNKNOWN"]: ... @property def encodings( self, @@ -200,9 +197,7 @@ class ParquetWriter(pyarrow.lib._Weakrefable): dictionary_pagesize_limit: int | None = ..., ) -> None: ... def close(self) -> None: ... - def write_table( - self, table: pyarrow.lib.Table, row_group_size: int | None = ... - ) -> None: ... + def write_table(self, table: pyarrow.lib.Table, row_group_size: int | None = ...) -> None: ... class RowGroupMetaData(pyarrow.lib._Weakrefable): __hash__: ClassVar[None] = ... # type: ignore diff --git a/pyarrow-stubs/_parquet_encryption.pyi b/pyarrow-stubs/_parquet_encryption.pyi index 0c654725d77..df0688c792d 100644 --- a/pyarrow-stubs/_parquet_encryption.pyi +++ b/pyarrow-stubs/_parquet_encryption.pyi @@ -1,8 +1,7 @@ import datetime -from typing import ( - Any, - ClassVar, -) + +from typing import Any +from typing import ClassVar import pyarrow.lib diff --git a/pyarrow-stubs/_plasma.pyi b/pyarrow-stubs/_plasma.pyi index a85a750cd52..f85704e3b4a 100644 --- a/pyarrow-stubs/_plasma.pyi +++ b/pyarrow-stubs/_plasma.pyi @@ -1,14 +1,12 @@ import socket -from typing import ( - Any, - overload, -) + +from typing import Any +from typing import overload import pyarrow.lib -from typing_extensions import ( - Literal, - TypedDict, -) + +from typing_extensions import Literal +from typing_extensions import TypedDict PLASMA_WAIT_TIMEOUT: int @@ -41,13 +39,9 @@ class PlasmaClient(pyarrow.lib._Weakrefable): def create( self, object_id: ObjectID, data_size: int, metadata: bytes = ... ) -> pyarrow.lib.Buffer: ... 
- def create_and_seal( - self, object_id: ObjectID, data: bytes, metadata: bytes = ... - ) -> None: ... + def create_and_seal(self, object_id: ObjectID, data: bytes, metadata: bytes = ...) -> None: ... def debug_string(self) -> str: ... - def decode_notifications( - self, buf: pyarrow.lib.Buffer - ) -> tuple[list[ObjectID], int, int]: ... + def decode_notifications(self, buf: pyarrow.lib.Buffer) -> tuple[list[ObjectID], int, int]: ... def delete(self, object_ids: list[ObjectID]) -> None: ... def disconnect(self) -> None: ... def evict(self, num_bytes: int) -> None: ... @@ -93,9 +87,7 @@ class PlasmaClient(pyarrow.lib._Weakrefable): memcopy_threads: int = ..., ) -> ObjectID: ... def seal(self, object_id: ObjectID) -> None: ... - def set_client_options( - self, client_name: str, limit_output_memory: int - ) -> None: ... + def set_client_options(self, client_name: str, limit_output_memory: int) -> None: ... def store_capacity(self) -> int: ... def subscribe(self) -> None: ... def to_capsule(self) -> Any: ... diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi index 2dc1018a535..66daeb5b4f9 100644 --- a/pyarrow-stubs/_s3fs.pyi +++ b/pyarrow-stubs/_s3fs.pyi @@ -1,9 +1,8 @@ import enum import importlib._bootstrap # type: ignore -from typing import ( - Any, - ClassVar, -) + +from typing import Any +from typing import ClassVar import pyarrow._fs import pyarrow.lib diff --git a/pyarrow-stubs/_substrait.pyi b/pyarrow-stubs/_substrait.pyi index 6ba7498b010..38bd13ad5e0 100644 --- a/pyarrow-stubs/_substrait.pyi +++ b/pyarrow-stubs/_substrait.pyi @@ -1,13 +1,9 @@ -from typing import ( - Callable, - NamedTuple, -) +from typing import Callable +from typing import NamedTuple -from pyarrow.lib import ( - Buffer, - RecordBatchReader, - Table, -) +from pyarrow.lib import Buffer +from pyarrow.lib import RecordBatchReader +from pyarrow.lib import Table def _parse_json_plan(plan: bytes) -> Buffer: ... def get_supported_functions() -> list[str]: ... 
diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index c12a65dac79..ea778d56549 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1,79 +1,75 @@ from typing import TypeVar from numpy.typing import ArrayLike -from pyarrow._compute import ( - ArraySortOptions as ArraySortOptions, - AssumeTimezoneOptions as AssumeTimezoneOptions, - CastOptions as CastOptions, - CountOptions as CountOptions, - CumulativeSumOptions as CumulativeSumOptions, - DayOfWeekOptions as DayOfWeekOptions, - DictionaryEncodeOptions as DictionaryEncodeOptions, - ElementWiseAggregateOptions as ElementWiseAggregateOptions, - Expression as Expression, - ExtractRegexOptions as ExtractRegexOptions, - FilterOptions as FilterOptions, - Function as Function, - FunctionOptions as FunctionOptions, - FunctionRegistry as FunctionRegistry, - HashAggregateFunction as HashAggregateFunction, - HashAggregateKernel as HashAggregateKernel, - IndexOptions as IndexOptions, - JoinOptions as JoinOptions, - Kernel as Kernel, - MakeStructOptions as MakeStructOptions, - MapLookupOptions as MapLookupOptions, - MatchSubstringOptions as MatchSubstringOptions, - ModeOptions as ModeOptions, - NullOptions as NullOptions, - PadOptions as PadOptions, - PartitionNthOptions as PartitionNthOptions, - QuantileOptions as QuantileOptions, - RandomOptions as RandomOptions, - RankOptions as RankOptions, - ReplaceSliceOptions as ReplaceSliceOptions, - ReplaceSubstringOptions as ReplaceSubstringOptions, - RoundOptions as RoundOptions, - RoundTemporalOptions as RoundTemporalOptions, - RoundToMultipleOptions as RoundToMultipleOptions, - ScalarAggregateFunction as ScalarAggregateFunction, - ScalarAggregateKernel as ScalarAggregateKernel, - ScalarAggregateOptions as ScalarAggregateOptions, - ScalarFunction as ScalarFunction, - ScalarKernel as ScalarKernel, - ScalarUdfContext as ScalarUdfContext, - SelectKOptions as SelectKOptions, - SetLookupOptions as SetLookupOptions, - SliceOptions as SliceOptions, - SortOptions as SortOptions, - SplitOptions as SplitOptions, - SplitPatternOptions as SplitPatternOptions, - StrftimeOptions as StrftimeOptions, - StrptimeOptions as StrptimeOptions, - StructFieldOptions as StructFieldOptions, - TakeOptions as TakeOptions, - TDigestOptions as TDigestOptions, - TrimOptions as TrimOptions, - Utf8NormalizeOptions as Utf8NormalizeOptions, - VarianceOptions as VarianceOptions, - VectorFunction as VectorFunction, - VectorKernel as VectorKernel, - WeekOptions as WeekOptions, - call_function as call_function, - function_registry as function_registry, - get_function as get_function, - list_functions as list_functions, - register_scalar_function as register_scalar_function, -) -from pyarrow.lib import ( - Array, - ChunkedArray, - DataType, - MemoryPool, - RecordBatch, - Scalar, - Table, -) +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import 
FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import ScalarUdfContext as ScalarUdfContext +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import call_function as call_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow.lib import Array +from pyarrow.lib import ChunkedArray +from pyarrow.lib import DataType +from pyarrow.lib import MemoryPool +from pyarrow.lib import RecordBatch +from 
pyarrow.lib import Scalar +from pyarrow.lib import Table from pyarrow.vendored import docscrape as docscrape def cast( diff --git a/pyarrow-stubs/csv.pyi b/pyarrow-stubs/csv.pyi index 84c34f87bc5..d1d481b9eaa 100644 --- a/pyarrow-stubs/csv.pyi +++ b/pyarrow-stubs/csv.pyi @@ -1,13 +1,11 @@ -from pyarrow._csv import ( - ISO8601 as ISO8601, - ConvertOptions as ConvertOptions, - CSVStreamingReader as CSVStreamingReader, - CSVWriter as CSVWriter, - InvalidRow as InvalidRow, - ParseOptions as ParseOptions, - ReadOptions as ReadOptions, - WriteOptions as WriteOptions, - open_csv as open_csv, - read_csv as read_csv, - write_csv as write_csv, -) +from pyarrow._csv import ISO8601 as ISO8601 +from pyarrow._csv import ConvertOptions as ConvertOptions +from pyarrow._csv import CSVStreamingReader as CSVStreamingReader +from pyarrow._csv import CSVWriter as CSVWriter +from pyarrow._csv import InvalidRow as InvalidRow +from pyarrow._csv import ParseOptions as ParseOptions +from pyarrow._csv import ReadOptions as ReadOptions +from pyarrow._csv import WriteOptions as WriteOptions +from pyarrow._csv import open_csv as open_csv +from pyarrow._csv import read_csv as read_csv +from pyarrow._csv import write_csv as write_csv diff --git a/pyarrow-stubs/cuda.pyi b/pyarrow-stubs/cuda.pyi index 8512e5a13e6..2fd7051ae40 100644 --- a/pyarrow-stubs/cuda.pyi +++ b/pyarrow-stubs/cuda.pyi @@ -1,12 +1,10 @@ -from pyarrow._cuda import ( - BufferReader as BufferReader, - BufferWriter as BufferWriter, - Context as Context, - CudaBuffer as CudaBuffer, - HostBuffer as HostBuffer, - IpcMemHandle as IpcMemHandle, - new_host_buffer as new_host_buffer, - read_message as read_message, - read_record_batch as read_record_batch, - serialize_record_batch as serialize_record_batch, -) +from pyarrow._cuda import BufferReader as BufferReader +from pyarrow._cuda import BufferWriter as BufferWriter +from pyarrow._cuda import Context as Context +from pyarrow._cuda import CudaBuffer as CudaBuffer +from pyarrow._cuda import HostBuffer as HostBuffer +from pyarrow._cuda import IpcMemHandle as IpcMemHandle +from pyarrow._cuda import new_host_buffer as new_host_buffer +from pyarrow._cuda import read_message as read_message +from pyarrow._cuda import read_record_batch as read_record_batch +from pyarrow._cuda import serialize_record_batch as serialize_record_batch diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 8b40492c7ec..453906a722d 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -1,62 +1,52 @@ from os import PathLike -from typing import ( - Callable, - Iterable, -) +from typing import Callable +from typing import Iterable -from pyarrow._dataset import ( - CsvFileFormat as CsvFileFormat, - CsvFragmentScanOptions as CsvFragmentScanOptions, - Dataset as Dataset, - DatasetFactory as DatasetFactory, - DirectoryPartitioning as DirectoryPartitioning, - FeatherFileFormat as FeatherFileFormat, - FileFormat as FileFormat, - FileFragment as FileFragment, - FilenamePartitioning as FilenamePartitioning, - FileSystemDataset as FileSystemDataset, - FileSystemDatasetFactory as FileSystemDatasetFactory, - FileSystemFactoryOptions as FileSystemFactoryOptions, - FileWriteOptions as FileWriteOptions, - Fragment as Fragment, - FragmentScanOptions as FragmentScanOptions, - HivePartitioning as HivePartitioning, - InMemoryDataset as InMemoryDataset, - IpcFileFormat as IpcFileFormat, - IpcFileWriteOptions as IpcFileWriteOptions, - Partitioning as Partitioning, - PartitioningFactory as PartitioningFactory, - Scanner as 
Scanner, - TaggedRecordBatch as TaggedRecordBatch, - UnionDataset as UnionDataset, - UnionDatasetFactory as UnionDatasetFactory, - WrittenFile as WrittenFile, -) +from pyarrow._dataset import CsvFileFormat as CsvFileFormat +from pyarrow._dataset import CsvFragmentScanOptions as CsvFragmentScanOptions +from pyarrow._dataset import Dataset as Dataset +from pyarrow._dataset import DatasetFactory as DatasetFactory +from pyarrow._dataset import DirectoryPartitioning as DirectoryPartitioning +from pyarrow._dataset import FeatherFileFormat as FeatherFileFormat +from pyarrow._dataset import FileFormat as FileFormat +from pyarrow._dataset import FileFragment as FileFragment +from pyarrow._dataset import FilenamePartitioning as FilenamePartitioning +from pyarrow._dataset import FileSystemDataset as FileSystemDataset +from pyarrow._dataset import FileSystemDatasetFactory as FileSystemDatasetFactory +from pyarrow._dataset import FileSystemFactoryOptions as FileSystemFactoryOptions +from pyarrow._dataset import FileWriteOptions as FileWriteOptions +from pyarrow._dataset import Fragment as Fragment +from pyarrow._dataset import FragmentScanOptions as FragmentScanOptions +from pyarrow._dataset import HivePartitioning as HivePartitioning +from pyarrow._dataset import InMemoryDataset as InMemoryDataset +from pyarrow._dataset import IpcFileFormat as IpcFileFormat +from pyarrow._dataset import IpcFileWriteOptions as IpcFileWriteOptions +from pyarrow._dataset import Partitioning as Partitioning +from pyarrow._dataset import PartitioningFactory as PartitioningFactory +from pyarrow._dataset import Scanner as Scanner +from pyarrow._dataset import TaggedRecordBatch as TaggedRecordBatch +from pyarrow._dataset import UnionDataset as UnionDataset +from pyarrow._dataset import UnionDatasetFactory as UnionDatasetFactory +from pyarrow._dataset import WrittenFile as WrittenFile from pyarrow._dataset_orc import OrcFileFormat as OrcFileFormat -from pyarrow._dataset_parquet import ( - ParquetDatasetFactory as ParquetDatasetFactory, - ParquetFactoryOptions as ParquetFactoryOptions, - ParquetFileFormat as ParquetFileFormat, - ParquetFileFragment as ParquetFileFragment, - ParquetFileWriteOptions as ParquetFileWriteOptions, - ParquetFragmentScanOptions as ParquetFragmentScanOptions, - ParquetReadOptions as ParquetReadOptions, - RowGroupInfo as RowGroupInfo, -) -from pyarrow.compute import ( - Expression as Expression, - field as field, - scalar as scalar, -) +from pyarrow._dataset_parquet import ParquetDatasetFactory as ParquetDatasetFactory +from pyarrow._dataset_parquet import ParquetFactoryOptions as ParquetFactoryOptions +from pyarrow._dataset_parquet import ParquetFileFormat as ParquetFileFormat +from pyarrow._dataset_parquet import ParquetFileFragment as ParquetFileFragment +from pyarrow._dataset_parquet import ParquetFileWriteOptions as ParquetFileWriteOptions +from pyarrow._dataset_parquet import ParquetFragmentScanOptions as ParquetFragmentScanOptions +from pyarrow._dataset_parquet import ParquetReadOptions as ParquetReadOptions +from pyarrow._dataset_parquet import RowGroupInfo as RowGroupInfo +from pyarrow.compute import Expression as Expression +from pyarrow.compute import field as field +from pyarrow.compute import scalar as scalar from pyarrow.dataset import Dataset from pyarrow.filesystem import FileSystem -from pyarrow.lib import ( - Array, - RecordBatch, - RecordBatchReader, - Schema, - Table, -) +from pyarrow.lib import Array +from pyarrow.lib import RecordBatch +from pyarrow.lib import RecordBatchReader 
+from pyarrow.lib import Schema +from pyarrow.lib import Table from typing_extensions import Literal def __getattr__(name: str) -> None: ... @@ -85,11 +75,7 @@ def dataset( ignore_prefixes: list[str] | None = ..., ) -> Dataset: ... def write_dataset( - data: Dataset - | Table - | RecordBatch - | RecordBatchReader - | Iterable[Table | RecordBatch], + data: Dataset | Table | RecordBatch | RecordBatchReader | Iterable[Table | RecordBatch], base_dir: str, *, basename_template: str | None = ..., @@ -106,8 +92,6 @@ def write_dataset( min_rows_per_group: int | None = ..., max_rows_per_group: int | None = ..., file_visitor: Callable[[WrittenFile], None] | None = ..., - existing_data_behavior: Literal[ - "error", "overwrite_or_ignore", "delete_matching" - ] = ..., + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = ..., create_dir: bool = ..., ) -> None: ... diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi index 480162706aa..280bfd58f32 100644 --- a/pyarrow-stubs/feather.pyi +++ b/pyarrow-stubs/feather.pyi @@ -2,16 +2,15 @@ from io import IOBase from typing import overload import pandas as pd + from pyarrow._feather import FeatherError as FeatherError -from pyarrow.lib import ( - ChunkedArray, - Codec as Codec, - NativeFile, - Schema, - Table as Table, - concat_tables as concat_tables, - schema as schema, -) +from pyarrow.lib import ChunkedArray +from pyarrow.lib import Codec as Codec +from pyarrow.lib import NativeFile +from pyarrow.lib import Schema +from pyarrow.lib import Table as Table +from pyarrow.lib import concat_tables as concat_tables +from pyarrow.lib import schema as schema from pyarrow.vendored.version import Version as Version from typing_extensions import Literal @@ -19,9 +18,7 @@ class FeatherDataset: paths: list[str] validate_schema: bool schema: Schema - def __init__( - self, path_or_paths: list[str], validate_schema: bool = ... - ) -> None: ... + def __init__(self, path_or_paths: list[str], validate_schema: bool = ...) -> None: ... def read_table(self, columns: list[str] | None = ...) -> Table: ... def validate_schemas(self, piece: str, table: Table) -> None: ... def read_pandas( diff --git a/pyarrow-stubs/filesystem.pyi b/pyarrow-stubs/filesystem.pyi index 41841ebec88..286ab48ceb7 100644 --- a/pyarrow-stubs/filesystem.pyi +++ b/pyarrow-stubs/filesystem.pyi @@ -1,10 +1,8 @@ from os import PathLike from typing import Generator -from pyarrow import ( - Table, - parquet, -) +from pyarrow import Table +from pyarrow import parquet from pyarrow._gcsfs import GcsFileSystem from pyarrow._s3fs import S3FileSystem @@ -38,9 +36,7 @@ class LocalFileSystem(FileSystem): def __init__(self) -> None: ... @classmethod def get_instance(cls) -> LocalFileSystem: ... - def walk( - self, path: str - ) -> Generator[tuple[str, list[str], list[str]], None, None]: ... + def walk(self, path: str) -> Generator[tuple[str, list[str], list[str]], None, None]: ... 
class DaskFileSystem(FileSystem): fs: S3FileSystem | GcsFileSystem diff --git a/pyarrow-stubs/flight.pyi b/pyarrow-stubs/flight.pyi index 557016ffdcd..5377495d085 100644 --- a/pyarrow-stubs/flight.pyi +++ b/pyarrow-stubs/flight.pyi @@ -1,47 +1,45 @@ -from pyarrow._flight import ( - Action as Action, - ActionType as ActionType, - BasicAuth as BasicAuth, - CallInfo as CallInfo, - CertKeyPair as CertKeyPair, - ClientAuthHandler as ClientAuthHandler, - ClientMiddleware as ClientMiddleware, - ClientMiddlewareFactory as ClientMiddlewareFactory, - DescriptorType as DescriptorType, - FlightCallOptions as FlightCallOptions, - FlightCancelledError as FlightCancelledError, - FlightClient as FlightClient, - FlightDataStream as FlightDataStream, - FlightDescriptor as FlightDescriptor, - FlightEndpoint as FlightEndpoint, - FlightError as FlightError, - FlightInfo as FlightInfo, - FlightInternalError as FlightInternalError, - FlightMetadataReader as FlightMetadataReader, - FlightMetadataWriter as FlightMetadataWriter, - FlightMethod as FlightMethod, - FlightServerBase as FlightServerBase, - FlightServerError as FlightServerError, - FlightStreamChunk as FlightStreamChunk, - FlightStreamReader as FlightStreamReader, - FlightStreamWriter as FlightStreamWriter, - FlightTimedOutError as FlightTimedOutError, - FlightUnauthenticatedError as FlightUnauthenticatedError, - FlightUnauthorizedError as FlightUnauthorizedError, - FlightUnavailableError as FlightUnavailableError, - FlightWriteSizeExceededError as FlightWriteSizeExceededError, - GeneratorStream as GeneratorStream, - Location as Location, - MetadataRecordBatchReader as MetadataRecordBatchReader, - MetadataRecordBatchWriter as MetadataRecordBatchWriter, - RecordBatchStream as RecordBatchStream, - Result as Result, - SchemaResult as SchemaResult, - ServerAuthHandler as ServerAuthHandler, - ServerCallContext as ServerCallContext, - ServerMiddleware as ServerMiddleware, - ServerMiddlewareFactory as ServerMiddlewareFactory, - Ticket as Ticket, - TracingServerMiddlewareFactory as TracingServerMiddlewareFactory, - connect as connect, -) +from pyarrow._flight import Action as Action +from pyarrow._flight import ActionType as ActionType +from pyarrow._flight import BasicAuth as BasicAuth +from pyarrow._flight import CallInfo as CallInfo +from pyarrow._flight import CertKeyPair as CertKeyPair +from pyarrow._flight import ClientAuthHandler as ClientAuthHandler +from pyarrow._flight import ClientMiddleware as ClientMiddleware +from pyarrow._flight import ClientMiddlewareFactory as ClientMiddlewareFactory +from pyarrow._flight import DescriptorType as DescriptorType +from pyarrow._flight import FlightCallOptions as FlightCallOptions +from pyarrow._flight import FlightCancelledError as FlightCancelledError +from pyarrow._flight import FlightClient as FlightClient +from pyarrow._flight import FlightDataStream as FlightDataStream +from pyarrow._flight import FlightDescriptor as FlightDescriptor +from pyarrow._flight import FlightEndpoint as FlightEndpoint +from pyarrow._flight import FlightError as FlightError +from pyarrow._flight import FlightInfo as FlightInfo +from pyarrow._flight import FlightInternalError as FlightInternalError +from pyarrow._flight import FlightMetadataReader as FlightMetadataReader +from pyarrow._flight import FlightMetadataWriter as FlightMetadataWriter +from pyarrow._flight import FlightMethod as FlightMethod +from pyarrow._flight import FlightServerBase as FlightServerBase +from pyarrow._flight import FlightServerError as FlightServerError 
+from pyarrow._flight import FlightStreamChunk as FlightStreamChunk +from pyarrow._flight import FlightStreamReader as FlightStreamReader +from pyarrow._flight import FlightStreamWriter as FlightStreamWriter +from pyarrow._flight import FlightTimedOutError as FlightTimedOutError +from pyarrow._flight import FlightUnauthenticatedError as FlightUnauthenticatedError +from pyarrow._flight import FlightUnauthorizedError as FlightUnauthorizedError +from pyarrow._flight import FlightUnavailableError as FlightUnavailableError +from pyarrow._flight import FlightWriteSizeExceededError as FlightWriteSizeExceededError +from pyarrow._flight import GeneratorStream as GeneratorStream +from pyarrow._flight import Location as Location +from pyarrow._flight import MetadataRecordBatchReader as MetadataRecordBatchReader +from pyarrow._flight import MetadataRecordBatchWriter as MetadataRecordBatchWriter +from pyarrow._flight import RecordBatchStream as RecordBatchStream +from pyarrow._flight import Result as Result +from pyarrow._flight import SchemaResult as SchemaResult +from pyarrow._flight import ServerAuthHandler as ServerAuthHandler +from pyarrow._flight import ServerCallContext as ServerCallContext +from pyarrow._flight import ServerMiddleware as ServerMiddleware +from pyarrow._flight import ServerMiddlewareFactory as ServerMiddlewareFactory +from pyarrow._flight import Ticket as Ticket +from pyarrow._flight import TracingServerMiddlewareFactory as TracingServerMiddlewareFactory +from pyarrow._flight import connect as connect diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi index 4a0c92e7d54..02e5e9cfd14 100644 --- a/pyarrow-stubs/fs.pyi +++ b/pyarrow-stubs/fs.pyi @@ -1,27 +1,23 @@ from _typeshed import Incomplete from pyarrow import PythonFile -from pyarrow._fs import ( - FileInfo as FileInfo, - FileSelector as FileSelector, - FileSystem as FileSystem, - FileSystemHandler as FileSystemHandler, - FileType as FileType, - LocalFileSystem as LocalFileSystem, - PyFileSystem as PyFileSystem, - SubTreeFileSystem as SubTreeFileSystem, -) +from pyarrow._fs import FileInfo as FileInfo +from pyarrow._fs import FileSelector as FileSelector +from pyarrow._fs import FileSystem as FileSystem +from pyarrow._fs import FileSystemHandler as FileSystemHandler +from pyarrow._fs import FileType as FileType +from pyarrow._fs import LocalFileSystem as LocalFileSystem +from pyarrow._fs import PyFileSystem as PyFileSystem +from pyarrow._fs import SubTreeFileSystem as SubTreeFileSystem from pyarrow._gcsfs import GcsFileSystem as GcsFileSystem from pyarrow._hdfs import HadoopFileSystem as HadoopFileSystem -from pyarrow._s3fs import ( - AwsDefaultS3RetryStrategy as AwsDefaultS3RetryStrategy, - AwsStandardS3RetryStrategy as AwsStandardS3RetryStrategy, - S3FileSystem as S3FileSystem, - S3LogLevel as S3LogLevel, - S3RetryStrategy as S3RetryStrategy, - finalize_s3 as finalize_s3, - initialize_s3 as initialize_s3, - resolve_s3_region as resolve_s3_region, -) +from pyarrow._s3fs import AwsDefaultS3RetryStrategy as AwsDefaultS3RetryStrategy +from pyarrow._s3fs import AwsStandardS3RetryStrategy as AwsStandardS3RetryStrategy +from pyarrow._s3fs import S3FileSystem as S3FileSystem +from pyarrow._s3fs import S3LogLevel as S3LogLevel +from pyarrow._s3fs import S3RetryStrategy as S3RetryStrategy +from pyarrow._s3fs import finalize_s3 as finalize_s3 +from pyarrow._s3fs import initialize_s3 as initialize_s3 +from pyarrow._s3fs import resolve_s3_region as resolve_s3_region FileStats = FileInfo diff --git a/pyarrow-stubs/hdfs.pyi 
b/pyarrow-stubs/hdfs.pyi index b2800777f09..1e7f18b798f 100644 --- a/pyarrow-stubs/hdfs.pyi +++ b/pyarrow-stubs/hdfs.pyi @@ -1,7 +1,8 @@ from collections.abc import Generator -from _typeshed import Incomplete import pyarrow._hdfsio as _hdfsio + +from _typeshed import Incomplete from pyarrow.filesystem import FileSystem as FileSystem from pyarrow.util import implements as implements @@ -16,9 +17,7 @@ class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): # type: ignore extra_conf: Incomplete | None = ..., ) -> None: ... def __reduce__(self) -> tuple: ... - def walk( - self, top_path: str - ) -> Generator[tuple[str, list[str], list[str]], None, None]: ... + def walk(self, top_path: str) -> Generator[tuple[str, list[str], list[str]], None, None]: ... def connect( host: str = ..., diff --git a/pyarrow-stubs/ipc.pyi b/pyarrow-stubs/ipc.pyi index bc424f734c5..095ad106e2e 100644 --- a/pyarrow-stubs/ipc.pyi +++ b/pyarrow-stubs/ipc.pyi @@ -1,29 +1,28 @@ from io import IOBase import pandas as pd -from pyarrow import ipc import pyarrow.lib as lib -from pyarrow.lib import ( - Buffer, - IpcReadOptions as IpcReadOptions, - IpcWriteOptions as IpcWriteOptions, - MemoryPool, - Message as Message, - MessageReader as MessageReader, - MetadataVersion as MetadataVersion, - NativeFile, - ReadStats as ReadStats, - RecordBatchReader as RecordBatchReader, - Schema, - WriteStats as WriteStats, - get_record_batch_size as get_record_batch_size, - get_tensor_size as get_tensor_size, - read_message as read_message, - read_record_batch as read_record_batch, - read_schema as read_schema, - read_tensor as read_tensor, - write_tensor as write_tensor, -) + +from pyarrow import ipc +from pyarrow.lib import Buffer +from pyarrow.lib import IpcReadOptions as IpcReadOptions +from pyarrow.lib import IpcWriteOptions as IpcWriteOptions +from pyarrow.lib import MemoryPool +from pyarrow.lib import Message as Message +from pyarrow.lib import MessageReader as MessageReader +from pyarrow.lib import MetadataVersion as MetadataVersion +from pyarrow.lib import NativeFile +from pyarrow.lib import ReadStats as ReadStats +from pyarrow.lib import RecordBatchReader as RecordBatchReader +from pyarrow.lib import Schema +from pyarrow.lib import WriteStats as WriteStats +from pyarrow.lib import get_record_batch_size as get_record_batch_size +from pyarrow.lib import get_tensor_size as get_tensor_size +from pyarrow.lib import read_message as read_message +from pyarrow.lib import read_record_batch as read_record_batch +from pyarrow.lib import read_schema as read_schema +from pyarrow.lib import read_tensor as read_tensor +from pyarrow.lib import write_tensor as write_tensor class RecordBatchStreamReader(lib._RecordBatchStreamReader): def __init__( @@ -97,6 +96,4 @@ def serialize_pandas( nthreads: int | None = ..., preserve_index: bool | None = ..., ) -> Buffer: ... -def deserialize_pandas( - buf: memoryview | Buffer, *, use_threads: bool = ... -) -> pd.DataFrame: ... +def deserialize_pandas(buf: memoryview | Buffer, *, use_threads: bool = ...) -> pd.DataFrame: ... 
diff --git a/pyarrow-stubs/json.pyi b/pyarrow-stubs/json.pyi index 6d83ce4f85c..59f0939f480 100644 --- a/pyarrow-stubs/json.pyi +++ b/pyarrow-stubs/json.pyi @@ -1,5 +1,3 @@ -from pyarrow._json import ( - ParseOptions as ParseOptions, - ReadOptions as ReadOptions, - read_json as read_json, -) +from pyarrow._json import ParseOptions as ParseOptions +from pyarrow._json import ReadOptions as ReadOptions +from pyarrow._json import read_json as read_json diff --git a/pyarrow-stubs/jvm.pyi b/pyarrow-stubs/jvm.pyi index 43ebf5b7845..02372c5baa0 100644 --- a/pyarrow-stubs/jvm.pyi +++ b/pyarrow-stubs/jvm.pyi @@ -1,11 +1,9 @@ from _typeshed import Incomplete -from pyarrow.lib import ( - Array, - Buffer, - Field, - RecordBatch, - Schema, -) +from pyarrow.lib import Array +from pyarrow.lib import Buffer +from pyarrow.lib import Field +from pyarrow.lib import RecordBatch +from pyarrow.lib import Schema class _JvmBufferNanny: ref_manager: Incomplete diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index 60d0b53aee3..b0a7e3600f4 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1,44 +1,38 @@ +import _io # type: ignore import collections.abc import datetime as dt -from decimal import Decimal import enum import importlib._bootstrap # type: ignore import io + +from decimal import Decimal from os import PathLike from types import ModuleType -from typing import ( - Any, - Callable, - ClassVar, - Generator, - Generic, - ItemsView, - Iterable, - KeysView, - NamedTuple, - TypeVar, - ValuesView, - overload, -) -from typing_extensions import Buffer as _Buffer +from typing import Any +from typing import Callable +from typing import ClassVar +from typing import Generator +from typing import Generic +from typing import ItemsView +from typing import Iterable +from typing import KeysView +from typing import NamedTuple +from typing import TypeVar +from typing import ValuesView +from typing import overload -import _io # type: ignore import numpy as np -from numpy.typing import ( - ArrayLike, - DTypeLike, - NDArray, -) import pandas as pd -from pyarrow.compute import ( - CastOptions, - FunctionOptions, -) -from typing_extensions import ( - Literal, - TypeAlias, - TypeGuard, -) + +from numpy.typing import ArrayLike +from numpy.typing import DTypeLike +from numpy.typing import NDArray +from pyarrow.compute import CastOptions +from pyarrow.compute import FunctionOptions +from typing_extensions import Buffer as _Buffer +from typing_extensions import Literal +from typing_extensions import TypeAlias +from typing_extensions import TypeGuard _ArrowType: TypeAlias = int | DataType _builtin_slice = slice @@ -130,8 +124,7 @@ class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): def _to_pandas( self, options: dict[str, Any], - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype | None] - | None = ..., + types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype | None] | None = ..., **kwargs, ) -> pd.Series: ... def buffers(self) -> list[Buffer | None]: ... @@ -278,18 +271,14 @@ class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): @overload def cast( self, - target_type: Literal[ - "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" - ], + target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], safe: bool = ..., options: CastOptions = ..., ) -> TimestampArray: ... 
@overload def cast( self, - target_type: Literal[ - "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" - ], + target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], safe: bool = ..., options: CastOptions = ..., ) -> DurationArray: ... @@ -351,9 +340,7 @@ class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): def sum(self, **kwargs) -> Any: ... def take( self: _Array, - indices: list[int] - | IntegerArray - | NDArray[np.signedinteger | np.unsignedinteger], + indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], ) -> _Array: ... def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> NDArray: ... def to_pylist(self) -> list[_T]: ... @@ -389,9 +376,7 @@ class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): @overload def view(self, target_type: Literal["u8", "uint64"]) -> UInt64Array: ... @overload - def view( - self, target_type: Literal["f2", "halffloat", "float16"] - ) -> HalfFloatArray: ... + def view(self, target_type: Literal["f2", "halffloat", "float16"]) -> HalfFloatArray: ... @overload def view(self, target_type: Literal["f4", "float", "float32"]) -> FloatArray: ... @overload @@ -417,16 +402,12 @@ class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): @overload def view( self, - target_type: Literal[ - "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" - ], + target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], ) -> TimestampArray: ... @overload def view( self, - target_type: Literal[ - "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" - ], + target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], ) -> DurationArray: ... @overload def view( @@ -553,9 +534,7 @@ class ChunkedArray(_PandasConvertibleToSeries, Generic[_T, _Scalar]): def cast(self, target_type=..., safe=..., options=...) -> Any: ... def chunk(self, i: int) -> Array[_T, _Scalar]: ... def combine_chunks(self, memory_pool: MemoryPool | None = ...) -> Table: ... - def dictionary_encode( - self: _ChunkedArray, null_encoding: str = ... - ) -> _ChunkedArray: ... + def dictionary_encode(self: _ChunkedArray, null_encoding: str = ...) -> _ChunkedArray: ... def drop_null(self: _ChunkedArray) -> _ChunkedArray: ... def equals(self, other) -> bool: ... def fill_null(self: _ChunkedArray, fill_value: _T) -> _ChunkedArray: ... @@ -565,9 +544,7 @@ class ChunkedArray(_PandasConvertibleToSeries, Generic[_T, _Scalar]): *, null_selection_behavior: Literal["drop", "emit_null"] = ..., ) -> _ChunkedArray: ... - def flatten( - self: _ChunkedArray, memory_pool: MemoryPool | None = ... - ) -> _ChunkedArray: ... + def flatten(self: _ChunkedArray, memory_pool: MemoryPool | None = ...) -> _ChunkedArray: ... def format(self, **kwargs) -> str: ... def get_total_buffer_size(self) -> int: ... def index( @@ -587,9 +564,7 @@ class ChunkedArray(_PandasConvertibleToSeries, Generic[_T, _Scalar]): ) -> _ChunkedArray: ... def take( self: _ChunkedArray, - indices: list[int] - | IntegerArray - | NDArray[np.signedinteger | np.unsignedinteger], + indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], ) -> _ChunkedArray: ... def to_numpy(self) -> NDArray: ... def to_pylist(self) -> list[_T]: ... @@ -642,9 +617,7 @@ class Codec(_Weakrefable): asbytes: Literal[True] = ..., memory_pool: MemoryPool | None = ..., ) -> bytes: ... - def decompress( - self, buf, decompressed_size=..., asbytes=..., memory_pool=... - ) -> Any: ... 
+ def decompress(self, buf, decompressed_size=..., asbytes=..., memory_pool=...) -> Any: ... @staticmethod def default_compression_level(compression: _COMPRESSION) -> int: ... @staticmethod @@ -769,9 +742,7 @@ class ExtensionArray(Array, Generic[_T, _Scalar, _StorageArray]): class ExtensionScalar(Scalar[_T]): value: Scalar[_T] @staticmethod - def from_storage( - self, typ: BaseExtensionType[_T], value: object - ) -> ExtensionScalar[_T]: ... + def from_storage(self, typ: BaseExtensionType[_T], value: object) -> ExtensionScalar[_T]: ... class ExtensionType(BaseExtensionType[_T]): def __init__(self, storage_type: DataType[_T], extension_name: str) -> None: ... @@ -979,9 +950,7 @@ class LoggingMemoryPool(MemoryPool): ... _Key = TypeVar("_Key") _Item = TypeVar("_Item") -class MapArray( - ListArray[dict[_Key, _Item], MapScalar, StructArray], Generic[_Key, _Item] -): +class MapArray(ListArray[dict[_Key, _Item], MapScalar, StructArray], Generic[_Key, _Item]): items: Array[_Item, Scalar[_Item]] keys: Array[_Key, Scalar[_Key]] @@ -1020,9 +989,7 @@ class Message(_Weakrefable): metadata_version: MetadataVersion type: str def equals(self, other: Message) -> bool: ... - def serialize( - self, alignment: int = ..., memory_pool: MemoryPool | None = ... - ) -> Any: ... + def serialize(self, alignment: int = ..., memory_pool: MemoryPool | None = ...) -> Any: ... def serialize_to( self, sink: NativeFile, @@ -1182,9 +1149,7 @@ class RecordBatch(_PandasConvertibleToDataFrame): def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... def take( self: _Self, - indices: list[int] - | IntegerArray - | NDArray[np.signedinteger | np.unsignedinteger], + indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], ) -> _Self: ... def to_pydict(self) -> dict[str, list]: ... def to_pylist(self) -> list[dict[str, Any]]: ... @@ -1206,9 +1171,7 @@ class RecordBatchReader(_Weakrefable): def _import_from_c(in_ptr: int) -> RecordBatchReader: ... def close(self) -> None: ... @staticmethod - def from_batches( - schema: Schema, batches: Iterable[RecordBatch] - ) -> RecordBatchReader: ... + def from_batches(schema: Schema, batches: Iterable[RecordBatch]) -> RecordBatchReader: ... def read_all(self) -> Table: ... def read_next_batch(self) -> RecordBatch: ... def read_pandas(self, **options) -> pd.DataFrame: ... @@ -1247,9 +1210,7 @@ class Scalar(_Weakrefable, Generic[_T]): @overload def cast(self, target_type: Literal["u8", "uint64"]) -> UInt64Scalar: ... @overload - def cast( - self, target_type: Literal["f2", "halffloat", "float16"] - ) -> HalfFloatScalar: ... + def cast(self, target_type: Literal["f2", "halffloat", "float16"]) -> HalfFloatScalar: ... @overload def cast(self, target_type: Literal["f4", "float", "float32"]) -> FloatScalar: ... @overload @@ -1271,22 +1232,16 @@ class Scalar(_Weakrefable, Generic[_T]): @overload def cast(self, target_type: Literal["time32[s]", "time32[ms]"]) -> Time32Scalar: ... @overload - def cast( - self, target_type: Literal["time64[us]", "time64[ns]"] - ) -> Time64Scalar: ... + def cast(self, target_type: Literal["time64[us]", "time64[ns]"]) -> Time64Scalar: ... @overload def cast( self, - target_type: Literal[ - "timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]" - ], + target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], ) -> TimestampScalar: ... 
@overload def cast( self, - target_type: Literal[ - "duration[s]", "duration[ms]", "duration[us]", "duration[ns]" - ], + target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], ) -> DurationScalar: ... @overload def cast( @@ -1307,18 +1262,14 @@ class Schema(_Weakrefable): def _field(self, i: int) -> Field: ... @staticmethod def _import_from_c(in_ptr: int) -> Schema: ... - def add_metadata( - self: _Self, metadata: dict[str | bytes, str | bytes] - ) -> _Self: ... + def add_metadata(self: _Self, metadata: dict[str | bytes, str | bytes]) -> _Self: ... def append(self: _Self, field: Field) -> _Self: ... def empty_table(self: _Self) -> _Self: ... def equals(self, other: Schema, check_metadata: bool = ...) -> bool: ... def field(self, i: int) -> Field: ... def field_by_name(self, name: str) -> Field | None: ... @classmethod - def from_pandas( - cls, df: pd.DataFrame, preserve_index: bool | None = ... - ) -> Schema: ... + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = ...) -> Schema: ... def get_all_field_indices(self, name: str) -> list[int]: ... def get_field_index(self, name: str) -> int: ... def insert(self: _Self, i: int, field: Field) -> _Self: ... @@ -1332,9 +1283,7 @@ class Schema(_Weakrefable): show_field_metadata: bool = ..., show_schema_metadata: bool = ..., ) -> str: ... - def with_metadata( - self: _Self, metadata: dict[str | bytes, str | bytes] - ) -> _Self: ... + def with_metadata(self: _Self, metadata: dict[str | bytes, str | bytes]) -> _Self: ... def __eq__(self, other) -> bool: ... def __getitem__(self, key: int) -> Field: ... def __iter__(self) -> Generator[Field, None, None]: ... @@ -1401,9 +1350,7 @@ class SparseCOOTensor(_Weakrefable, Generic[_T]): data: NDArray, coords: NDArray, shape: tuple, dim_names: list[str] | None = ... ) -> SparseCOOTensor: ... @staticmethod - def from_pydata_sparse( - obj, dim_names: list[str] | None = ... - ) -> SparseCOOTensor: ... + def from_pydata_sparse(obj, dim_names: list[str] | None = ...) -> SparseCOOTensor: ... @staticmethod def from_scipy(obj, dim_names: list[str] | None = ...) -> SparseCOOTensor: ... @staticmethod @@ -1454,9 +1401,7 @@ class SparseCSFTensor(_Weakrefable, Generic[_T]): def dim_name(self, i: int) -> str: ... def equals(self, other: SparseCSFTensor) -> bool: ... @staticmethod - def from_dense_numpy( - obj: NDArray, dim_names: list[str] | None = ... - ) -> SparseCSFTensor: ... + def from_dense_numpy(obj: NDArray, dim_names: list[str] | None = ...) -> SparseCSFTensor: ... @staticmethod def from_numpy( data: NDArray, @@ -1567,9 +1512,7 @@ class Table(_PandasConvertibleToDataFrame): def _to_pandas( self, options, categories=..., ignore_metadata=..., types_mapper=... ) -> Any: ... - def add_column( - self: _Self, i: int, field_: str | Field, column: Array - ) -> _Self: ... + def add_column(self: _Self, i: int, field_: str | Field, column: Array) -> _Self: ... def append_column(self: _Self, field_: str | Field, column: Array) -> _Self: ... def cast( self, @@ -1649,9 +1592,7 @@ class Table(_PandasConvertibleToDataFrame): self: _Self, metadata: dict[str | bytes, str | bytes] | None = ... ) -> _Self: ... def select(self, columns: list[str]) -> Table: ... - def set_column( - self: _Self, i: int, field_: str | Field, column: Array - ) -> _Self: ... + def set_column(self: _Self, i: int, field_: str | Field, column: Array) -> _Self: ... def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... 
def sort_by( self, @@ -1660,19 +1601,13 @@ class Table(_PandasConvertibleToDataFrame): ) -> Table: ... def take( self: _Self, - indices: list[int] - | IntegerArray - | NDArray[np.signedinteger | np.unsignedinteger], + indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], ) -> _Self: ... def to_batches(self, max_chunksize: int | None = ...) -> list[RecordBatch]: ... def to_pylist(self) -> list[dict]: ... def to_reader(self, max_chunksize: int | None = ...) -> RecordBatchReader: ... - def to_string( - self, *, show_metadata: bool = ..., preview_cols: int = ... - ) -> str: ... - def unify_dictionaries( - self: _Self, memory_pool: MemoryPool | None = ... - ) -> _Self: ... + def to_string(self, *, show_metadata: bool = ..., preview_cols: int = ...) -> str: ... + def unify_dictionaries(self: _Self, memory_pool: MemoryPool | None = ...) -> _Self: ... def validate(self, *, full: bool = ...) -> None: ... def __eq__(self, other) -> bool: ... @overload @@ -1859,8 +1794,7 @@ class _PandasConvertibleToDataFrame(_Weakrefable): safe: bool | None = ..., split_blocks: bool | None = ..., self_destruct: bool | None = ..., - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] - | None = ..., + types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] | None = ..., ) -> pd.DataFrame: ... class _PandasConvertibleToSeries(_Weakrefable): @@ -1879,8 +1813,7 @@ class _PandasConvertibleToSeries(_Weakrefable): safe: bool | None = ..., split_blocks: bool | None = ..., self_destruct: bool | None = ..., - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] - | None = ..., + types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] | None = ..., ) -> pd.Series: ... class _ReadPandasMixin: @@ -1924,9 +1857,7 @@ class _RecordBatchFileWriter(_RecordBatchStreamWriter): class _RecordBatchStreamReader(RecordBatchReader): stats: Any - def _open( - self, source, IpcReadOptionsoptions=..., MemoryPoolmemory_pool=... - ) -> Any: ... + def _open(self, source, IpcReadOptionsoptions=..., MemoryPoolmemory_pool=...) -> Any: ... def __next__(self) -> RecordBatch: ... class _RecordBatchStreamWriter(_CRecordBatchWriter): @@ -1981,15 +1912,9 @@ class ordered_dict: def __setitem__(self, index, object) -> Any: ... def __sizeof__(self) -> Any: ... -def __pyx_unpickle_SerializationContext( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle__PandasAPIShim( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle__PandasConvertible( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... +def __pyx_unpickle_SerializationContext(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle__PandasAPIShim(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... +def __pyx_unpickle__PandasConvertible(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def __pyx_unpickle___Pyx_EnumMeta(*args, **kwargs) -> Any: ... def _datetime_from_int(int64_tvalue, TimeUnitunit, tzinfo=...) -> Any: ... def _deprecate_serialization(name) -> Any: ... @@ -2042,9 +1967,7 @@ def bool_() -> DataType[bool]: ... @overload def chunked_array(arrays: Array[_T, _Scalar]) -> ChunkedArray[_T, _Scalar]: ... @overload -def chunked_array( - arrays: Array, type: DataType[_T] -) -> ChunkedArray[_T, Scalar[_T]]: ... +def chunked_array(arrays: Array, type: DataType[_T]) -> ChunkedArray[_T, Scalar[_T]]: ... 
@overload def compress( buf: Buffer | bytes | memoryview, @@ -2060,9 +1983,7 @@ def compress( asbytes: Literal[True], memory_pool: MemoryPool | None = ..., ) -> bytes: ... -def concat_arrays( - arrays: list[_Array], memory_pool: MemoryPool | None = ... -) -> _Array: ... +def concat_arrays(arrays: list[_Array], memory_pool: MemoryPool | None = ...) -> _Array: ... def concat_tables( tables: list[Table], promote: bool = ..., memory_pool: MemoryPool | None = ... ) -> Table: ... @@ -2094,9 +2015,7 @@ def dense_union( child_fields: list[Field], type_codes: list[int] | None = ... ) -> DenseUnionType: ... def deserialize(obj, context: SerializationContext = ...) -> object: ... -def deserialize_components( - components: dict, context: SerializationContext = ... -) -> object: ... +def deserialize_components(components: dict, context: SerializationContext = ...) -> object: ... def deserialize_from( source: NativeFile, base: object, context: SerializationContext = ... ) -> object: ... @@ -2145,9 +2064,7 @@ def large_binary() -> DataType[bytes]: ... def large_list(value_type: DataType[_T] | Field[_T]) -> LargeListType[_T]: ... def large_string() -> DataType[str]: ... def large_utf8() -> DataType[str]: ... -def list_( - value_type: DataType[_T] | Field[_T], list_size: int = ... -) -> ListType[_T]: ... +def list_(value_type: DataType[_T] | Field[_T], list_size: int = ...) -> ListType[_T]: ... def log_memory_allocations(enable: bool = ...) -> None: ... def logging_memory_pool(parent: MemoryPool) -> MemoryPool: ... def map_( @@ -2225,13 +2142,9 @@ def struct(fields: Iterable[Field]) -> StructType: ... def supported_memory_backends() -> list[str]: ... def system_memory_pool() -> MemoryPool: ... @overload -def table( - df: pd.DataFrame, schema: Schema | None = ..., nthreads: int | None = ... -) -> Table: ... +def table(df: pd.DataFrame, schema: Schema | None = ..., nthreads: int | None = ...) -> Table: ... @overload -def table( - data: RecordBatch, schema: Schema | None = ..., nthreads: int | None = ... -) -> Table: ... +def table(data: RecordBatch, schema: Schema | None = ..., nthreads: int | None = ...) -> Table: ... 
@overload def table( arrays: list[Array], diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index 472e318408a..e205c825b03 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,16 +1,12 @@ from io import IOBase -from pyarrow._orc import ( - ORCReader as _ORCReader, - ORCWriter as _ORCWriter, -) -from pyarrow.lib import ( - KeyValueMetadata, - NativeFile, - RecordBatch, - Schema, - Table, -) +from pyarrow._orc import ORCReader as _ORCReader +from pyarrow._orc import ORCWriter as _ORCWriter +from pyarrow.lib import KeyValueMetadata +from pyarrow.lib import NativeFile +from pyarrow.lib import RecordBatch +from pyarrow.lib import Schema +from pyarrow.lib import Table from ._fs import FileSystem diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi index d3567ffbc27..76655afd361 100644 --- a/pyarrow-stubs/pandas_compat.pyi +++ b/pyarrow-stubs/pandas_compat.pyi @@ -1,19 +1,16 @@ -from typing import ( - Any, - Callable, -) +from typing import Any +from typing import Callable import numpy as np import pandas as pd + from pandas.core.internals import BlockManager -from pyarrow.lib import ( - Array, - DataType, - Schema, - Table, - _ArrowType, - frombytes as frombytes, -) +from pyarrow.lib import Array +from pyarrow.lib import DataType +from pyarrow.lib import Schema +from pyarrow.lib import Table +from pyarrow.lib import _ArrowType +from pyarrow.lib import frombytes as frombytes from typing_extensions import TypedDict class _SerializedDict(TypedDict): diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index d6f9cb071a1..30494d61ea8 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -1,41 +1,35 @@ +import pathlib + from io import IOBase from os import PathLike -import pathlib -from typing import ( - Callable, - Generator, - Generic, - TypeVar, -) +from typing import Callable +from typing import Generator +from typing import Generic +from typing import TypeVar -from _typeshed import Incomplete import pyarrow -from pyarrow import ( - Array, - NativeFile, - RecordBatch, - Schema, - Table, -) -from pyarrow._parquet import ( - ColumnChunkMetaData as ColumnChunkMetaData, - ColumnSchema as ColumnSchema, - FileDecryptionProperties as FileDecryptionProperties, - FileEncryptionProperties as FileEncryptionProperties, - FileMetaData as FileMetaData, - ParquetLogicalType as ParquetLogicalType, - ParquetReader as ParquetReader, - ParquetSchema as ParquetSchema, - RowGroupMetaData as RowGroupMetaData, - Statistics as Statistics, -) + +from _typeshed import Incomplete +from pyarrow import Array +from pyarrow import NativeFile +from pyarrow import RecordBatch +from pyarrow import Schema +from pyarrow import Table +from pyarrow._parquet import ColumnChunkMetaData as ColumnChunkMetaData +from pyarrow._parquet import ColumnSchema as ColumnSchema +from pyarrow._parquet import FileDecryptionProperties as FileDecryptionProperties +from pyarrow._parquet import FileEncryptionProperties as FileEncryptionProperties +from pyarrow._parquet import FileMetaData as FileMetaData +from pyarrow._parquet import ParquetLogicalType as ParquetLogicalType +from pyarrow._parquet import ParquetReader as ParquetReader +from pyarrow._parquet import ParquetSchema as ParquetSchema +from pyarrow._parquet import RowGroupMetaData as RowGroupMetaData +from pyarrow._parquet import Statistics as Statistics from pyarrow.compute import Expression from pyarrow.dataset import Partitioning from pyarrow.fs import FileSystem -from typing_extensions 
import ( - Literal, - TypeAlias, -) +from typing_extensions import Literal +from typing_extensions import TypeAlias def filters_to_expression( filters: list[tuple[str, str, str] | list[tuple[str, str, str]]], @@ -100,9 +94,7 @@ class ParquetFile: use_threads: bool = ..., use_pandas_metadata: bool = ..., ) -> pyarrow.Table: ... - def scan_contents( - self, columns: list[int] | None = ..., batch_size: int = ... - ) -> int: ... + def scan_contents(self, columns: list[int] | None = ..., batch_size: int = ...) -> int: ... _COMPRESSION: TypeAlias = Literal["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"] @@ -144,9 +136,7 @@ class ParquetWriter: table_or_batch: Table | RecordBatch, row_group_size: int | None = ..., ) -> None: ... - def write_batch( - self, batch: RecordBatch, row_group_size: int | None = ... - ) -> None: ... + def write_batch(self, batch: RecordBatch, row_group_size: int | None = ...) -> None: ... def write_table(self, table: Table, row_group_size: int | None = ...) -> None: ... def close(self) -> None: ... diff --git a/pyarrow-stubs/parquet/encryption.pyi b/pyarrow-stubs/parquet/encryption.pyi index 53918ce5927..713edb3aa6a 100644 --- a/pyarrow-stubs/parquet/encryption.pyi +++ b/pyarrow-stubs/parquet/encryption.pyi @@ -1,7 +1,5 @@ -from pyarrow._parquet_encryption import ( - CryptoFactory as CryptoFactory, - DecryptionConfiguration as DecryptionConfiguration, - EncryptionConfiguration as EncryptionConfiguration, - KmsClient as KmsClient, - KmsConnectionConfig as KmsConnectionConfig, -) +from pyarrow._parquet_encryption import CryptoFactory as CryptoFactory +from pyarrow._parquet_encryption import DecryptionConfiguration as DecryptionConfiguration +from pyarrow._parquet_encryption import EncryptionConfiguration as EncryptionConfiguration +from pyarrow._parquet_encryption import KmsClient as KmsClient +from pyarrow._parquet_encryption import KmsConnectionConfig as KmsConnectionConfig diff --git a/pyarrow-stubs/plasma.pyi b/pyarrow-stubs/plasma.pyi index c3407adf769..76f41c9d7a8 100644 --- a/pyarrow-stubs/plasma.pyi +++ b/pyarrow-stubs/plasma.pyi @@ -2,16 +2,14 @@ from collections.abc import Generator from subprocess import Popen from types import ModuleType -from pyarrow._plasma import ( - ObjectID as ObjectID, - ObjectNotAvailable as ObjectNotAvailable, - PlasmaBuffer as PlasmaBuffer, - PlasmaClient as PlasmaClient, - PlasmaObjectExists as PlasmaObjectExists, - PlasmaObjectNotFound as PlasmaObjectNotFound, - PlasmaStoreFull as PlasmaStoreFull, - connect as connect, -) +from pyarrow._plasma import ObjectID as ObjectID +from pyarrow._plasma import ObjectNotAvailable as ObjectNotAvailable +from pyarrow._plasma import PlasmaBuffer as PlasmaBuffer +from pyarrow._plasma import PlasmaClient as PlasmaClient +from pyarrow._plasma import PlasmaObjectExists as PlasmaObjectExists +from pyarrow._plasma import PlasmaObjectNotFound as PlasmaObjectNotFound +from pyarrow._plasma import PlasmaStoreFull as PlasmaStoreFull +from pyarrow._plasma import connect as connect TF_PLASMA_OP_PATH: str tf_plasma_op: ModuleType | None diff --git a/pyarrow-stubs/serialization.pyi b/pyarrow-stubs/serialization.pyi index 677cb9f5249..f40af9d8c4c 100644 --- a/pyarrow-stubs/serialization.pyi +++ b/pyarrow-stubs/serialization.pyi @@ -1,8 +1,6 @@ -from pyarrow.lib import ( - SerializationContext as SerializationContext, - builtin_pickle as builtin_pickle, - py_buffer as py_buffer, -) +from pyarrow.lib import SerializationContext as SerializationContext +from pyarrow.lib import builtin_pickle as builtin_pickle +from 
pyarrow.lib import py_buffer as py_buffer try: import cloudpickle # type: ignore diff --git a/pyarrow-stubs/substrait.pyi b/pyarrow-stubs/substrait.pyi index c4b612d38f4..da9956b89d7 100644 --- a/pyarrow-stubs/substrait.pyi +++ b/pyarrow-stubs/substrait.pyi @@ -1,4 +1,2 @@ -from pyarrow._substrait import ( - get_supported_functions as get_supported_functions, - run_query as run_query, -) +from pyarrow._substrait import get_supported_functions as get_supported_functions +from pyarrow._substrait import run_query as run_query diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi index 9a545bd12c6..8c981fde8ef 100644 --- a/pyarrow-stubs/types.pyi +++ b/pyarrow-stubs/types.pyi @@ -1,9 +1,7 @@ -from pyarrow.lib import ( - DataType, - is_boolean_value as is_boolean_value, - is_float_value as is_float_value, - is_integer_value as is_integer_value, -) +from pyarrow.lib import DataType +from pyarrow.lib import is_boolean_value as is_boolean_value +from pyarrow.lib import is_float_value as is_float_value +from pyarrow.lib import is_integer_value as is_integer_value def is_null(t: DataType) -> bool: ... def is_boolean(t: DataType) -> bool: ... diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi index b117b78c62f..cd8f7f2b5fc 100644 --- a/pyarrow-stubs/util.pyi +++ b/pyarrow-stubs/util.pyi @@ -1,8 +1,6 @@ from collections.abc import Sequence -from typing import ( - Callable, - TypeVar, -) +from typing import Callable +from typing import TypeVar _T = TypeVar("_T") diff --git a/pyproject.toml b/pyproject.toml index 1cc508c3b04..1f0dd6a5391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,48 +1,66 @@ -[tool.poetry] +#:schema https://json.schemastore.org/pyproject.json + +[project] name = "pyarrow-stubs" version = "10.0.1.9" description = "Type annotations for pyarrow" -authors = ["ZhengYu, Xu "] +authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" -homepage = "https://github.com/zen-xu/pyarrow-stubs" classifiers = [ - "Development Status :: 3 - Alpha", "License :: OSI Approved :: BSD License", - "Environment :: Console", - "Intended Audience :: Science/Research", - "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", - "Topic :: Scientific/Engineering", ] -packages = [{ include = "pyarrow-stubs" }] - -[tool.poetry.dependencies] -python = "^3.7" +requires-python = ">=3.8,<4" -[tool.poetry.dev-dependencies] -mypy = "^0.991" -pre-commit = ">=2.19.0" -typing-extensions = ">=4.2.0" +[project.urls] +homepage = "https://github.com/zen-xu/pyarrow-stubs" +repository = "https://github.com/zen-xu/pyarrow-stubs.git" +issues = "https://github.com/zen-xu/pyarrow-stubs/issues" [build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +build-backend = "hatchling.build" +requires = ["hatchling"] + +[tool.hatch.build.targets.wheel] +packages = ["pyarrow-stubs"] + +[tool.pixi.project] +channels = ["conda-forge"] +platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] + +[tool.pixi.pypi-dependencies] +pyarrow-stubs = { path = ".", editable = true } +pre-commit = "*" +mypy = ">=1.11" +ruff = ">=0.5" [tool.ruff] fix = true -line-length = 88 -target-version = "py37" +line-length = 99 
+target-version = "py38" [tool.ruff.lint] -ignore = [ - "F811", # redefined-while-unused - "F821", # undefined-name +select = [ + "I", # isort ] + +[tool.ruff.lint.isort] +force-single-line = true +lines-after-imports = 2 +lines-between-types = 1 + +[tool.ruff.format] +docstring-code-format = true + +[tool.mypy] +explicit_package_bases = true +files = "pyarrow-stubs" +namespace_packages = true +show_error_codes = true From f56fe44c4c76e32c98892c82dbe2d2d9c4b1463c Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 Aug 2024 23:00:38 +0800 Subject: [PATCH 043/231] chore: add taplo config (#32) --- .github/workflows/lint.yaml | 21 +++++++++++++++++ pyproject.toml | 46 ++++++++++++++++++------------------- taplo.toml | 5 ++++ 3 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/lint.yaml create mode 100644 taplo.toml diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000000..7f6e71e825c --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,21 @@ +name: Lint + +on: + push: + branches: + - main + pull_request: + types: + - opened + - synchronize + +jobs: + taplo-lint: + name: taplo lint + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: uncenter/setup-taplo@v1 + with: + version: "0.9.3" + - run: taplo fmt --check diff --git a/pyproject.toml b/pyproject.toml index 1f0dd6a5391..7fe9a1fabce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,52 +7,52 @@ description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" classifiers = [ - "License :: OSI Approved :: BSD License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] requires-python = ">=3.8,<4" [project.urls] -homepage = "https://github.com/zen-xu/pyarrow-stubs" +homepage = "https://github.com/zen-xu/pyarrow-stubs" repository = "https://github.com/zen-xu/pyarrow-stubs.git" -issues = "https://github.com/zen-xu/pyarrow-stubs/issues" +issues = "https://github.com/zen-xu/pyarrow-stubs/issues" [build-system] build-backend = "hatchling.build" -requires = ["hatchling"] +requires = ["hatchling"] [tool.hatch.build.targets.wheel] packages = ["pyarrow-stubs"] [tool.pixi.project] -channels = ["conda-forge"] +channels = ["conda-forge"] platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] [tool.pixi.pypi-dependencies] pyarrow-stubs = { path = ".", editable = true } -pre-commit = "*" -mypy = ">=1.11" -ruff = ">=0.5" +pre-commit = "*" +mypy = ">=1.11" +ruff = ">=0.5" [tool.ruff] -fix = true -line-length = 99 +fix = true +line-length = 99 target-version = "py38" [tool.ruff.lint] select = [ - "I", # isort + "I", # isort ] [tool.ruff.lint.isort] -force-single-line = true +force-single-line = true lines-after-imports = 2 lines-between-types = 1 @@ -61,6 +61,6 @@ docstring-code-format = true 
[tool.mypy] explicit_package_bases = true -files = "pyarrow-stubs" -namespace_packages = true -show_error_codes = true +files = "pyarrow-stubs" +namespace_packages = true +show_error_codes = true diff --git a/taplo.toml b/taplo.toml new file mode 100644 index 00000000000..69418d9d7de --- /dev/null +++ b/taplo.toml @@ -0,0 +1,5 @@ +include = ["*.toml"] + +[formatting] +align_entries = true +inline_table_expand = false From b2790a0f6cbd2de827703b3848dcf545f587bd9f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 Aug 2024 23:02:45 +0800 Subject: [PATCH 044/231] chore: update LICENSE date (#33) --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index f3a65abd261..6d8e2aff5b7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2022, ZhengYu, Xu +Copyright (c) 2024, ZhengYu, Xu Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: From f1efa24488c060912dfac3fcd0e54379747bfe15 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 Aug 2024 23:06:22 +0800 Subject: [PATCH 045/231] doc: add CODE_OF_CONDUCT.md (#34) --- CODE_OF_CONDUCT.md | 128 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000..fd680cbde25 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. 
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+<https://www.contributor-covenant.org/version/2/0/code_of_conduct.html>.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+<https://www.contributor-covenant.org/faq>. Translations are available at
+<https://www.contributor-covenant.org/translations>.
From 264b9931a74e465cde9b80938b2fe9bfd714f2df Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:49:50 +0800 Subject: [PATCH 046/231] [pre-commit.ci] pre-commit autoupdate (#38) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d7dc94f285..bc85e3926c1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.6 + rev: v0.5.7 hooks: - id: ruff args: [--fix] From dadeaeed6b3e09f8a46b5404db5aa3798a0df1f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:14:39 +0800 Subject: [PATCH 047/231] [pre-commit.ci] pre-commit autoupdate (#39) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.7 → v0.6.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.7...v0.6.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc85e3926c1..4669d4c6753 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.7 + rev: v0.6.1 hooks: - id: ruff args: [--fix] From aad307b795257fa48e06bf4eb7b03a6052a59df5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 12:28:11 +0800 Subject: [PATCH 048/231] [pre-commit.ci] pre-commit autoupdate (#48) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.1 → v0.6.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.1...v0.6.2) - [github.com/pre-commit/mirrors-mypy: v1.11.1 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.1...v1.11.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4669d4c6753..276a8e0315f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,14 +19,14 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.1 + rev: v0.6.2 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.1 + rev: v1.11.2 hooks: - id: mypy additional_dependencies: From 6c3611541b427374e86f58356e0d903a2e984fe4 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 12:45:13 +0800 Subject: [PATCH 049/231] refactor: rewrite type annotations by hand. 
(#35) * chore: restart * update ruff config * build: add extra dependencies * update mypy config * feat: add util.pyi * feat: add types.pyi * feat: impl lib.pyi * update * feat: add acero.pyi * feat: add compute.pyi * add benchmark.pyi * add cffi * feat: add csv.pyi * disable isort single line * reformat * update compute.pyi * add _auzurefs.pyi * add _cuda.pyi * add _dataset.pyi * rename _stub_typing.pyi -> _stubs_typing.pyi * add _dataset_orc.pyi * add pyarrow-stubs/_dataset_parquet_encryption.pyi * add _dataset_parquet.pyi * add _feather.pyi * feat: add _flight.pyi * add _fs.pyi * add _gcsfs.pyi * add _hdfs.pyi * add _json.pyi * add _orc.pyi * add _parquet_encryption.pyi * add _parquet.pyi * update * add _parquet.pyi * add _s3fs.pyi * add _substrait.pyi * update * update * add parquet/core.pyi * add parquet/encryption.pyi * add BufferProtocol * impl _filesystemdataset_write * add dataset.pyi * add feather.pyi * add flight.pyi * add fs.pyi * add gandiva.pyi * add json.pyi * add orc.pyi * add pandas_compat.pyi * add substrait.pyi * update util.pyi * add interchange * add __lib_pxi * update __lib_pxi * update * update * add types.pyi * feat: add scalar.pyi * update types.pyi * update types.pyi * update scalar.pyi * update * update * update * update * update * update * feat: impl array * feat: add builder.pyi * add scipy * add tensor.pyi * feat: impl NativeFile * update io.pyi * complete io.pyi * add ipc.pyi * mv benchmark.pyi into __lib_pxi * add table.pyi * do re-export in lib.pyi * fix io.pyi * update * optimize scalar.pyi * optimize indices * complete ipc.pyi * update * fix NullableIterable * fix string array * ignore overload-overlap error * fix _Tabular.__getitem__ * remove additional_dependencies --- .pre-commit-config.yaml | 4 - pixi.lock | 659 ++++- pyarrow-stubs/__init__.pyi | 860 +++++-- .../{py.typed => __lib_pxi/__init__.pyi} | 0 pyarrow-stubs/__lib_pxi/array.pyi | 1667 ++++++++++++ pyarrow-stubs/__lib_pxi/benchmark.pyi | 1 + pyarrow-stubs/__lib_pxi/builder.pyi | 25 + pyarrow-stubs/__lib_pxi/compat.pyi | 5 + pyarrow-stubs/__lib_pxi/config.pyi | 41 + pyarrow-stubs/__lib_pxi/device.pyi | 39 + pyarrow-stubs/__lib_pxi/error.pyi | 48 + pyarrow-stubs/__lib_pxi/io.pyi | 371 +++ pyarrow-stubs/__lib_pxi/ipc.pyi | 194 ++ pyarrow-stubs/__lib_pxi/memory.pyi | 40 + pyarrow-stubs/__lib_pxi/pandas_shim.pyi | 51 + pyarrow-stubs/__lib_pxi/scalar.pyi | 454 ++++ pyarrow-stubs/__lib_pxi/table.pyi | 604 +++++ pyarrow-stubs/__lib_pxi/tensor.pyi | 177 ++ pyarrow-stubs/__lib_pxi/types.pyi | 703 ++++++ pyarrow-stubs/_azurefs.pyi | 14 + pyarrow-stubs/_compute.pyi | 959 +++---- pyarrow-stubs/_compute_docstrings.pyi | 7 - pyarrow-stubs/_csv.pyi | 240 +- pyarrow-stubs/_cuda.pyi | 94 + pyarrow-stubs/_dataset.pyi | 831 +++--- pyarrow-stubs/_dataset_orc.pyi | 13 +- pyarrow-stubs/_dataset_parquet.pyi | 196 +- pyarrow-stubs/_dataset_parquet_encryption.pyi | 33 + pyarrow-stubs/_exec_plan.pyi | 23 - pyarrow-stubs/_feather.pyi | 38 +- pyarrow-stubs/_flight.pyi | 952 +++---- pyarrow-stubs/_fs.pyi | 336 +-- pyarrow-stubs/_gcsfs.pyi | 117 +- pyarrow-stubs/_generated_version.pyi | 5 - pyarrow-stubs/_hdfs.pyi | 32 +- pyarrow-stubs/_hdfsio.pyi | 69 - pyarrow-stubs/_json.pyi | 42 +- pyarrow-stubs/_orc.pyi | 94 +- pyarrow-stubs/_parquet.pyi | 590 +++-- pyarrow-stubs/_parquet_encryption.pyi | 154 +- pyarrow-stubs/_plasma.pyi | 105 - pyarrow-stubs/_s3fs.pyi | 117 +- pyarrow-stubs/_stubs_typing.pyi | 68 + pyarrow-stubs/_substrait.pyi | 30 +- pyarrow-stubs/acero.pyi | 75 + pyarrow-stubs/benchmark.pyi | 1 - 
pyarrow-stubs/compute.pyi | 116 +- pyarrow-stubs/csv.pyi | 11 - pyarrow-stubs/cuda.pyi | 10 - pyarrow-stubs/dataset.pyi | 293 ++- pyarrow-stubs/feather.pyi | 79 +- pyarrow-stubs/filesystem.pyi | 49 - pyarrow-stubs/flight.pyi | 140 +- pyarrow-stubs/fs.pyi | 112 +- pyarrow-stubs/gandiva.pyi | 65 + pyarrow-stubs/hdfs.pyi | 28 - pyarrow-stubs/interchange/__init__.pyi | 0 pyarrow-stubs/interchange/buffer.pyi | 22 + pyarrow-stubs/interchange/column.pyi | 62 + pyarrow-stubs/interchange/dataframe.pyi | 24 + pyarrow-stubs/interchange/from_dataframe.pyi | 49 + pyarrow-stubs/ipc.pyi | 136 +- pyarrow-stubs/json.pyi | 6 +- pyarrow-stubs/jvm.pyi | 17 - pyarrow-stubs/lib.pyi | 2235 +---------------- pyarrow-stubs/orc.pyi | 86 +- pyarrow-stubs/pandas_compat.pyi | 69 +- pyarrow-stubs/parquet/__init__.pyi | 1 - pyarrow-stubs/parquet/core.pyi | 538 ++-- pyarrow-stubs/parquet/encryption.pyi | 20 +- pyarrow-stubs/plasma.pyi | 26 - pyarrow-stubs/serialization.pyi | 16 - pyarrow-stubs/substrait.pyi | 17 +- pyarrow-stubs/types.pyi | 8 +- pyarrow-stubs/util.pyi | 24 +- pyproject.toml | 18 +- 76 files changed, 9380 insertions(+), 6075 deletions(-) rename pyarrow-stubs/{py.typed => __lib_pxi/__init__.pyi} (100%) create mode 100644 pyarrow-stubs/__lib_pxi/array.pyi create mode 100644 pyarrow-stubs/__lib_pxi/benchmark.pyi create mode 100644 pyarrow-stubs/__lib_pxi/builder.pyi create mode 100644 pyarrow-stubs/__lib_pxi/compat.pyi create mode 100644 pyarrow-stubs/__lib_pxi/config.pyi create mode 100644 pyarrow-stubs/__lib_pxi/device.pyi create mode 100644 pyarrow-stubs/__lib_pxi/error.pyi create mode 100644 pyarrow-stubs/__lib_pxi/io.pyi create mode 100644 pyarrow-stubs/__lib_pxi/ipc.pyi create mode 100644 pyarrow-stubs/__lib_pxi/memory.pyi create mode 100644 pyarrow-stubs/__lib_pxi/pandas_shim.pyi create mode 100644 pyarrow-stubs/__lib_pxi/scalar.pyi create mode 100644 pyarrow-stubs/__lib_pxi/table.pyi create mode 100644 pyarrow-stubs/__lib_pxi/tensor.pyi create mode 100644 pyarrow-stubs/__lib_pxi/types.pyi create mode 100644 pyarrow-stubs/_azurefs.pyi delete mode 100644 pyarrow-stubs/_compute_docstrings.pyi create mode 100644 pyarrow-stubs/_cuda.pyi create mode 100644 pyarrow-stubs/_dataset_parquet_encryption.pyi delete mode 100644 pyarrow-stubs/_exec_plan.pyi delete mode 100644 pyarrow-stubs/_generated_version.pyi delete mode 100644 pyarrow-stubs/_hdfsio.pyi delete mode 100644 pyarrow-stubs/_plasma.pyi create mode 100644 pyarrow-stubs/_stubs_typing.pyi create mode 100644 pyarrow-stubs/acero.pyi delete mode 100644 pyarrow-stubs/benchmark.pyi delete mode 100644 pyarrow-stubs/csv.pyi delete mode 100644 pyarrow-stubs/cuda.pyi delete mode 100644 pyarrow-stubs/filesystem.pyi create mode 100644 pyarrow-stubs/gandiva.pyi delete mode 100644 pyarrow-stubs/hdfs.pyi create mode 100644 pyarrow-stubs/interchange/__init__.pyi create mode 100644 pyarrow-stubs/interchange/buffer.pyi create mode 100644 pyarrow-stubs/interchange/column.pyi create mode 100644 pyarrow-stubs/interchange/dataframe.pyi create mode 100644 pyarrow-stubs/interchange/from_dataframe.pyi delete mode 100644 pyarrow-stubs/jvm.pyi delete mode 100644 pyarrow-stubs/plasma.pyi delete mode 100644 pyarrow-stubs/serialization.pyi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 276a8e0315f..51a931e980a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,3 @@ repos: rev: v1.11.2 hooks: - id: mypy - additional_dependencies: - - types-cffi - - numpy - - pandas-stubs diff --git a/pixi.lock b/pixi.lock index 
1d360d255cf..d7d491171fc 100644 --- a/pixi.lock +++ b/pixi.lock @@ -28,19 +28,47 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/1c/21/a6b46c91b4c9d1918ee59c305f46850cde7cbea748635a352e7c3c8ed204/mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/f3/61eeef119beb37decb58e7cb29940f19a1464b8608f2cab8a8616aba75fd/numpy-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/8e/ee/8a26858ca517e9c64f84b4c7734b89bda8e63bec85c3d2f432d225bb1886/scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
osx-64: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda @@ -56,19 +84,47 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3a/34/69638cee2e87303f19a0c35e80d42757e14d9aba328f272fdcdc0bf3c9b8/mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/64/1c/401489a7e92c30db413362756c313b9353fb47565015986c55582593e2ae/numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl - pypi: https://files.pythonhosted.org/packages/a4/10/1be32aeaab8728f78f673e7a47dd813222364479b2d6573dbcf0085e83ea/ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c0/04/2bdacc8ac6387b15db6faa40295f8bd25eccf33f1f13e68a72dc3c60a99e/scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
osx-arm64: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda @@ -84,19 +140,47 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c4/3c/3e0611348fc53a4a7c80485959478b4f6eae706baf3b7c03cafa22639216/mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/08/61/460fb524bb2d1a8bd4bbcb33d9b0971f9837fdedcfda8478d4c8f5cfd7ee/numpy-2.0.1-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/3d/1d/c218ce83beb4394ba04d05e9aa2ae6ce9fba8405688fe878b0fdb40ce855/ruff-0.5.7-py3-none-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/c8/53/35b4d41f5fd42f5781dbd0dd6c05d35ba8aa75c84ecddc7d44756cd8da2e/scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
win-64: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda @@ -114,19 +198,46 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 + - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/1e/b7/3a50f318979c8c541428c2f1ee973cda813bcc89614de982dafdd0df2b3e/mypy-1.11.1-cp312-cp312-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/59/f6ad30785a6578ad85ed9c2785f271b39c3e5b6412c66e810d2c60934c9f/numpy-2.0.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/67/1c/4520c98bfc06b9c73cd1457686d4d3935d40046b1ddea08403e5a6deff51/ruff-0.5.7-py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/aa/7d/43ab67228ef98c6b5dd42ab386eae2d7877036970a0d7e3dd3eb47a0d530/scipy-1.14.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
packages: - kind: conda @@ -160,6 +271,19 @@ packages: purls: [] size: 23621 timestamp: 1650670423406 +- kind: pypi + name: asttokens + version: 2.4.1 + url: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl + sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 + requires_dist: + - six>=1.12.0 + - typing ; python_version < '3.5' + - astroid<2,>=1 ; python_version < '3' and extra == 'astroid' + - astroid<4,>=2 ; python_version >= '3' and extra == 'astroid' + - pytest ; extra == 'test' + - astroid<2,>=1 ; python_version < '3' and extra == 'test' + - astroid<4,>=2 ; python_version >= '3' and extra == 'test' - kind: conda name: bzip2 version: 1.0.8 @@ -281,11 +405,37 @@ packages: url: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl sha256: b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 requires_python: '>=3.8' +- kind: pypi + name: colorama + version: 0.4.6 + url: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl + sha256: 4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 + requires_python: '!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7' +- kind: pypi + name: decorator + version: 5.1.1 + url: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl + sha256: b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + requires_python: '>=3.5' - kind: pypi name: distlib version: 0.3.8 url: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl sha256: 034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 +- kind: pypi + name: executing + version: 2.0.1 + url: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl + sha256: eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc + requires_dist: + - asttokens>=2.1.0 ; extra == 'tests' + - ipython ; extra == 'tests' + - pytest ; extra == 'tests' + - coverage ; extra == 'tests' + - coverage-enable-subprocess ; extra == 'tests' + - littleutils ; extra == 'tests' + - rich ; python_version >= '3.11' and extra == 'tests' + requires_python: '>=3.5' - kind: pypi name: filelock version: 3.15.4 @@ -306,6 +456,18 @@ packages: - virtualenv>=20.26.2 ; extra == 'testing' - typing-extensions>=4.8 ; python_version < '3.11' and extra == 'typing' requires_python: '>=3.8' +- kind: pypi + name: hatchling + version: 1.25.0 + url: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl + sha256: b47948e45d4d973034584dd4cb39c14b6a70227cf287ab7ec0ad7983408a882c + requires_dist: + - packaging>=23.2 + - pathspec>=0.10.1 + - pluggy>=1.0.0 + - tomli>=1.2.2 ; python_version < '3.11' + - trove-classifiers + requires_python: '>=3.8' - kind: pypi name: identify version: 2.6.0 @@ -314,6 +476,100 @@ packages: requires_dist: - ukkonen ; extra == 'license' requires_python: '>=3.8' +- kind: pypi + name: ipython + version: 8.26.0 + url: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + 
sha256: e6b347c27bdf9c32ee9d31ae85defc525755a1869f14057e900675b9e8d6e6ff + requires_dist: + - decorator + - jedi>=0.16 + - matplotlib-inline + - prompt-toolkit<3.1.0,>=3.0.41 + - pygments>=2.4.0 + - stack-data + - traitlets>=5.13.0 + - exceptiongroup ; python_version < '3.11' + - typing-extensions>=4.6 ; python_version < '3.12' + - pexpect>4.3 ; sys_platform != 'win32' and sys_platform != 'emscripten' + - colorama ; sys_platform == 'win32' + - ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole] ; extra == 'all' + - ipython[test,test-extra] ; extra == 'all' + - black ; extra == 'black' + - docrepr ; extra == 'doc' + - exceptiongroup ; extra == 'doc' + - intersphinx-registry ; extra == 'doc' + - ipykernel ; extra == 'doc' + - ipython[test] ; extra == 'doc' + - matplotlib ; extra == 'doc' + - setuptools>=18.5 ; extra == 'doc' + - sphinx-rtd-theme ; extra == 'doc' + - sphinx>=1.3 ; extra == 'doc' + - sphinxcontrib-jquery ; extra == 'doc' + - typing-extensions ; extra == 'doc' + - tomli ; python_version < '3.11' and extra == 'doc' + - ipykernel ; extra == 'kernel' + - matplotlib ; extra == 'matplotlib' + - nbconvert ; extra == 'nbconvert' + - nbformat ; extra == 'nbformat' + - ipywidgets ; extra == 'notebook' + - notebook ; extra == 'notebook' + - ipyparallel ; extra == 'parallel' + - qtconsole ; extra == 'qtconsole' + - pytest ; extra == 'test' + - pytest-asyncio<0.22 ; extra == 'test' + - testpath ; extra == 'test' + - pickleshare ; extra == 'test' + - packaging ; extra == 'test' + - ipython[test] ; extra == 'test-extra' + - curio ; extra == 'test-extra' + - matplotlib!=3.2.0 ; extra == 'test-extra' + - nbformat ; extra == 'test-extra' + - numpy>=1.23 ; extra == 'test-extra' + - pandas ; extra == 'test-extra' + - trio ; extra == 'test-extra' + requires_python: '>=3.10' +- kind: pypi + name: jedi + version: 0.19.1 + url: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + sha256: e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 + requires_dist: + - parso<0.9.0,>=0.8.3 + - jinja2==2.11.3 ; extra == 'docs' + - markupsafe==1.1.1 ; extra == 'docs' + - pygments==2.8.1 ; extra == 'docs' + - alabaster==0.7.12 ; extra == 'docs' + - babel==2.9.1 ; extra == 'docs' + - chardet==4.0.0 ; extra == 'docs' + - commonmark==0.8.1 ; extra == 'docs' + - docutils==0.17.1 ; extra == 'docs' + - future==0.18.2 ; extra == 'docs' + - idna==2.10 ; extra == 'docs' + - imagesize==1.2.0 ; extra == 'docs' + - mock==1.0.1 ; extra == 'docs' + - packaging==20.9 ; extra == 'docs' + - pyparsing==2.4.7 ; extra == 'docs' + - pytz==2021.1 ; extra == 'docs' + - readthedocs-sphinx-ext==2.1.4 ; extra == 'docs' + - recommonmark==0.5.0 ; extra == 'docs' + - requests==2.25.1 ; extra == 'docs' + - six==1.15.0 ; extra == 'docs' + - snowballstemmer==2.1.0 ; extra == 'docs' + - sphinx-rtd-theme==0.4.3 ; extra == 'docs' + - sphinx==1.8.5 ; extra == 'docs' + - sphinxcontrib-serializinghtml==1.1.4 ; extra == 'docs' + - sphinxcontrib-websupport==1.2.4 ; extra == 'docs' + - urllib3==1.26.4 ; extra == 'docs' + - flake8==5.0.4 ; extra == 'qa' + - mypy==0.971 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + - django ; extra == 'testing' + - attrs ; extra == 'testing' + - colorama ; extra == 'testing' + - docopt ; extra == 'testing' + - pytest<7.0.0 ; extra == 'testing' + requires_python: '>=3.6' - kind: conda name: ld_impl_linux-64 version: '2.40' @@ -666,6 +922,14 @@ packages: purls: [] size: 
46921 timestamp: 1716874262512 +- kind: pypi + name: matplotlib-inline + version: 0.1.7 + url: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl + sha256: df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca + requires_dist: + - traitlets + requires_python: '>=3.8' - kind: pypi name: mypy version: 1.11.1 @@ -772,6 +1036,30 @@ packages: url: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl sha256: ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' +- kind: pypi + name: numpy + version: 2.0.1 + url: https://files.pythonhosted.org/packages/08/61/460fb524bb2d1a8bd4bbcb33d9b0971f9837fdedcfda8478d4c8f5cfd7ee/numpy-2.0.1-cp312-cp312-macosx_11_0_arm64.whl + sha256: 7d6fddc5fe258d3328cd8e3d7d3e02234c5d70e01ebe377a6ab92adb14039cb4 + requires_python: '>=3.9' +- kind: pypi + name: numpy + version: 2.0.1 + url: https://files.pythonhosted.org/packages/2c/f3/61eeef119beb37decb58e7cb29940f19a1464b8608f2cab8a8616aba75fd/numpy-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 6790654cb13eab303d8402354fabd47472b24635700f631f041bd0b65e37298a + requires_python: '>=3.9' +- kind: pypi + name: numpy + version: 2.0.1 + url: https://files.pythonhosted.org/packages/64/1c/401489a7e92c30db413362756c313b9353fb47565015986c55582593e2ae/numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl + sha256: 6bf4e6f4a2a2e26655717a1983ef6324f2664d7011f6ef7482e8c0b3d51e82ac + requires_python: '>=3.9' +- kind: pypi + name: numpy + version: 2.0.1 + url: https://files.pythonhosted.org/packages/b5/59/f6ad30785a6578ad85ed9c2785f271b39c3e5b6412c66e810d2c60934c9f/numpy-2.0.1-cp312-cp312-win_amd64.whl + sha256: bb2124fdc6e62baae159ebcfa368708867eb56806804d005860b6007388df171 + requires_python: '>=3.9' - kind: conda name: openssl version: 3.3.1 @@ -851,6 +1139,46 @@ packages: purls: [] size: 2899682 timestamp: 1721194599446 +- kind: pypi + name: packaging + version: '24.1' + url: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl + sha256: 5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + requires_python: '>=3.8' +- kind: pypi + name: pandas-stubs + version: 2.2.2.240807 + url: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + sha256: 893919ad82be4275f0d07bb47a95d08bae580d3fdea308a7acfcb3f02e76186e + requires_dist: + - numpy>=1.23.5 + - types-pytz>=2022.1.1 + requires_python: '>=3.9' +- kind: pypi + name: parso + version: 0.8.4 + url: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + sha256: a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18 + requires_dist: + - flake8==5.0.4 ; extra == 'qa' + - mypy==0.971 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + - docopt ; extra == 'testing' + - pytest ; extra == 'testing' + requires_python: '>=3.6' +- kind: pypi + name: pathspec + version: 0.12.1 + url: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 
+ requires_python: '>=3.8' +- kind: pypi + name: pexpect + version: 4.9.0 + url: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 + requires_dist: + - ptyprocess>=0.5 - kind: pypi name: platformdirs version: 4.2.2 @@ -868,6 +1196,17 @@ packages: - pytest>=7.4.3 ; extra == 'test' - mypy>=1.8 ; extra == 'type' requires_python: '>=3.8' +- kind: pypi + name: pluggy + version: 1.5.0 + url: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl + sha256: 44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 + requires_dist: + - pre-commit ; extra == 'dev' + - tox ; extra == 'dev' + - pytest ; extra == 'testing' + - pytest-benchmark ; extra == 'testing' + requires_python: '>=3.8' - kind: pypi name: pre-commit version: 3.8.0 @@ -880,13 +1219,95 @@ packages: - pyyaml>=5.1 - virtualenv>=20.10.0 requires_python: '>=3.9' +- kind: pypi + name: prompt-toolkit + version: 3.0.47 + url: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + sha256: 0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10 + requires_dist: + - wcwidth + requires_python: '>=3.7.0' +- kind: pypi + name: ptyprocess + version: 0.7.0 + url: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + sha256: 4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 +- kind: pypi + name: pure-eval + version: 0.2.3 + url: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 + requires_dist: + - pytest ; extra == 'tests' +- kind: pypi + name: pyarrow + version: 17.0.0 + url: https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl + sha256: f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053 + requires_dist: + - numpy>=1.16.6 + - pytest ; extra == 'test' + - hypothesis ; extra == 'test' + - cffi ; extra == 'test' + - pytz ; extra == 'test' + - pandas ; extra == 'test' + requires_python: '>=3.8' +- kind: pypi + name: pyarrow + version: 17.0.0 + url: https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl + sha256: 392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7 + requires_dist: + - numpy>=1.16.6 + - pytest ; extra == 'test' + - hypothesis ; extra == 'test' + - cffi ; extra == 'test' + - pytz ; extra == 'test' + - pandas ; extra == 'test' + requires_python: '>=3.8' +- kind: pypi + name: pyarrow + version: 17.0.0 + url: https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl + sha256: 9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22 + requires_dist: + - numpy>=1.16.6 + - pytest ; extra == 'test' + - hypothesis ; extra == 'test' + - cffi ; extra == 'test' + - pytz ; extra == 'test' + - pandas ; extra == 'test' + requires_python: '>=3.8' +- kind: pypi + name: pyarrow + version: 17.0.0 + url: 
https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl + sha256: b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b + requires_dist: + - numpy>=1.16.6 + - pytest ; extra == 'test' + - hypothesis ; extra == 'test' + - cffi ; extra == 'test' + - pytz ; extra == 'test' + - pandas ; extra == 'test' + requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs version: 10.0.1.9 path: . - sha256: c0cea94bf2145eb3466967cc8adefb3ffa864d34a3a63ea52aaecab16efea076 + sha256: 5c30ac8c8008518b3a446a57a76cfde327f6fc5b7d4ab9db5deea86294d4b3b2 + requires_dist: + - pyarrow>=17 requires_python: '>=3.8,<4' editable: true +- kind: pypi + name: pygments + version: 2.18.0 + url: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl + sha256: b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + requires_dist: + - colorama>=0.4.6 ; extra == 'windows-terminal' + requires_python: '>=3.8' - kind: conda name: python version: 3.12.5 @@ -1097,6 +1518,194 @@ packages: url: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: 8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e requires_python: '>=3.7' +- kind: pypi + name: scipy + version: 1.14.1 + url: https://files.pythonhosted.org/packages/8e/ee/8a26858ca517e9c64f84b4c7734b89bda8e63bec85c3d2f432d225bb1886/scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 + requires_dist: + - numpy<2.3,>=1.23.5 + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-xdist ; extra == 'test' + - asv ; extra == 'test' + - mpmath ; extra == 'test' + - gmpy2 ; extra == 'test' + - threadpoolctl ; extra == 'test' + - scikit-umfpack ; extra == 'test' + - pooch ; extra == 'test' + - hypothesis>=6.30 ; extra == 'test' + - array-api-strict>=2.0 ; extra == 'test' + - cython ; extra == 'test' + - meson ; extra == 'test' + - ninja ; sys_platform != 'emscripten' and extra == 'test' + - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' + - sphinx-design>=0.4.0 ; extra == 'doc' + - matplotlib>=3.5 ; extra == 'doc' + - numpydoc ; extra == 'doc' + - jupytext ; extra == 'doc' + - myst-nb ; extra == 'doc' + - pooch ; extra == 'doc' + - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' + - jupyterlite-pyodide-kernel ; extra == 'doc' + - mypy==1.10.0 ; extra == 'dev' + - typing-extensions ; extra == 'dev' + - types-psutil ; extra == 'dev' + - pycodestyle ; extra == 'dev' + - ruff>=0.0.292 ; extra == 'dev' + - cython-lint>=0.12.2 ; extra == 'dev' + - rich-click ; extra == 'dev' + - doit>=0.36.0 ; extra == 'dev' + - pydevtool ; extra == 'dev' + requires_python: '>=3.10' +- kind: pypi + name: scipy + version: 1.14.1 + url: https://files.pythonhosted.org/packages/aa/7d/43ab67228ef98c6b5dd42ab386eae2d7877036970a0d7e3dd3eb47a0d530/scipy-1.14.1-cp312-cp312-win_amd64.whl + sha256: 2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f + requires_dist: + - numpy<2.3,>=1.23.5 + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-xdist ; extra == 'test' + - asv ; extra == 'test' + - mpmath ; extra == 'test' + - gmpy2 ; 
extra == 'test' + - threadpoolctl ; extra == 'test' + - scikit-umfpack ; extra == 'test' + - pooch ; extra == 'test' + - hypothesis>=6.30 ; extra == 'test' + - array-api-strict>=2.0 ; extra == 'test' + - cython ; extra == 'test' + - meson ; extra == 'test' + - ninja ; sys_platform != 'emscripten' and extra == 'test' + - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' + - sphinx-design>=0.4.0 ; extra == 'doc' + - matplotlib>=3.5 ; extra == 'doc' + - numpydoc ; extra == 'doc' + - jupytext ; extra == 'doc' + - myst-nb ; extra == 'doc' + - pooch ; extra == 'doc' + - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' + - jupyterlite-pyodide-kernel ; extra == 'doc' + - mypy==1.10.0 ; extra == 'dev' + - typing-extensions ; extra == 'dev' + - types-psutil ; extra == 'dev' + - pycodestyle ; extra == 'dev' + - ruff>=0.0.292 ; extra == 'dev' + - cython-lint>=0.12.2 ; extra == 'dev' + - rich-click ; extra == 'dev' + - doit>=0.36.0 ; extra == 'dev' + - pydevtool ; extra == 'dev' + requires_python: '>=3.10' +- kind: pypi + name: scipy + version: 1.14.1 + url: https://files.pythonhosted.org/packages/c0/04/2bdacc8ac6387b15db6faa40295f8bd25eccf33f1f13e68a72dc3c60a99e/scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl + sha256: 631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d + requires_dist: + - numpy<2.3,>=1.23.5 + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-xdist ; extra == 'test' + - asv ; extra == 'test' + - mpmath ; extra == 'test' + - gmpy2 ; extra == 'test' + - threadpoolctl ; extra == 'test' + - scikit-umfpack ; extra == 'test' + - pooch ; extra == 'test' + - hypothesis>=6.30 ; extra == 'test' + - array-api-strict>=2.0 ; extra == 'test' + - cython ; extra == 'test' + - meson ; extra == 'test' + - ninja ; sys_platform != 'emscripten' and extra == 'test' + - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' + - sphinx-design>=0.4.0 ; extra == 'doc' + - matplotlib>=3.5 ; extra == 'doc' + - numpydoc ; extra == 'doc' + - jupytext ; extra == 'doc' + - myst-nb ; extra == 'doc' + - pooch ; extra == 'doc' + - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' + - jupyterlite-pyodide-kernel ; extra == 'doc' + - mypy==1.10.0 ; extra == 'dev' + - typing-extensions ; extra == 'dev' + - types-psutil ; extra == 'dev' + - pycodestyle ; extra == 'dev' + - ruff>=0.0.292 ; extra == 'dev' + - cython-lint>=0.12.2 ; extra == 'dev' + - rich-click ; extra == 'dev' + - doit>=0.36.0 ; extra == 'dev' + - pydevtool ; extra == 'dev' + requires_python: '>=3.10' +- kind: pypi + name: scipy + version: 1.14.1 + url: https://files.pythonhosted.org/packages/c8/53/35b4d41f5fd42f5781dbd0dd6c05d35ba8aa75c84ecddc7d44756cd8da2e/scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl + sha256: af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 + requires_dist: + - numpy<2.3,>=1.23.5 + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-xdist ; extra == 'test' + - asv ; extra == 'test' + - mpmath ; extra == 'test' + - gmpy2 ; extra == 'test' + - threadpoolctl ; extra == 'test' + - scikit-umfpack ; extra == 'test' + - pooch ; extra == 'test' + - hypothesis>=6.30 ; extra == 'test' + - array-api-strict>=2.0 ; extra == 'test' + - cython ; extra == 'test' + - meson ; extra == 'test' + - ninja ; sys_platform != 'emscripten' and extra == 'test' + - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' + - 
sphinx-design>=0.4.0 ; extra == 'doc' + - matplotlib>=3.5 ; extra == 'doc' + - numpydoc ; extra == 'doc' + - jupytext ; extra == 'doc' + - myst-nb ; extra == 'doc' + - pooch ; extra == 'doc' + - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' + - jupyterlite-pyodide-kernel ; extra == 'doc' + - mypy==1.10.0 ; extra == 'dev' + - typing-extensions ; extra == 'dev' + - types-psutil ; extra == 'dev' + - pycodestyle ; extra == 'dev' + - ruff>=0.0.292 ; extra == 'dev' + - cython-lint>=0.12.2 ; extra == 'dev' + - rich-click ; extra == 'dev' + - doit>=0.36.0 ; extra == 'dev' + - pydevtool ; extra == 'dev' + requires_python: '>=3.10' +- kind: pypi + name: six + version: 1.16.0 + url: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + sha256: 8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' +- kind: pypi + name: stack-data + version: 0.6.3 + url: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 + requires_dist: + - executing>=1.2.0 + - asttokens>=2.1.0 + - pure-eval + - pytest ; extra == 'tests' + - typeguard ; extra == 'tests' + - pygments ; extra == 'tests' + - littleutils ; extra == 'tests' + - cython ; extra == 'tests' - kind: conda name: tk version: 8.6.13 @@ -1164,6 +1773,47 @@ packages: purls: [] size: 3318875 timestamp: 1699202167581 +- kind: pypi + name: traitlets + version: 5.14.3 + url: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f + requires_dist: + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx ; extra == 'docs' + - argcomplete>=3.0.3 ; extra == 'test' + - mypy>=1.7.0 ; extra == 'test' + - pre-commit ; extra == 'test' + - pytest-mock ; extra == 'test' + - pytest-mypy-testing ; extra == 'test' + - pytest<8.2,>=7.0 ; extra == 'test' + requires_python: '>=3.8' +- kind: pypi + name: trove-classifiers + version: 2024.7.2 + url: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + sha256: ccc57a33717644df4daca018e7ec3ef57a835c48e96a1e71fc07eb7edac67af6 +- kind: pypi + name: types-cffi + version: 1.16.0.20240331 + url: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl + sha256: a363e5ea54a4eb6a4a105d800685fde596bc318089b025b27dee09849fe41ff0 + requires_dist: + - types-setuptools + requires_python: '>=3.8' +- kind: pypi + name: types-pytz + version: 2024.1.0.20240417 + url: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl + sha256: 8335d443310e2db7b74e007414e74c4f53b67452c0cb0d228ca359ccfba59659 + requires_python: '>=3.8' +- kind: pypi + name: types-setuptools + version: 71.1.0.20240806 + url: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl + sha256: 3bd8dd02039be0bb79ad880d8893b8eefcb022fabbeeb61245c61b20c9ab1ed0 + requires_python: '>=3.8' - kind: pypi name: 
typing-extensions version: 4.12.2 @@ -1280,6 +1930,13 @@ packages: purls: [] size: 17395 timestamp: 1717709043353 +- kind: pypi + name: wcwidth + version: 0.2.13 + url: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl + sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 + requires_dist: + - backports-functools-lru-cache>=1.2.1 ; python_version < '3.2' - kind: conda name: xz version: 5.2.6 diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index bdd73b09522..6f1edbc6d5e 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -1,268 +1,608 @@ -from typing import Any +# ruff: noqa: F401, I001, E402 +__version__: str -from pyarrow._hdfsio import HdfsFile as HdfsFile -from pyarrow._hdfsio import have_libhdfs as have_libhdfs -from pyarrow.ipc import Message as Message -from pyarrow.ipc import MessageReader as MessageReader -from pyarrow.ipc import MetadataVersion as MetadataVersion -from pyarrow.ipc import RecordBatchFileReader as RecordBatchFileReader -from pyarrow.ipc import RecordBatchFileWriter as RecordBatchFileWriter -from pyarrow.ipc import RecordBatchStreamReader as RecordBatchStreamReader -from pyarrow.ipc import RecordBatchStreamWriter as RecordBatchStreamWriter -from pyarrow.ipc import deserialize_pandas as deserialize_pandas -from pyarrow.ipc import serialize_pandas as serialize_pandas -from pyarrow.lib import NA as NA -from pyarrow.lib import Array as Array -from pyarrow.lib import ArrowCancelled as ArrowCancelled -from pyarrow.lib import ArrowCapacityError as ArrowCapacityError -from pyarrow.lib import ArrowException as ArrowException -from pyarrow.lib import ArrowIndexError as ArrowIndexError -from pyarrow.lib import ArrowInvalid as ArrowInvalid -from pyarrow.lib import ArrowIOError as ArrowIOError -from pyarrow.lib import ArrowKeyError as ArrowKeyError -from pyarrow.lib import ArrowMemoryError as ArrowMemoryError -from pyarrow.lib import ArrowNotImplementedError as ArrowNotImplementedError -from pyarrow.lib import ArrowSerializationError as ArrowSerializationError -from pyarrow.lib import ArrowTypeError as ArrowTypeError -from pyarrow.lib import BaseExtensionType as BaseExtensionType -from pyarrow.lib import BinaryArray as BinaryArray -from pyarrow.lib import BinaryScalar as BinaryScalar -from pyarrow.lib import BooleanArray as BooleanArray -from pyarrow.lib import BooleanScalar as BooleanScalar -from pyarrow.lib import Buffer as Buffer -from pyarrow.lib import BufferedInputStream as BufferedInputStream -from pyarrow.lib import BufferedOutputStream as BufferedOutputStream -from pyarrow.lib import BufferOutputStream as BufferOutputStream -from pyarrow.lib import BufferReader as BufferReader -from pyarrow.lib import BuildInfo as BuildInfo -from pyarrow.lib import ChunkedArray as ChunkedArray -from pyarrow.lib import Codec as Codec -from pyarrow.lib import CompressedInputStream as CompressedInputStream -from pyarrow.lib import CompressedOutputStream as CompressedOutputStream -from pyarrow.lib import DataType as DataType -from pyarrow.lib import Date32Array as Date32Array -from pyarrow.lib import Date32Scalar as Date32Scalar -from pyarrow.lib import Date64Array as Date64Array -from pyarrow.lib import Date64Scalar as Date64Scalar -from pyarrow.lib import Decimal128Array as Decimal128Array -from pyarrow.lib import Decimal128Scalar as Decimal128Scalar -from pyarrow.lib import Decimal128Type as Decimal128Type -from pyarrow.lib import 
Decimal256Array as Decimal256Array -from pyarrow.lib import Decimal256Scalar as Decimal256Scalar -from pyarrow.lib import Decimal256Type as Decimal256Type -from pyarrow.lib import DenseUnionType as DenseUnionType -from pyarrow.lib import DeserializationCallbackError as DeserializationCallbackError -from pyarrow.lib import DictionaryArray as DictionaryArray -from pyarrow.lib import DictionaryMemo as DictionaryMemo -from pyarrow.lib import DictionaryScalar as DictionaryScalar -from pyarrow.lib import DictionaryType as DictionaryType -from pyarrow.lib import DoubleScalar as DoubleScalar -from pyarrow.lib import DurationArray as DurationArray -from pyarrow.lib import DurationScalar as DurationScalar -from pyarrow.lib import DurationType as DurationType -from pyarrow.lib import ExtensionArray as ExtensionArray -from pyarrow.lib import ExtensionScalar as ExtensionScalar -from pyarrow.lib import ExtensionType as ExtensionType -from pyarrow.lib import Field as Field -from pyarrow.lib import FixedSizeBinaryArray as FixedSizeBinaryArray -from pyarrow.lib import FixedSizeBinaryScalar as FixedSizeBinaryScalar -from pyarrow.lib import FixedSizeBinaryType as FixedSizeBinaryType -from pyarrow.lib import FixedSizeBufferWriter as FixedSizeBufferWriter -from pyarrow.lib import FixedSizeListArray as FixedSizeListArray -from pyarrow.lib import FixedSizeListScalar as FixedSizeListScalar -from pyarrow.lib import FixedSizeListType as FixedSizeListType -from pyarrow.lib import FloatingPointArray as FloatingPointArray -from pyarrow.lib import FloatScalar as FloatScalar -from pyarrow.lib import HalfFloatScalar as HalfFloatScalar -from pyarrow.lib import Int8Array as Int8Array -from pyarrow.lib import Int8Scalar as Int8Scalar -from pyarrow.lib import Int16Array as Int16Array -from pyarrow.lib import Int16Scalar as Int16Scalar -from pyarrow.lib import Int32Array as Int32Array -from pyarrow.lib import Int32Scalar as Int32Scalar -from pyarrow.lib import Int64Array as Int64Array -from pyarrow.lib import Int64Scalar as Int64Scalar -from pyarrow.lib import IntegerArray as IntegerArray -from pyarrow.lib import KeyValueMetadata as KeyValueMetadata -from pyarrow.lib import LargeBinaryArray as LargeBinaryArray -from pyarrow.lib import LargeBinaryScalar as LargeBinaryScalar -from pyarrow.lib import LargeListArray as LargeListArray -from pyarrow.lib import LargeListScalar as LargeListScalar -from pyarrow.lib import LargeListType as LargeListType -from pyarrow.lib import LargeStringArray as LargeStringArray -from pyarrow.lib import LargeStringScalar as LargeStringScalar -from pyarrow.lib import ListArray as ListArray -from pyarrow.lib import ListScalar as ListScalar -from pyarrow.lib import ListType as ListType -from pyarrow.lib import LoggingMemoryPool as LoggingMemoryPool -from pyarrow.lib import MapArray as MapArray -from pyarrow.lib import MapScalar as MapScalar -from pyarrow.lib import MapType as MapType -from pyarrow.lib import MemoryMappedFile as MemoryMappedFile -from pyarrow.lib import MemoryPool as MemoryPool -from pyarrow.lib import MockOutputStream as MockOutputStream -from pyarrow.lib import MonthDayNano as MonthDayNano -from pyarrow.lib import MonthDayNanoIntervalArray as MonthDayNanoIntervalArray -from pyarrow.lib import MonthDayNanoIntervalScalar as MonthDayNanoIntervalScalar -from pyarrow.lib import NativeFile as NativeFile -from pyarrow.lib import NullArray as NullArray -from pyarrow.lib import NullScalar as NullScalar -from pyarrow.lib import NumericArray as NumericArray -from pyarrow.lib import OSFile as 
OSFile -from pyarrow.lib import ProxyMemoryPool as ProxyMemoryPool -from pyarrow.lib import PyExtensionType as PyExtensionType -from pyarrow.lib import PythonFile as PythonFile -from pyarrow.lib import RecordBatch as RecordBatch -from pyarrow.lib import RecordBatchReader as RecordBatchReader -from pyarrow.lib import ResizableBuffer as ResizableBuffer -from pyarrow.lib import RuntimeInfo as RuntimeInfo -from pyarrow.lib import Scalar as Scalar -from pyarrow.lib import Schema as Schema -from pyarrow.lib import SerializationCallbackError as SerializationCallbackError -from pyarrow.lib import SparseCOOTensor as SparseCOOTensor -from pyarrow.lib import SparseCSCMatrix as SparseCSCMatrix -from pyarrow.lib import SparseCSFTensor as SparseCSFTensor -from pyarrow.lib import SparseCSRMatrix as SparseCSRMatrix -from pyarrow.lib import SparseUnionType as SparseUnionType -from pyarrow.lib import StringArray as StringArray -from pyarrow.lib import StringScalar as StringScalar -from pyarrow.lib import StructArray as StructArray -from pyarrow.lib import StructScalar as StructScalar -from pyarrow.lib import StructType as StructType -from pyarrow.lib import Table as Table -from pyarrow.lib import TableGroupBy as TableGroupBy -from pyarrow.lib import Tensor as Tensor -from pyarrow.lib import Time32Array as Time32Array -from pyarrow.lib import Time32Scalar as Time32Scalar -from pyarrow.lib import Time32Type as Time32Type -from pyarrow.lib import Time64Array as Time64Array -from pyarrow.lib import Time64Scalar as Time64Scalar -from pyarrow.lib import Time64Type as Time64Type -from pyarrow.lib import TimestampArray as TimestampArray -from pyarrow.lib import TimestampScalar as TimestampScalar -from pyarrow.lib import TimestampType as TimestampType -from pyarrow.lib import TransformInputStream as TransformInputStream -from pyarrow.lib import UInt8Array as UInt8Array -from pyarrow.lib import UInt8Scalar as UInt8Scalar -from pyarrow.lib import UInt16Array as UInt16Array -from pyarrow.lib import UInt16Scalar as UInt16Scalar -from pyarrow.lib import UInt32Array as UInt32Array -from pyarrow.lib import UInt32Scalar as UInt32Scalar -from pyarrow.lib import UInt64Array as UInt64Array -from pyarrow.lib import UInt64Scalar as UInt64Scalar -from pyarrow.lib import UnionArray as UnionArray -from pyarrow.lib import UnionScalar as UnionScalar -from pyarrow.lib import UnionType as UnionType -from pyarrow.lib import UnknownExtensionType as UnknownExtensionType -from pyarrow.lib import VersionInfo as VersionInfo -from pyarrow.lib import allocate_buffer as allocate_buffer -from pyarrow.lib import array as array -from pyarrow.lib import binary as binary -from pyarrow.lib import bool_ as bool_ -from pyarrow.lib import chunked_array as chunked_array -from pyarrow.lib import compress as compress -from pyarrow.lib import concat_arrays as concat_arrays -from pyarrow.lib import concat_tables as concat_tables -from pyarrow.lib import cpp_build_info as cpp_build_info -from pyarrow.lib import cpp_version as cpp_version -from pyarrow.lib import cpp_version_info as cpp_version_info -from pyarrow.lib import cpu_count as cpu_count -from pyarrow.lib import create_memory_map as create_memory_map -from pyarrow.lib import date32 as date32 -from pyarrow.lib import date64 as date64 -from pyarrow.lib import decimal128 as decimal128 -from pyarrow.lib import decimal256 as decimal256 -from pyarrow.lib import decompress as decompress -from pyarrow.lib import default_memory_pool as default_memory_pool -from pyarrow.lib import dense_union as dense_union 
-from pyarrow.lib import deserialize as deserialize -from pyarrow.lib import deserialize_components as deserialize_components -from pyarrow.lib import deserialize_from as deserialize_from -from pyarrow.lib import dictionary as dictionary -from pyarrow.lib import duration as duration -from pyarrow.lib import enable_signal_handlers as enable_signal_handlers -from pyarrow.lib import field as field -from pyarrow.lib import float16 as float16 -from pyarrow.lib import float32 as float32 -from pyarrow.lib import float64 as float64 -from pyarrow.lib import foreign_buffer as foreign_buffer -from pyarrow.lib import from_numpy_dtype as from_numpy_dtype -from pyarrow.lib import infer_type as infer_type -from pyarrow.lib import input_stream as input_stream -from pyarrow.lib import int8 as int8 -from pyarrow.lib import int16 as int16 -from pyarrow.lib import int32 as int32 -from pyarrow.lib import int64 as int64 -from pyarrow.lib import io_thread_count as io_thread_count -from pyarrow.lib import jemalloc_memory_pool as jemalloc_memory_pool -from pyarrow.lib import jemalloc_set_decay_ms as jemalloc_set_decay_ms -from pyarrow.lib import large_binary as large_binary -from pyarrow.lib import large_list as large_list -from pyarrow.lib import large_string as large_string -from pyarrow.lib import large_utf8 as large_utf8 -from pyarrow.lib import list_ as list_ -from pyarrow.lib import log_memory_allocations as log_memory_allocations -from pyarrow.lib import logging_memory_pool as logging_memory_pool -from pyarrow.lib import map_ as map_ -from pyarrow.lib import memory_map as memory_map -from pyarrow.lib import mimalloc_memory_pool as mimalloc_memory_pool -from pyarrow.lib import month_day_nano_interval as month_day_nano_interval -from pyarrow.lib import null as null -from pyarrow.lib import nulls as nulls -from pyarrow.lib import output_stream as output_stream -from pyarrow.lib import proxy_memory_pool as proxy_memory_pool -from pyarrow.lib import py_buffer as py_buffer -from pyarrow.lib import read_serialized as read_serialized -from pyarrow.lib import record_batch as record_batch -from pyarrow.lib import register_extension_type as register_extension_type -from pyarrow.lib import repeat as repeat -from pyarrow.lib import runtime_info as runtime_info -from pyarrow.lib import scalar as scalar -from pyarrow.lib import schema as schema -from pyarrow.lib import serialize as serialize -from pyarrow.lib import serialize_to as serialize_to -from pyarrow.lib import set_cpu_count as set_cpu_count -from pyarrow.lib import set_io_thread_count as set_io_thread_count -from pyarrow.lib import set_memory_pool as set_memory_pool -from pyarrow.lib import sparse_union as sparse_union -from pyarrow.lib import string as string -from pyarrow.lib import struct as struct -from pyarrow.lib import supported_memory_backends as supported_memory_backends -from pyarrow.lib import system_memory_pool as system_memory_pool -from pyarrow.lib import table as table -from pyarrow.lib import time32 as time32 -from pyarrow.lib import time64 as time64 -from pyarrow.lib import timestamp as timestamp -from pyarrow.lib import total_allocated_bytes as total_allocated_bytes -from pyarrow.lib import transcoding_input_stream as transcoding_input_stream -from pyarrow.lib import type_for_alias as type_for_alias -from pyarrow.lib import uint8 as uint8 -from pyarrow.lib import uint16 as uint16 -from pyarrow.lib import uint32 as uint32 -from pyarrow.lib import uint64 as uint64 -from pyarrow.lib import unify_schemas as unify_schemas -from pyarrow.lib import 
union as union -from pyarrow.lib import unregister_extension_type as unregister_extension_type -from pyarrow.lib import utf8 as utf8 -from pyarrow.serialization import default_serialization_context as default_serialization_context -from pyarrow.serialization import ( - register_default_serialization_handlers as register_default_serialization_handlers, -) -from pyarrow.serialization import ( - register_torch_serialization_handlers as register_torch_serialization_handlers, -) +import pyarrow.lib as _lib -from . import filesystem as filesystem -from . import hdfs as hdfs -from . import ipc as ipc -from . import serialization as serialization -from . import types as types -from . import util as util +_gc_enabled: bool + +from pyarrow.lib import ( + BuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) def show_versions() -> None: ... def show_info() -> None: ... -def __getattr__(name: str) -> Any: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + fixed_shape_tensor, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + PyExtensionType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + 
Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, +) + +# Buffers, allocation +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) def get_include() -> str: ... -def get_libraries() -> tuple[str, str]: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... def create_library_symlinks() -> None: ... def get_library_dirs() -> list[str]: ... 
+ +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "fixed_shape_tensor", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + 
"RunEndEncodedScalar", + "ExtensionScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "get_library_dirs", +] diff --git a/pyarrow-stubs/py.typed b/pyarrow-stubs/__lib_pxi/__init__.pyi similarity index 100% rename from pyarrow-stubs/py.typed rename to pyarrow-stubs/__lib_pxi/__init__.pyi diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi new file mode 100644 index 00000000000..16aed96cd66 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -0,0 +1,1667 @@ +# mypy: disable-error-code="overload-overlap" + +import datetime as dt + +from collections.abc import Callable +from decimal import Decimal +from typing import ( + Any, + Generic, + Iterable, + Iterator, + Literal, + Self, + TypeAlias, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from pandas.core.dtypes.base import ExtensionDtype +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, +) +from pyarrow.lib import Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable + +from . import scalar, types +from .device import DeviceAllocationType +from .scalar import Scalar +from .types import ( + DataType, + Field, + MapType, + _AsPyType, + _BasicDataType, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, + _ValueT, +) + +_T = TypeVar("_T") + +NullableIterable: TypeAlias = Iterable[_T | None] + +@overload # type: ignore[overload-overlap] +def array( + values: NullableIterable[bool], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... 
+@overload +def array( + values: NullableIterable[int], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: NullableIterable[float], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: NullableIterable[Decimal], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Decimal128Array: ... +@overload +def array( + values: NullableIterable[dict[str, Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def array( + values: NullableIterable[dt.date], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... +@overload +def array( + values: NullableIterable[dt.time], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array: ... +@overload +def array( + values: NullableIterable[dt.timedelta], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray: ... +@overload +def array( + values: NullableIterable[MonthDayNano], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: NullableIterable[str], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: NullableIterable[bytes], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: NullableIterable[list], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ListArray: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[Scalar[_DataTypeT]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.NullScalar]: ... 
+@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.BooleanScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Int8Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Int16Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Int32Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Int64Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.UInt8Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.UInt16Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.UInt32Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.UInt64Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.HalfFloatScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.FloatScalar]: ... 
+@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.DoubleScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.StringScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.BinaryScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.LargeStringScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.LargeBinaryScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.BinaryViewScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.StringViewScalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Date32Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Date64Scalar]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Time32Scalar[Literal["ms"]]]: ... 
+@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ns]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.DurationScalar[Literal["ns"]]]: ... 
+@overload +def array( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def asarray(values: NullableIterable[bool]) -> BooleanArray: ... +@overload +def asarray(values: NullableIterable[int]) -> Int64Array: ... +@overload +def asarray(values: NullableIterable[float]) -> DoubleArray: ... +@overload +def asarray(values: NullableIterable[Decimal]) -> Decimal128Array: ... +@overload +def asarray(values: NullableIterable[dict[str, Any]]) -> StructArray: ... +@overload +def asarray(values: NullableIterable[dt.date]) -> Date32Array: ... +@overload +def asarray(values: NullableIterable[dt.time]) -> Time64Array: ... +@overload +def asarray(values: NullableIterable[dt.timedelta]) -> DurationArray: ... +@overload +def asarray(values: NullableIterable[MonthDayNano]) -> MonthDayNanoIntervalArray: ... +@overload +def asarray(values: NullableIterable[list]) -> ListArray: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, +) -> Array[Scalar[_DataTypeT]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["null"] +) -> Array[scalar.NullScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"], +) -> Array[scalar.BooleanScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i1", "int8"] +) -> Array[scalar.Int8Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i2", "int16"] +) -> Array[scalar.Int16Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i4", "int32"] +) -> Array[scalar.Int32Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i8", "int64"] +) -> Array[scalar.Int64Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u1", "uint8"] +) -> Array[scalar.UInt8Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u2", "uint16"] +) -> Array[scalar.UInt16Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u4", "uint32"] +) -> Array[scalar.UInt32Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u8", "uint64"] +) -> Array[scalar.UInt64Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"], +) -> Array[scalar.HalfFloatScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"], +) -> Array[scalar.FloatScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"], +) -> Array[scalar.DoubleScalar]: ... 
+@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"], +) -> Array[scalar.StringScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary"] +) -> Array[scalar.BinaryScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"], +) -> Array[scalar.LargeStringScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_binary"] +) -> Array[scalar.LargeBinaryScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary_view"] +) -> Array[scalar.BinaryViewScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string_view"] +) -> Array[scalar.StringViewScalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"], +) -> Array[scalar.Date32Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"], +) -> Array[scalar.Date64Scalar]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time32[s]"] +) -> Array[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time32[ms]"] +) -> Array[scalar.Time32Scalar[Literal["ms"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time64[us]"] +) -> Array[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time64[ns]"] +) -> Array[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[s]"] +) -> Array[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[ms]"] +) -> Array[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[us]"] +) -> Array[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[ns]"] +) -> Array[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[s]"] +) -> Array[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[ms]"] +) -> Array[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[us]"] +) -> Array[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[ns]"] +) -> Array[scalar.DurationScalar[Literal["ns"]]]: ... 
+@overload +def asarray( + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"], +) -> Array[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... +@overload +def nulls( + size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def nulls( + size: int, type: types.BoolType, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... +@overload +def nulls( + size: int, types: types.Int16Type, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def nulls( + size: int, types: types.Int32Type, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def nulls( + size: int, types: types.Int64Type, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def nulls( + size: int, types: types.Uint8Type, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def nulls( + size: int, types: types.Uint16Type, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def nulls( + size: int, types: types.Uint32Type, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def nulls( + size: int, types: types.Uint64Type, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... +@overload +def nulls( + size: int, types: types.Float16Type, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... +@overload +def nulls( + size: int, types: types.Float32Type, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def nulls( + size: int, types: types.Float64Type, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def nulls( + size: int, types: types.Decimal128Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, types: types.Decimal256Type, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def nulls( + size: int, types: types.Date32Type, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def nulls( + size: int, types: types.Date64Type, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def nulls( + size: int, types: types.Time32Type, memory_pool: MemoryPool | None = None +) -> Time32Array: ... +@overload +def nulls( + size: int, types: types.Time64Type, memory_pool: MemoryPool | None = None +) -> Time64Array: ... +@overload +def nulls( + size: int, types: types.TimestampType, memory_pool: MemoryPool | None = None +) -> TimestampArray: ... +@overload +def nulls( + size: int, types: types.DurationType, memory_pool: MemoryPool | None = None +) -> DurationArray: ... +@overload +def nulls( + size: int, types: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None +) -> MonthDayNanoIntervalArray: ... +@overload +def nulls( + size: int, + types: types.BinaryType, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def nulls( + size: int, + types: types.LargeBinaryType, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def nulls( + size: int, + types: types.FixedSizeBinaryType, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def nulls( + size: int, + types: types.StringType, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... 
+@overload +def nulls( + size: int, + types: types.LargeStringType, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def nulls( + size: int, + types: types.BinaryViewType, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def nulls( + size: int, + types: types.StringViewType, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def nulls( + size: int, + types: types.ListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +@overload +def nulls( + size: int, + types: types.FixedSizeListType[_DataTypeT, _Size], + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... +@overload +def nulls( + size: int, + types: types.LargeListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + types: types.ListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + types: types.LargeListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + types: types.StructType, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def nulls( + size: int, + types: types.MapType[_MapKeyT, _MapItemT], + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def nulls( + size: int, + types: types.DictionaryType[_IndexT, _ValueT], + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _ValueT]: ... +@overload +def nulls( + size: int, + types: types.RunEndEncodedType[_RunEndType, _ValueT], + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _ValueT]: ... +@overload +def nulls( + size: int, + types: types.UnionType, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def nulls( + size: int, + types: types.FixedShapeTensorType, + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray: ... +@overload +def nulls( + size: int, + types: types.ExtensionType, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray: ... +@overload +def repeat( + value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def repeat( # type: ignore[overload-overlap] + value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def repeat( + value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int8Array: ... +@overload +def repeat( + value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def repeat( + value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def repeat( + value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def repeat( + value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def repeat( + value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def repeat( + value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... 
+@overload +def repeat( + value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... +@overload +def repeat( + value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... +@overload +def repeat( + value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def repeat( + value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def repeat( + value: Decimal | scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def repeat( + value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def repeat( + value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def repeat( + value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def repeat( + value: scalar.Time32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Time32Array: ... +@overload +def repeat( + value: dt.time | scalar.Time64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Time64Array: ... +@overload +def repeat( + value: scalar.TimestampScalar, size: int, memory_pool: MemoryPool | None = None +) -> TimestampArray: ... +@overload +def repeat( + value: dt.timedelta | scalar.DurationScalar, size: int, memory_pool: MemoryPool | None = None +) -> DurationArray: ... +@overload +def repeat( # type: ignore[overload-overlap] + value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def repeat( + value: bytes | scalar.BinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def repeat( + value: scalar.LargeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def repeat( + value: scalar.FixedSizeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def repeat( + value: str | scalar.StringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def repeat( + value: scalar.LargeStringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def repeat( + value: scalar.BinaryViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def repeat( + value: scalar.StringViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def repeat( + value: list | tuple | scalar.ListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +@overload +def repeat( + value: scalar.FixedSizeListScalar[_DataTypeT, _Size], + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... +@overload +def repeat( + value: scalar.LargeListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.ListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... 
+@overload +def repeat( + value: scalar.LargeListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: dict[str, Any] | scalar.StructScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def repeat( + value: scalar.MapScalar[_MapKeyT, _MapItemT], + size: int, + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def repeat( + value: scalar.DictionaryScalar[_IndexT, _ValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _ValueT]: ... +@overload +def repeat( + value: scalar.RunEndEncodedScalar[_RunEndType, _ValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _ValueT]: ... +@overload +def repeat( + value: scalar.UnionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def repeat( + value: scalar.FixedShapeTensorScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray: ... +@overload +def repeat( + value: scalar.ExtensionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray: ... +def infer_type(values: Iterable, mask: Mask, from_pandas: bool = False) -> DataType: ... + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: ... + +_CastAs = TypeVar("_CastAs", bound=DataType) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + +class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): + def diff(self, other: Self) -> str: ... + def cast( + self, + target_type: _CastAs, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: ... + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + def sum(self, **kwargs) -> _ScalarT: ... + def unique(self) -> Self: ... + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar]: ... + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: list[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... 
+ def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_ScalarT]: ... + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + format = to_string + def equals(self, other: Self) -> bool: ... + def __len__(self) -> int: ... + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + def is_nan(self) -> BooleanArray: ... + def is_valid(self) -> BooleanArray: ... + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType], Any]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType], Any]]: ... + @overload + def __getitem__(self, key: int) -> _ScalarT: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + def take(self, indices: Indices) -> Self: ... + def drop_null(self) -> Self: ... + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: ... + @overload + def index( + self, + value: _ScalarT, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + @overload + def index( + self: Array[Scalar[_BasicDataType[_AsPyType], Any]], + value: _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: ... + def to_pylist( + self: Array[Scalar[_BasicDataType[_AsPyType], Any]], + ) -> list[_AsPyType | None]: ... + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: ... + @property + def offset(self) -> int: ... + def buffers(self) -> list[Buffer | None]: ... + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: ... + def __dlpack_device__(self) -> tuple[int, int]: ... + @property + def device_type(self) -> DeviceAllocationType: ... + @property + def is_cpu(self) -> bool: ... + +class NullArray(Array[scalar.NullScalar]): ... + +class BooleanArray(Array[scalar.BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + +class NumericArray(Array[_ScalarT]): ... +class IntegerArray(NumericArray[_ScalarT]): ... +class FloatingPointArray(NumericArray[_ScalarT]): ... +class Int8Array(IntegerArray[scalar.Int8Scalar]): ... +class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... +class Int16Array(IntegerArray[scalar.Int16Scalar]): ... +class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... 
+class Int32Array(IntegerArray[scalar.Int32Scalar]): ... +class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... +class Int64Array(IntegerArray[scalar.Int64Scalar]): ... +class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... +class Date32Array(NumericArray[scalar.Date32Scalar]): ... +class Date64Array(NumericArray[scalar.Date64Scalar]): ... +class TimestampArray(NumericArray[scalar.TimestampScalar]): ... +class Time32Array(NumericArray[scalar.Time32Scalar]): ... +class Time64Array(NumericArray[scalar.Time64Scalar]): ... +class DurationArray(NumericArray[scalar.DurationScalar]): ... +class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... +class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... +class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... +class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... +class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... +class Decimal128Array(FixedSizeBinaryArray): ... +class Decimal256Array(FixedSizeBinaryArray): ... + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + def value_parent_indices(self) -> Int64Array: ... + def value_lengths(self) -> Int32Array: ... + +class ListArray(BaseListArray[_ScalarT]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @property + def values(self) -> Array: ... + @property + def offsets(self) -> Int32Array: ... + +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + @property + def values(self) -> Array: ... + @property + def offsets(self) -> Int64Array: ... + +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @property + def values(self) -> Array: ... + @property + def offsets(self) -> Int32Array: ... + @property + def sizes(self) -> Int32Array: ... + +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... 
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + @property + def values(self) -> Array: ... + @property + def offsets(self) -> Int64Array: ... + @property + def sizes(self) -> Int64Array: ... + +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, None]: ... + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + limit_size: _Size, + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size]: ... + @property + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: ... + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + +class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): + @overload # type: ignore[override] + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + keys: Array[Scalar[_MapKeyT]], + items: Array[Scalar[_MapItemT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: MapType[_MapKeyT, _MapItemT], + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @property + def keys(self) -> Array: ... + @property + def items(self) -> Array: ... + +class UnionArray(Array[scalar.UnionScalar]): + def child(self, pos: int) -> Field: ... + def field(self, pos: int) -> Array: ... + @property + def type_codes(self) -> Int8Array: ... + @property + def offsets(self) -> Int32Array: ... + @staticmethod + def from_dense( + types: Int8Array, + value_offsets: Int32Array, + children: list[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: ... + @staticmethod + def from_sparse( + types: Int8Array, + children: list[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: ... + +class StringArray(Array[scalar.StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + +class LargeStringArray(Array[scalar.LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + +class StringViewArray(Array[scalar.StringViewScalar]): ... + +class BinaryArray(Array[scalar.BinaryScalar]): + @property + def total_values_length(self) -> int: ... + +class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): + @property + def total_values_length(self) -> int: ... + +class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... 
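A minimal usage sketch (reviewer illustration, not part of the patch) for the `from_arrays` constructors typed in the list/map array classes above. It assumes pyarrow is importable next to these stubs; the comments name the parameterized stub types the overloads are meant to infer.

import pyarrow as pa

# ListArray.from_arrays(offsets, values): int32 offsets plus a values array;
# the stub above types the result as ListArray[ListScalar[<value type>]].
offsets = pa.array([0, 2, 4], type=pa.int32())
values = pa.array([1, 2, 3, 4], type=pa.int64())
lst = pa.ListArray.from_arrays(offsets, values)
print(lst.to_pylist())  # [[1, 2], [3, 4]]

# MapArray.from_arrays(offsets, keys, items) pairs keys with items per slot.
m = pa.MapArray.from_arrays([0, 1, 2], ["k1", "k2"], [10, 20])
print(m.to_pylist())  # [[('k1', 10)], [('k2', 20)]]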
+ +class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _ValueT]]): + @staticmethod + def from_buffers( # type: ignore[override] + type: _ValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _ValueT]: ... + @staticmethod + def from_arrays( + indices: Indices, + dictionary: Array | np.ndarray | pd.Series, + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: ... + +class StructArray(Array[scalar.StructScalar]): + def field(self, index: int | str) -> Array: ... + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + @staticmethod + def from_arrays( + arrays: Iterable[Array], + names: list[str] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + ) -> StructArray: ... + def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: ... + +class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _ValueT]]): + @overload + @staticmethod + def from_arrays( + run_ends: Int16Array, + values: Array, + type: _ValueT | None = None, + ) -> RunEndEncodedArray[types.Int16Type, _ValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int32Array, + values: Array, + type: _ValueT | None = None, + ) -> RunEndEncodedArray[types.Int32Type, _ValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int64Array, + values: Array, + type: _ValueT | None = None, + ) -> RunEndEncodedArray[types.Int64Type, _ValueT]: ... + @staticmethod + def from_buffers( # type: ignore[override] + type: _ValueT, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | None = None, + ) -> RunEndEncodedArray[Any, _ValueT]: ... + @property + def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: ... + @property + def values(self) -> Array[scalar.Scalar[_ValueT]]: ... + def find_physical_offset(self) -> int: ... + def find_physical_length(self) -> int: ... + +_ArrayT = TypeVar("_ArrayT", bound=Array) + +class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + @staticmethod + def from_storage( + typ: types.BaseExtensionType, storage: _ArrayT + ) -> ExtensionArray[_ArrayT]: ... + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + def to_numpy_ndarray(self) -> np.ndarray: ... + def to_tensor(self) -> Tensor: ... + @staticmethod + def from_numpy_ndarray(obj: np.ndarray) -> FixedShapeTensorArray: ... + +def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: ... +def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: ... 
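Before the module's __all__, a short sketch (illustration only, assuming pyarrow is installed alongside these stubs) of how the array/nulls/repeat overloads declared above are expected to narrow ordinary call sites; the classes named in the comments are the return types the overloads declare.

import datetime as dt

import pyarrow as pa

ints = pa.array([1, 2, None])               # int overload -> Int64Array
text = pa.array(["a", "b"], type="string")  # Literal["string"] -> Array[StringScalar]
dates = pa.array([dt.date(2024, 1, 1)])     # dt.date overload -> Date32Array

empty = pa.nulls(3, pa.int32())             # Int32Type overload -> Int32Array
many = pa.repeat("x", 4)                    # str overload -> StringArray

for s in ints:                              # __iter__ yields the array's scalar type
    print(s.as_py())                        # 1, 2, None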
+ +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", +] diff --git a/pyarrow-stubs/__lib_pxi/benchmark.pyi b/pyarrow-stubs/__lib_pxi/benchmark.pyi new file mode 100644 index 00000000000..66981bf0f51 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/benchmark.pyi @@ -0,0 +1 @@ +def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/pyarrow-stubs/__lib_pxi/builder.pyi b/pyarrow-stubs/__lib_pxi/builder.pyi new file mode 100644 index 00000000000..27a0a954dcc --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/builder.pyi @@ -0,0 +1,25 @@ +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + +class StringBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): ... + def append_values(self, values: Iterable[str | bytes | None]): ... + def finish(self) -> StringArray: ... + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +class StringViewBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): ... + def append_values(self, values: Iterable[str | bytes | None]): ... + def finish(self) -> StringViewArray: ... + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/pyarrow-stubs/__lib_pxi/compat.pyi b/pyarrow-stubs/__lib_pxi/compat.pyi new file mode 100644 index 00000000000..ae667be453e --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/compat.pyi @@ -0,0 +1,5 @@ +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... 
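A small usage sketch (illustration, not part of the patch) for the builder.pyi stubs above, assuming the running pyarrow build exposes pyarrow.lib.StringBuilder (StringViewBuilder only exists in newer Arrow releases).

from pyarrow.lib import StringBuilder

builder = StringBuilder()
builder.append("foo")
builder.append(None)                     # nulls are accepted and counted
builder.append_values(["bar", "baz"])
print(len(builder), builder.null_count)  # 4 1

arr = builder.finish()                   # typed as StringArray by the stub
print(arr.to_pylist())                   # ['foo', None, 'bar', 'baz']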
+ +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/pyarrow-stubs/__lib_pxi/config.pyi b/pyarrow-stubs/__lib_pxi/config.pyi new file mode 100644 index 00000000000..166e10c9734 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/config.pyi @@ -0,0 +1,41 @@ +from typing import NamedTuple + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/pyarrow-stubs/__lib_pxi/device.pyi b/pyarrow-stubs/__lib_pxi/device.pyi new file mode 100644 index 00000000000..ed999541a49 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/device.pyi @@ -0,0 +1,39 @@ +import enum + +from pyarrow.lib import _Weakrefable + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + +class Device(_Weakrefable): + @property + def type_name(self) -> str: ... + @property + def device_id(self) -> int: ... + @property + def is_cpu(self) -> bool: ... + @property + def device_type(self) -> DeviceAllocationType: ... + +class MemoryManager(_Weakrefable): + @property + def device(self) -> Device: ... + @property + def is_cpu(self) -> bool: ... + +def default_cpu_memory_manager() -> MemoryManager: ... + +__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/pyarrow-stubs/__lib_pxi/error.pyi b/pyarrow-stubs/__lib_pxi/error.pyi new file mode 100644 index 00000000000..92ec0e3f0bb --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/error.pyi @@ -0,0 +1,48 @@ +from typing import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
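For reference (illustration only): the module-level values the config.pyi stubs above describe are plain NamedTuples on an installed pyarrow, and the field names below are the ones declared in VersionInfo, BuildInfo, and RuntimeInfo.

import pyarrow as pa

print(pa.cpp_version)                  # e.g. "16.1.0" (build dependent)
print(pa.cpp_version_info.major)       # VersionInfo.major
print(pa.cpp_build_info.compiler_id)   # BuildInfo.compiler_id
print(pa.runtime_info().simd_level)    # RuntimeInfo.simd_level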
+ +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi new file mode 100644 index 00000000000..f2d483a41ee --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -0,0 +1,371 @@ +from collections.abc import Callable +from io import IOBase +from os import PathLike +from typing import Any, Literal, Self, SupportsIndex, TypeAlias, overload + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from .types import KeyValueMetadata + +def have_libhdfs() -> bool: ... +def io_thread_count() -> int: ... +def set_io_thread_count(count: int) -> None: ... + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: ... + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + @property + def closed(self) -> bool: ... + def close(self) -> None: ... + def size(self) -> int: ... + def metadata(self) -> KeyValueMetadata: ... + def tell(self) -> int: ... + def seek(self, position: int, whence: int = 0) -> int: ... + def flush(self) -> None: ... + def write(self, data: bytes | SupportPyBuffer) -> int: ... + def read( + self, + ) -> bytes: ... + def get_stream(self, file_offset: int, nbytes: int) -> Self: ... + def read_at(self) -> bytes: ... + def read1(self) -> bytes: ... + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: ... + def readline(self, size: int | None = None) -> bytes: ... + def readlines(self, hint: int | None = None) -> list[bytes]: ... + def __iter__(self) -> Self: ... + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: ... + def truncate(self) -> None: ... + def writelines(self, lines: list[bytes]): ... + def download(self, stream_or_path: str | PathLike, buffer_size: int | None = None) -> None: ... + def upload(self, stream: str | PathLike, buffer_size: int | None) -> None: ... + +# ---------------------------------------------------------------------- +# Python file-like objects + +class PythonFile(NativeFile): + def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... + def truncate(self, pos: int | None = None) -> None: ... + +class MemoryMappedFile(NativeFile): + @classmethod + def create(cls, path: str, size: int) -> Self: ... + def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + def resize(self, new_size: int) -> None: ... + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: ... + +create_memory_map = MemoryMappedFile.create + +class OSFile(NativeFile): + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"], + memory_pool: MemoryPool | None = None, + ) -> None: ... + +class FixedSizeBufferWriter(NativeFile): + def __init__(self, buffer: Buffer) -> None: ... 
+ def set_memcopy_threads(self, num_threads: int) -> None: ... + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + def set_memcopy_threshold(self, threshold: int) -> None: ... + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + def __len__(self) -> int: ... + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: ... + @property + def address(self) -> int: ... + def hex(self) -> bytes: ... + @property + def is_mutable(self) -> bool: ... + @property + def is_cpu(self) -> bool: ... + @property + def device(self) -> Device: ... + @property + def memory_manager(self) -> MemoryManager: ... + @property + def device_type(self) -> DeviceAllocationType: ... + @property + def parent(self) -> Buffer | None: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> int: ... + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + def equals(self, other: Self) -> bool: ... + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: ... + +class ResizableBuffer(Buffer): + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... + +@overload +def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[False] +) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[True] +) -> ResizableBuffer: ... + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: ... + +class MockOutputStream(NativeFile): ... + +class BufferReader(NativeFile): + def __init__(self, obj) -> None: ... + +class CompressedInputStream(NativeFile): + def __init__( + self, + stream: str | PathLike | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class CompressedOutputStream(NativeFile): + def __init__( + self, + stream: str | PathLike | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class BufferedInputStream(NativeFile): + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: ... + +class BufferedOutputStream(NativeFile): + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: ... + +class TransformInputStream(NativeFile): + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: ... +def py_buffer(obj: SupportPyBuffer) -> Buffer: ... +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... 
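A hedged sketch of how the NativeFile and Buffer stubs above are exercised at runtime (not part of the patch; it relies only on documented upstream pyarrow calls):

import pyarrow as pa

buf = pa.py_buffer(b"hello world")        # Buffer wrapping an existing bytes object
assert buf.size == 11 and buf.is_cpu

sink = pa.BufferOutputStream()            # writable NativeFile backed by memory
sink.write(b"some bytes")
data = sink.getvalue()                    # -> Buffer

reader = pa.BufferReader(data)            # readable NativeFile over that Buffer
assert reader.read() == b"some bytes"

resizable = pa.allocate_buffer(64, None, True)  # third overload -> ResizableBuffer
resizable.resize(128)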
+ +# --------------------------------------------------------------------- + +class CacheOptions(_Weakrefable): + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... + @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: ... + +class Codec(_Weakrefable): + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + @classmethod + def detect(cls, path: str | PathLike) -> Self: ... + @staticmethod + def is_available(compression: Compression) -> bool: ... + @staticmethod + def supports_compression_level(compression: Compression) -> int: ... + @staticmethod + def default_compression_level(compression: Compression) -> int: ... + @staticmethod + def minimum_compression_level(compression: Compression) -> int: ... + @staticmethod + def maximum_compression_level(compression: Compression) -> int: ... + @property + def name(self) -> Compression: ... + @property + def compression_level(self) -> int: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... 
+@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def input_stream( + source: str | PathLike | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> BufferReader: ... +def output_stream( + source: str | PathLike | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> NativeFile: ... + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi new file mode 100644 index 00000000000..e1e7a7688df --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -0,0 +1,194 @@ +import enum + +from io import IOBase +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple, Self + +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile +from .types import DictionaryMemo, KeyValueMetadata + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + +class WriteStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class ReadStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class IpcReadOptions(_Weakrefable): + ensure_native_endian: bool + use_threads: bool + include_fields: list + def __init__( + self, *, ensure_native_endian: bool = True, use_threads: bool = True, include_fields: list + ) -> None: ... + +class IpcWriteOptions(_Weakrefable): + metadata_version: MetadataVersion + allow_64bit: bool + use_legacy_format: bool + compression: Codec | Literal["lz4", "zstd"] | None + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + +class Message(_Weakrefable): + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... 
+ def serialize_to( + self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None + ): ... + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: ... + +class MessageReader(_Weakrefable): + @classmethod + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: ... + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: ... + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + +class _CRecordBatchWriter(_Weakrefable): + def write(self, table_or_batch: Table | RecordBatch): ... + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): ... + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... + def close(self) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: ... + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + def __dealloc__(self) -> None: ... + def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: ... + +class RecordBatchReader(_Weakrefable): + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: ... + __next__ = read_next_batch + @property + def schema(self) -> Schema: ... + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: ... + def read_all(self) -> Table: ... + read_pandas = _ReadPandasMixin.read_pandas + def close(self) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: ... + def _export_to_c(self, out_ptr: int) -> None: ... + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + def __arrow_c_stream__(self, requested_schema=None): ... + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + @classmethod + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: ... + @classmethod + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: ... + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: ... + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... + +class RecordBatchWithMetadata(NamedTuple): + batch: RecordBatch + custom_metadata: KeyValueMetadata + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: ... + def get_batch(self, i: int) -> RecordBatch: ... + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... + def read_all(self) -> Table: ... + read_pandas = _ReadPandasMixin.read_pandas + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> ReadStats: ... + +def get_tensor_size(tensor: Tensor) -> int: ... +def get_record_batch_size(batch: RecordBatch) -> int: ... +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... +def read_tensor(source: NativeFile) -> Tensor: ... +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... 
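To show how the writer and reader stubs above fit together, a small IPC round trip (illustrative only, not part of the patch; it uses the public pyarrow.ipc helpers rather than the private classes directly):

import pyarrow as pa
import pyarrow.ipc

batch = pa.record_batch({"x": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:   # a stream writer as typed above
    writer.write_batch(batch)
    print(writer.stats)                                  # WriteStats NamedTuple

reader = pa.ipc.open_stream(sink.getvalue())             # a RecordBatchReader
table = reader.read_all()                                # -> Table
assert table.num_rows == 3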
+def read_schema( + obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None +) -> Schema: ... +def read_record_batch( + obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None +) -> RecordBatch: ... + +__all__ = [ + "MetadataVersion", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/pyarrow-stubs/__lib_pxi/memory.pyi b/pyarrow-stubs/__lib_pxi/memory.pyi new file mode 100644 index 00000000000..cf98e88c9ae --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/memory.pyi @@ -0,0 +1,40 @@ +from pyarrow.lib import _Weakrefable + +class MemoryPool(_Weakrefable): + def release_unused(self) -> None: ... + def bytes_allocated(self) -> int: ... + def max_memory(self) -> int | None: ... + @property + def backend_name(self) -> str: ... + +class LoggingMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): ... + +def default_memory_pool() -> MemoryPool: ... +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... +def system_memory_pool() -> MemoryPool: ... +def jemalloc_memory_pool() -> MemoryPool: ... +def mimalloc_memory_pool() -> MemoryPool: ... +def set_memory_pool(pool: MemoryPool) -> None: ... +def log_memory_allocations(enable: bool = True) -> None: ... +def total_allocated_bytes() -> int: ... +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... +def supported_memory_backends() -> list[str]: ... + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi new file mode 100644 index 00000000000..1a57d7ca238 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi @@ -0,0 +1,51 @@ +# mypy: disable-error-code="name-defined" +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +import pandas as pd + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> pd.Series: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[pd.Categorical]: ... + @property + def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... 
+ def is_array_like( + self, obj: Any + ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi new file mode 100644 index 00000000000..12fafd623bf --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -0,0 +1,454 @@ +# mypy: disable-error-code="overload-overlap" +import collections.abc +import datetime as dt + +from decimal import Decimal +from typing import Any, Generic, Iterator, Literal, Mapping, Self, TypeAlias, overload + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from typing_extensions import TypeVar + +from . import types +from .types import _AsPyType, _DataTypeT, _NewDataTypeT, _Time32Unit, _Time64Unit, _Tz, _Unit + +_IsValid = TypeVar("_IsValid", default=Literal[True]) +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") + +class Scalar(_Weakrefable, Generic[_DataTypeT, _IsValid]): + @property + def type(self) -> _DataTypeT: ... + @property + def is_valid(self) -> _IsValid: ... + @overload + def cast( + self, + target_type: None, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + @overload + def cast( + self, + target_type: _NewDataTypeT, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Scalar[_NewDataTypeT, _IsValid]: ... + def validate(self, *, full: bool = False) -> None: ... + def equals(self, other: Scalar) -> bool: ... + def __hash__(self) -> int: ... + @overload + def as_py(self: Scalar[types._BasicDataType[_AsPyType], Literal[True]]) -> _AsPyType: ... + @overload + def as_py( + self: Scalar[types.ListType[types._BasicDataType[_AsPyType]], Literal[True]], + ) -> list[_AsPyType]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[ + types.DictionaryType[ + types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV], Any + ] + ], + Literal[True], + ], + ) -> list[dict[_AsPyTypeK, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], + Literal[True], + ], + ) -> list[dict[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[types.DictionaryType[types._BasicDataType[_AsPyTypeK], Any, Any]], + Literal[True], + ], + ) -> list[dict[_AsPyTypeK, Any]]: ... + @overload + def as_py( + self: Scalar[types.StructType, Literal[True]], + ) -> list[dict[str, Any]]: ... + @overload + def as_py( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]], + Literal[True], + ], + ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... 
+ @overload + def as_py( + self: Scalar[ + types.MapType[Any, types._BasicDataType[_AsPyTypeV]], + Literal[True], + ], + ) -> list[tuple[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], Any], + Literal[True], + ], + ) -> list[tuple[_AsPyTypeK, Any]]: ... + @overload + def as_py(self: Scalar[Any, Literal[True]]) -> Any: ... + @overload + def as_py(self: Scalar[Any, Literal[False]]) -> None: ... + +_NULL: TypeAlias = None +NA = _NULL + +class NullScalar(Scalar[types.NullType, _IsValid]): ... +class BooleanScalar(Scalar[types.BoolType, _IsValid]): ... +class UInt8Scalar(Scalar[types.Uint8Type, _IsValid]): ... +class Int8Scalar(Scalar[types.Int8Type, _IsValid]): ... +class UInt16Scalar(Scalar[types.Uint16Type, _IsValid]): ... +class Int16Scalar(Scalar[types.Int16Type, _IsValid]): ... +class UInt32Scalar(Scalar[types.Uint32Type, _IsValid]): ... +class Int32Scalar(Scalar[types.Int32Type, _IsValid]): ... +class UInt64Scalar(Scalar[types.Uint64Type, _IsValid]): ... +class Int64Scalar(Scalar[types.Int64Type, _IsValid]): ... +class HalfFloatScalar(Scalar[types.Float16Type, _IsValid]): ... +class FloatScalar(Scalar[types.Float32Type, _IsValid]): ... +class DoubleScalar(Scalar[types.Float64Type, _IsValid]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type, _IsValid]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type, _IsValid]): ... +class Date32Scalar(Scalar[types.Date32Type, _IsValid]): ... + +class Date64Scalar(Scalar[types.Date64Type, _IsValid]): + @property + def value(self) -> dt.date | None: ... + +class Time32Scalar(Scalar[types.Time32Type[_Time32Unit], _IsValid]): + @property + def value(self) -> dt.time | None: ... + +class Time64Scalar(Scalar[types.Time64Type[_Time64Unit], _IsValid]): + @property + def value(self) -> dt.time | None: ... + +class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz], _IsValid]): + @property + def value(self) -> int | None: ... + +class DurationScalar(Scalar[types.DurationType[_Unit], _IsValid]): + @property + def value(self) -> dt.timedelta | None: ... + +class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType, _IsValid]): + @property + def value(self) -> MonthDayNano | None: ... + +class BinaryScalar(Scalar[types.BinaryType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class LargeBinaryScalar(Scalar[types.LargeBinaryType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class StringScalar(Scalar[types.StringType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class LargeStringScalar(Scalar[types.LargeStringType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class BinaryViewScalar(Scalar[types.BinaryViewType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class StringViewScalar(Scalar[types.StringViewType, _IsValid]): + def as_buffer(self) -> Buffer: ... + +class ListScalar(Scalar[types.ListType[_DataTypeT], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __iter__(self) -> Iterator[Array]: ... + +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __iter__(self) -> Iterator[Array]: ... 
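A brief sketch of how these Scalar subclasses behave at runtime (not part of the patch; plain upstream pyarrow calls):

import pyarrow as pa

s = pa.scalar("hello")                 # inferred as a StringScalar
assert s.is_valid and s.as_py() == "hello"
assert isinstance(s.as_buffer(), pa.Buffer)

lst = pa.scalar([1, 2, None])          # ListScalar over a list<int64> value
assert lst[0].as_py() == 1             # __getitem__ yields element scalars
assert lst.as_py() == [1, 2, None]

f = pa.scalar(1).cast(pa.float64())    # cast() re-parameterizes the scalar's type
assert f.as_py() == 1.0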
+ +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __iter__(self) -> Iterator[Array]: ... + +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __iter__(self) -> Iterator[Array]: ... + +class StructScalar(Scalar[types.StructType, _IsValid], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[str]: ... + def __getitem__(self, __key: str) -> Scalar[Any, _IsValid]: ... # type: ignore[override] + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + +class MapScalar(Scalar[types.MapType[types._K, types._ValueT], _IsValid]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__( + self, i: int + ) -> tuple[Scalar[types._K, _IsValid], types._ValueT, Any, _IsValid]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]], + _IsValid, + ], + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[Any, types._BasicDataType[_AsPyTypeV]], + _IsValid, + ], + ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], Any], + _IsValid, + ], + ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... + +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._ValueT], _IsValid]): + @property + def index(self) -> Scalar[types._IndexT, _IsValid]: ... + @property + def value(self) -> Scalar[types._ValueT, _IsValid]: ... + @property + def dictionary(self) -> Array: ... + +class RunEndEncodedScalar( + Scalar[types.RunEndEncodedType[types._RunEndType, types._ValueT], _IsValid] +): + @property + def value(self) -> tuple[int, int] | None: ... + +class UnionScalar(Scalar[types.UnionType, _IsValid]): + @property + def value(self) -> Any | None: ... + @property + def type_code(self) -> str: ... + +class ExtensionScalar(Scalar[types.ExtensionType, _IsValid]): + @property + def value(self) -> Any | None: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... + +class FixedShapeTensorScalar(ExtensionScalar[_IsValid]): + def to_numpy(self) -> np.ndarray: ... + def to_tensor(self) -> Tensor: ... + +_V = TypeVar("_V") + +CollectionValue: TypeAlias = list[_V | None] | tuple[_V | None, ...] | set[_V | None] + +@overload +def scalar( + value: str, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> StringScalar: ... +@overload +def scalar( + value: bytes, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> BinaryScalar: ... +@overload +def scalar( + value: bool, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> BooleanScalar: ... 
+@overload +def scalar( + value: int, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> Int64Scalar: ... +@overload +def scalar( + value: float, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> DoubleScalar: ... +@overload +def scalar( + value: Decimal, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> Decimal128Scalar: ... +@overload +def scalar( + value: dt.date, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> Date32Scalar: ... +@overload +def scalar( + value: dt.time, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> Time64Scalar: ... +@overload +def scalar( + value: dt.timedelta, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> DurationScalar: ... +@overload +def scalar( # type: ignore[overload-overlap] + value: MonthDayNano, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Mapping[str, Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: CollectionValue[str], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.StringType]]: ... +@overload +def scalar( + value: CollectionValue[bytes], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BinaryType]]: ... +@overload +def scalar( # type: ignore[overload-overlap] + value: CollectionValue[bool], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BoolType]]: ... +@overload +def scalar( + value: CollectionValue[int], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Int64Type]]: ... +@overload +def scalar( + value: CollectionValue[float], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Float64Type]]: ... +@overload +def scalar( + value: CollectionValue[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal128Type]]: ... +@overload +def scalar( + value: CollectionValue[dt.date], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Date32Type]]: ... +@overload +def scalar( + value: CollectionValue[dt.time], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Time32Type]]: ... +@overload +def scalar( + value: CollectionValue[dt.timedelta], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.DurationType]]: ... +@overload +def scalar( + value: CollectionValue[MonthDayNano], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... +@overload +def scalar( + value: CollectionValue[_V], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[Any]: ... +@overload +def scalar( + value: _V, + type: _DataTypeT, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT, _V]: ... 
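The overloads above are meant to let a type checker infer the concrete scalar class from a plain Python value; a hedged illustration (not part of the patch):

import datetime as dt

import pyarrow as pa

pa.scalar(True)             # matched by the bool overload  -> BooleanScalar
pa.scalar(1.5)              # float overload                -> DoubleScalar
pa.scalar(dt.date.today())  # date overload                 -> Date32Scalar
pa.scalar({"a": 1})         # Mapping overload              -> StructScalar

# An explicit type falls through to the final, generic overload:
pa.scalar(1, type=pa.int8())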
+ +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "scalar", +] diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi new file mode 100644 index 00000000000..10cc70f105f --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -0,0 +1,604 @@ +# mypy: disable-error-code="overload-overlap" + +import datetime as dt + +from decimal import Decimal +from typing import ( + Any, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Self, + TypeAlias, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from pyarrow._compute import CastOptions, FunctionOptions +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Field, MemoryPool, MonthDayNano, Schema + +from . import scalar +from .array import Array, NullableIterable, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from .ipc import RecordBatchReader +from .scalar import Int64Scalar, Scalar +from .tensor import Tensor +from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): + @property + def data(self) -> Self: ... + @property + def type(self) -> DataType: ... + def length(self) -> int: ... + __len__ = length + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + format = to_string + def validate(self, *, full: bool = False) -> None: ... + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... + def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> _ScalarT: ... + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: ... + def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: ... + def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: ... + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: ... + def equals(self, other: Self) -> bool: ... + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... 
+ @overload + def cast( + self, + target_type: None = None, + safe: bool | None = None, + options: CastOptions | None = None, + ) -> Self: ... + @overload + def cast( + self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None + ) -> ChunkedArray[Scalar[_CastAs]]: ... + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> ChunkedArray[_ScalarT]: ... + def unique(self) -> ChunkedArray[_ScalarT]: ... + def value_counts(self) -> StructArray: ... + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop"): ... + @overload + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType], Any]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + @overload + def index( + self, + value: Scalar[_DataTypeT], + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + def take(self, indices: Indices) -> Self: ... + def drop_null(self) -> Self: ... + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + @property + def num_chunks(self) -> int: ... + def chunk(self, i: int) -> ChunkedArray[_ScalarT]: ... + @property + def chunks(self) -> list[Array[_ScalarT]]: ... + def iterchunks(self) -> Generator[Array[_ScalarT], None, None]: ... + def __iter__(self) -> Iterator[Array[_ScalarT]]: ... + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType], Any]], + ) -> list[_AsPyType | None]: ... + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + +@overload # type: ignore[overload-overlap] +def chunked_array( + values: NullableIterable[bool], + type: None = None, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: NullableIterable[int], + type: None = None, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: NullableIterable[float], + type: None = None, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: NullableIterable[Decimal], + type: None = None, +) -> ChunkedArray[scalar.Decimal128Scalar]: ... +@overload +def chunked_array( + values: NullableIterable[dict[str, Any]], + type: None = None, +) -> ChunkedArray[scalar.StructScalar]: ... +@overload +def chunked_array( + values: NullableIterable[dt.date], + type: None = None, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: NullableIterable[dt.time], + type: None = None, +) -> ChunkedArray[scalar.Time64Scalar]: ... +@overload +def chunked_array( + values: NullableIterable[dt.timedelta], + type: None = None, +) -> ChunkedArray[scalar.DurationScalar]: ... +@overload +def chunked_array( + values: NullableIterable[MonthDayNano], + type: None = None, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: NullableIterable[str], + type: None = None, +) -> ChunkedArray[scalar.StringScalar]: ... 
+@overload +def chunked_array( + values: NullableIterable[bytearray], + type: None = None, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: NullableIterable[list], + type: None = None, +) -> ChunkedArray[scalar.ListScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: _DataTypeT, +) -> ChunkedArray[Scalar[_DataTypeT]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["null"], +) -> ChunkedArray[scalar.NullScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["bool", "boolean"], +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["i1", "int8"], +) -> ChunkedArray[scalar.Int8Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["i2", "int16"], +) -> ChunkedArray[scalar.Int16Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["i4", "int32"], +) -> ChunkedArray[scalar.Int32Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["i8", "int64"], +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["u1", "uint8"], +) -> ChunkedArray[scalar.UInt8Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["u2", "uint16"], +) -> ChunkedArray[scalar.UInt16Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["u4", "uint32"], +) -> ChunkedArray[scalar.UInt32Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["u8", "uint64"], +) -> ChunkedArray[scalar.UInt64Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["f2", "halffloat", "float16"], +) -> ChunkedArray[scalar.HalfFloatScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["f4", "float", "float32"], +) -> ChunkedArray[scalar.FloatScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["f8", "double", "float64"], +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["string", "str", "utf8"], +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["binary"], +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["large_string", "large_str", "large_utf8"], +) -> ChunkedArray[scalar.LargeStringScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["large_binary"], +) -> ChunkedArray[scalar.LargeBinaryScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["binary_view"], +) -> ChunkedArray[scalar.BinaryViewScalar]: ... 
+@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["string_view"], +) -> ChunkedArray[scalar.StringViewScalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["date32", "date32[day]"], +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["date64", "date64[ms]"], +) -> ChunkedArray[scalar.Date64Scalar]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["time32[s]"], +) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["time32[ms]"], +) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["time64[us]"], +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["time64[ns]"], +) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["timestamp[s]"], +) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["timestamp[ms]"], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["timestamp[us]"], +) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["timestamp[ns]"], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["duration[s]"], +) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["duration[ms]"], +) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["duration[us]"], +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["duration[ns]"], +) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable | SupportArrowStream | SupportArrowArray, + type: Literal["month_day_nano_interval"], +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... + +_ColumnT = TypeVar("_ColumnT", bound=Array | ChunkedArray) + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + @overload + def __getitem__(self, key: int | str) -> _ColumnT: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: ... 
+ @property + def column_names(self) -> list[str]: ... + @property + def columns(self) -> list[_ColumnT]: ... + def drop_null(self) -> Self: ... + def field(self, i: int | str) -> Field: ... + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, Array | list], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: ... + @classmethod + def from_pylist( + cls, + mapping: list[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: ... + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: ... + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: Order | list[tuple[str, Order]], **kwargs) -> Self: ... + def take(self, indices: Indices) -> Self: ... + def filter( + self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: ... + def to_pydict(self) -> dict[str, list]: ... + def to_pylist(self) -> list[dict[str, Any]]: ... + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: ... + def add_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + def append_column(self, field_: str | Field, column: Array | list) -> Self: ... + +class RecordBatch(_Tabular[Array]): + def validate(self, *, full: bool = False) -> None: ... + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: ... + def get_total_buffer_size(self) -> int: ... + def __sizeof__(self) -> int: ... + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + def rename_columns(self, names: dict[str, str]) -> Self: ... + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + def equals(self, other: Self, check_metadata: bool = False) -> bool: ... + def select(self, columns: list[str] | Indices) -> Self: ... + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: ... + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: ... + def to_struct_array(self) -> StructArray: ... + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: ... + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... + def __arrow_c_array__(self, requested_schema=None): ... + def __arrow_c_stream__(self, requested_schema=None): ... + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + @property + def device_type(self) -> DeviceAllocationType: ... + @property + def is_cpu(self) -> bool: ... + +def table_to_blocks(options, table: Table, categories, extension_columns): ... 
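A short runtime sketch of the ChunkedArray and RecordBatch stubs above (not part of the patch; standard upstream pyarrow behaviour):

import pyarrow as pa

batch = pa.record_batch({"x": [1, 2, 3], "y": ["a", "b", "c"]})
assert batch.num_rows == 3 and batch.column_names == ["x", "y"]

ca = pa.chunked_array([[1, 2], [3, None]])   # ChunkedArray with two int64 chunks
assert ca.num_chunks == 2 and ca.null_count == 1
assert ca[3].as_py() is None                 # int index -> scalar, slice -> ChunkedArray
combined = ca.combine_chunks()               # single-chunk ChunkedArray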
+
+
+JoinType: TypeAlias = Literal[
+    "left semi",
+    "right semi",
+    "left anti",
+    "right anti",
+    "inner",
+    "left outer",
+    "right outer",
+    "full outer",
+]
+
+class Table(_Tabular[ChunkedArray]):
+    def validate(self, *, full: bool = False) -> None: ...
+    def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+    def select(self, columns: list[str] | Indices) -> Self: ...
+    def replace_schema_metadata(self, metadata: dict | None = None) -> Self: ...
+    def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ...
+    def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ...
+    def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ...
+    def equals(self, other: Self, check_metadata: bool = False) -> bool: ...
+    def cast(
+        self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None
+    ) -> Self: ...
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        schema: Schema | None = None,
+        preserve_index: bool | None = None,
+        nthreads: int | None = None,
+        columns: list[str] | None = None,
+        safe: bool = True,
+    ) -> Self: ...
+    @classmethod
+    def from_arrays(
+        cls,
+        arrays: list[Array] | list[ChunkedArray],
+        names: list[str] | None = None,
+        schema: Schema | None = None,
+        metadata: Mapping | None = None,
+    ) -> Self: ...
+    @classmethod
+    def from_struct_array(
+        cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar]
+    ) -> Self: ...
+    def to_struct_array(
+        self, max_chunksize: int | None = None
+    ) -> ChunkedArray[scalar.StructScalar]: ...
+    @classmethod
+    def from_batches(
+        cls, batches: Iterable[RecordBatch], schema: Schema | None = None
+    ) -> Self: ...
+    def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ...
+    def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ...
+    def get_total_buffer_size(self) -> int: ...
+    def __sizeof__(self) -> int: ...
+    def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ...
+    def rename_columns(self, names: dict[str, str]) -> Self: ...
+    def drop(self, columns: str | list[str]) -> Self: ...
+    def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ...
+    def join(
+        self,
+        right_table: Self,
+        keys: str | list[str],
+        right_keys: str | list[str] | None = None,
+        join_type: JoinType = "left outer",
+        left_suffix: str | None = None,
+        right_suffix: str | None = None,
+        coalesce_keys: bool = True,
+        use_threads: bool = True,
+    ) -> Self: ...
+    def join_asof(
+        self,
+        right_table: Self,
+        on: str,
+        by: str | list[str],
+        tolerance: int,
+        right_on: str | list[str] | None = None,
+        right_by: str | list[str] | None = None,
+    ) -> Self: ...
+    def __arrow_c_stream__(self, requested_schema=None): ...
+
+def record_batch(
+    data: dict[str, list | Array]
+    | list[Array]
+    | pd.DataFrame
+    | SupportArrowArray
+    | SupportArrowDeviceArray,
+    names: list[str] | None = None,
+    schema: Schema | None = None,
+    metadata: Mapping | None = None,
+) -> RecordBatch: ...
+def table(
+    data: dict[str, list | Array]
+    | list[Array | ChunkedArray]
+    | pd.DataFrame
+    | SupportArrowArray
+    | SupportArrowStream
+    | SupportArrowDeviceArray,
+    names: list[str] | None = None,
+    schema: Schema | None = None,
+    metadata: Mapping | None = None,
+    nthreads: int | None = None,
+) -> Table: ...
+def concat_tables(
+    tables: list[Table],
+    memory_pool: MemoryPool | None = None,
+    promote_options: Literal["none", "default", "permissive"] = "none",
+    **kwargs,
+) -> Table: ...
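To ground the Table stubs above, a hedged example of join / group_by / concat_tables (not part of the patch; it uses the documented upstream pyarrow API):

import pyarrow as pa

left = pa.table({"id": [1, 2, 3], "v": [10, 20, 30]})
right = pa.table({"id": [1, 2], "name": ["a", "b"]})

joined = left.join(right, keys="id", join_type="left outer")
summed = left.group_by("id").aggregate([("v", "sum")])   # TableGroupBy -> Table ("v_sum")
both = pa.concat_tables([left, left], promote_options="none")
assert both.num_rows == 6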
+ +class TableGroupBy: + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, aggregations: list[tuple[str, str]] | list[tuple[str, str, FunctionOptions]] + ) -> Table: ... + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", +] diff --git a/pyarrow-stubs/__lib_pxi/tensor.pyi b/pyarrow-stubs/__lib_pxi/tensor.pyi new file mode 100644 index 00000000000..a23414ef9fd --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/tensor.pyi @@ -0,0 +1,177 @@ +# mypy: disable-error-code="import-untyped" + +from typing import Self + +import numpy as np + +from pyarrow.lib import _Weakrefable +from pydata.sparse import COO # type: ignore[import-not-found] +from scipy.sparse import coo_matrix, csr_matrix + +class Tensor(_Weakrefable): + @classmethod + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + def to_numpy(self) -> np.ndarray: ... + def equals(self, other: Tensor) -> bool: ... + def dim_name(self, i: int) -> str: ... + @property + def dim_names(self) -> list[str]: ... + @property + def is_mutable(self) -> bool: ... + @property + def is_contiguous(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + @property + def strides(self) -> tuple[int, ...]: ... + +class SparseCOOTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_numpy( + cls, + data: np.ndarray, + coords: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: ... + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ... + def to_scipy(self) -> coo_matrix: ... + def to_pydata_sparse(self) -> COO: ... + def to_tensor(self) -> Tensor: ... + def equals(self, other: Self) -> bool: ... + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def has_canonical_format(self) -> bool: ... + +class SparseCSRMatrix(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: ... + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + def to_scipy(self) -> csr_matrix: ... + def to_tensor(self) -> Tensor: ... + def equals(self, other: Self) -> bool: ... + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... 
+ @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSCMatrix(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: ... + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + def to_scipy(self) -> csr_matrix: ... + def to_tensor(self) -> Tensor: ... + def equals(self, other: Self) -> bool: ... + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSFTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: ... + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + def to_tensor(self) -> Tensor: ... + def equals(self, other: Self) -> bool: ... + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi new file mode 100644 index 00000000000..d24b148af86 --- /dev/null +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -0,0 +1,703 @@ +import datetime as dt + +from collections.abc import Mapping +from decimal import Decimal +from typing import Any, Generic, Iterable, Iterator, Literal, Self, TypeAlias, overload + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar + +from .scalar import ExtensionScalar + +CSchema: TypeAlias = Any + +class _Weakrefable: ... +class _Metadata(_Weakrefable): ... + +class DataType(_Weakrefable): + def field(self, i: int) -> Field: ... + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: ... + @property + def byte_width(self) -> int: ... + @property + def num_fields(self) -> int: ... + @property + def num_buffers(self) -> int: ... + def __hash__(self) -> int: ... + def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: ... 
+ def to_pandas_dtype(self) -> np.generic: ... + def _export_to_c(self, out_ptr: int) -> None: ... + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + def __arrow_c_schema__(self) -> Any: ... + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + +_AsPyType = TypeVar("_AsPyType") + +class _BasicDataType(DataType, Generic[_AsPyType]): ... +class NullType(_BasicDataType[None]): ... +class BoolType(_BasicDataType[bool]): ... +class Uint8Type(_BasicDataType[int]): ... +class Int8Type(_BasicDataType[int]): ... +class Uint16Type(_BasicDataType[int]): ... +class Int16Type(_BasicDataType[int]): ... +class Uint32Type(_BasicDataType[int]): ... +class Int32Type(_BasicDataType[int]): ... +class Uint64Type(_BasicDataType[int]): ... +class Int64Type(_BasicDataType[int]): ... +class Float16Type(_BasicDataType[float]): ... +class Float32Type(_BasicDataType[float]): ... +class Float64Type(_BasicDataType[float]): ... +class Date32Type(_BasicDataType[dt.date]): ... +class Date64Type(_BasicDataType[dt.date]): ... +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... +class StringType(_BasicDataType[str]): ... +class LargeStringType(_BasicDataType[str]): ... +class StringViewType(_BasicDataType[str]): ... +class BinaryType(_BasicDataType[bytes]): ... +class LargeBinaryType(_BasicDataType[bytes]): ... +class BinaryViewType(_BasicDataType[bytes]): ... + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"]) +_Tz = TypeVar("_Tz", str, None, default=None) + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + @property + def unit(self) -> _Unit: ... + @property + def tz(self) -> _Tz: ... + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + @property + def unit(self) -> _Time32Unit: ... + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + @property + def unit(self) -> _Time64Unit: ... + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + @property + def unit(self) -> _Unit: ... + +class FixedSizeBinaryType(_BasicDataType[Decimal]): ... + +_Precision = TypeVar("_Precision") +_Scale = TypeVar("_Scale") + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + @property + def scale(self) -> _Scale: ... + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + @property + def scale(self) -> _Scale: ... + +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + +class ListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + +class LargeListType(ListType[_DataTypeT]): ... +class ListViewType(ListType[_DataTypeT]): ... +class LargeListViewType(ListType[_DataTypeT]): ... + +class FixedSizeListType(ListType[_DataTypeT], Generic[_DataTypeT, _Size]): + @property + def list_size(self) -> _Size: ... + +class DictionaryMemo(_Weakrefable): ... + +_IndexT = TypeVar("_IndexT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=_BasicDataType) +_Ordered = TypeVar("_Ordered", bound=Literal[True, False], default=Literal[False]) + +class DictionaryType(DataType, Generic[_IndexT, _ValueT, _Ordered]): + @property + def ordered(self) -> _Ordered: ... + @property + def index_type(self) -> _IndexT: ... 
+ @property + def value_type(self) -> _ValueT: ... + +_K = TypeVar("_K", bound=_BasicDataType) + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + @property + def key_field(self) -> Field[_K, Literal[False]]: ... + @property + def key_type(self) -> _K: ... + @property + def item_field(self) -> Field[_ValueT]: ... + @property + def item_type(self) -> _ValueT: ... + @property + def keys_sorted(self) -> _Ordered: ... + +_Size = TypeVar("_Size") + +class StructType(DataType): + def get_field_index(self, name: str) -> int: ... + def field(self, i: int | str) -> Field: ... + def get_all_field_indices(self, name: str) -> list[int]: ... + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + __getitem__ = field + +class UnionType(DataType): + @property + def mode(self) -> Literal["sparse", "dense"]: ... + @property + def type_codes(self) -> list[int]: ... + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + def field(self, i: int) -> Field: ... + __getitem__ = field + +class SparseUnionType(UnionType): + @property + def mode(self) -> Literal["sparse"]: ... + +class DenseUnionType(UnionType): + @property + def mode(self) -> Literal["dense"]: ... + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + +class RunEndEncodedType(DataType, Generic[_RunEndType, _ValueT]): + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _ValueT: ... + +class BaseExtensionType(DataType): + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + @property + def extension_name(self) -> str: ... + @property + def storage_type(self) -> DataType: ... + @overload + def wrap_array(self, storage: Array) -> Array: ... + @overload + def wrap_array(self, storage: ChunkedArray) -> ChunkedArray: ... + +class ExtensionType(BaseExtensionType): + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + def __arrow_ext_serialize__(self) -> bytes: ... + @classmethod + def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: ... + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + @property + def value_type(self) -> _ValueT: ... + @property + def shape(self) -> list[int]: ... + @property + def dim_names(self) -> list[str] | None: ... + @property + def permutation(self) -> list[int] | None: ... + +class PyExtensionType(ExtensionType): + def __init__(self, storage_type: DataType) -> None: ... + @classmethod + def set_auto_load(cls, value: bool) -> None: ... + +class UnknownExtensionType(PyExtensionType): + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + +def register_extension_type(ext_type: PyExtensionType) -> None: ... +def unregister_extension_type(type_name: str) -> None: ... + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... + def equals(self, other: KeyValueMetadata) -> bool: ... + def __len__(self) -> int: ... + def __contains__(self, __key: object) -> bool: ... + def __getitem__(self, __key: Any) -> Any: ... + def __iter__(self) -> Iterator[bytes]: ... + def get_all(self, key: str) -> list[bytes]: ... + def to_dict(self) -> dict[bytes, bytes]: ... + +def ensure_metadata( + meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False +) -> KeyValueMetadata | None: ... 
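As a usage sketch of the parametrized type stubs above (illustrative only, assuming the usual top-level pyarrow re-exports of these factories), the Generic parameters surface directly on constructed type instances:

import pyarrow as pa

# DictionaryType carries its index/value types and ordering flag.
dict_ty = pa.dictionary(pa.int32(), pa.string(), ordered=False)
print(dict_ty.index_type, dict_ty.value_type, dict_ty.ordered)  # int32 string False

# MapType exposes the key/item types and keys_sorted flag typed above.
map_ty = pa.map_(pa.string(), pa.int64())
print(map_ty.key_type, map_ty.item_type, map_ty.keys_sorted)  # string int64 False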
+
+_NewDataTypeT = TypeVar("_NewDataTypeT", bound=DataType)
+_Nullable = TypeVar("_Nullable", bound=Literal[True, False], default=Literal[True])
+
+class Field(_Weakrefable, Generic[_DataTypeT, _Nullable]):
+    def equals(self, other: Field, check_metadata: bool = False) -> bool: ...
+    def __hash__(self) -> int: ...
+    @property
+    def nullable(self) -> _Nullable: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def metadata(self) -> dict[bytes, bytes] | None: ...
+    def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ...
+    def remove_metadata(self) -> Self: ...
+    def with_type(self, new_type: _NewDataTypeT) -> Field[_NewDataTypeT]: ...
+    def with_name(self, name: str) -> Self: ...
+    def with_nullable(self, nullable: _Nullable) -> Field[_DataTypeT, _Nullable]: ...
+    def flatten(self) -> list[Field]: ...
+    def _export_to_c(self, out_ptr: int) -> None: ...
+    @classmethod
+    def _import_from_c(cls, in_ptr: int) -> Self: ...
+    def __arrow_c_schema__(self) -> Any: ...
+    @classmethod
+    def _import_from_c_capsule(cls, schema) -> Self: ...
+
+class Schema(_Weakrefable):
+    def __len__(self) -> int: ...
+    def __getitem__(self, key: str) -> Field: ...
+    _field = __getitem__
+    def __iter__(self) -> Iterator[Field]: ...
+    def __hash__(self) -> int: ...
+    def __sizeof__(self) -> int: ...
+    @property
+    def pandas_metadata(self) -> dict: ...
+    @property
+    def names(self) -> list[str]: ...
+    @property
+    def types(self) -> list[DataType]: ...
+    @property
+    def metadata(self) -> dict[bytes, bytes]: ...
+    def empty_table(self) -> Table: ...
+    def equals(self, other: Schema, check_metadata: bool = False) -> bool: ...
+    @classmethod
+    def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: ...
+    def field(self, i: str | bytes) -> Field: ...
+    def field_by_name(self, name: str) -> Field: ...
+    def get_field_index(self, name: str) -> int: ...
+    def get_all_field_indices(self, name: str) -> list[int]: ...
+    def append(self, field: Field) -> Schema: ...
+    def insert(self, i: int, field: Field) -> Schema: ...
+    def remove(self, i: int) -> Schema: ...
+    def set(self, i: int, field: Field) -> Schema: ...
+    def add_metadata(self, metadata: dict) -> Schema: ...
+    def with_metadata(self, metadata: dict) -> Schema: ...
+    def serialize(self, memory_pool: MemoryPool | None = None): ...
+    def remove_metadata(self) -> Schema: ...
+    def to_string(
+        self,
+        truncate_metadata: bool = True,
+        show_field_metadata: bool = True,
+        show_schema_metadata: bool = True,
+    ) -> str: ...
+    def _export_to_c(self, out_ptr: int) -> None: ...
+    @classmethod
+    def _import_from_c(cls, in_ptr: int) -> Schema: ...
+    def __arrow_c_schema__(self) -> Any: ...
+    @staticmethod
+    def _import_from_c_capsule(schema: Any) -> Schema: ...
+
+def unify_schemas(
+    schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default"
+) -> Schema: ...
+@overload
+def field(name: SupportArrowSchema) -> Field: ...
+@overload
+def field(
+    name: str,
+    type: _DataTypeT,
+) -> Field[_DataTypeT, Literal[True]]: ...
+@overload
+def field(
+    name: str, type: _DataTypeT, nullable: _Nullable, metadata: dict | None = None
+) -> Field[_DataTypeT, _Nullable]: ...
+def null() -> NullType: ...
+def bool_() -> BoolType: ...
+def uint8() -> Uint8Type: ...
+def int8() -> Int8Type: ...
+def uint16() -> Uint16Type: ...
+def int16() -> Int16Type: ...
+def uint32() -> Uint32Type: ...
+def int32() -> Int32Type: ...
+def int64() -> Int64Type: ...
+def uint64() -> Uint64Type: ...
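A small sketch of the Field/Schema surface typed above (illustrative only; pyarrow encodes metadata keys and values to bytes):

import pyarrow as pa

# nullable=False narrows the Field's _Nullable parameter to Literal[False].
id_field = pa.field("id", pa.int64(), nullable=False).with_metadata({"origin": "db"})
schema = pa.schema([id_field, pa.field("name", pa.string())], metadata={b"rev": b"1"})
print(schema.field("id").nullable)  # False
print(schema.names, schema.metadata)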
+def tzinfo_to_string(tz: dt.tzinfo) -> str: ... +def string_to_tzinfo(name: str) -> dt.tzinfo: ... +@overload +def timestamp(unit: _Unit) -> TimestampType[_Unit, None]: ... +@overload +def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: ... +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: ... +def duration(unit: _Unit) -> DurationType[_Unit]: ... +def month_day_nano_interval() -> MonthDayNanoIntervalType: ... +def date32() -> Date32Type: ... +def date64() -> Date64Type: ... +def float16() -> Float16Type: ... +def float32() -> Float32Type: ... +def float64() -> Float64Type: ... +@overload +def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... +@overload +def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... +@overload +def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... +@overload +def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... +def string() -> StringType: ... + +utf8 = string + +@overload +def binary() -> BinaryType: ... +@overload +def binary(length: Literal[-1]) -> BinaryType: ... # type: ignore[overload-overlap] +@overload +def binary(length: int) -> FixedSizeBinaryType: ... +def large_binary() -> LargeBinaryType: ... +def large_string() -> LargeStringType: ... + +large_utf8 = large_string + +def binary_view() -> BinaryViewType: ... +def string_view() -> StringViewType: ... +@overload +def list_(value_type: Field[_DataTypeT]) -> ListType[_DataTypeT]: ... +@overload +def list_(value_type: _DataTypeT) -> ListType[_DataTypeT]: ... +@overload +def list_(value_type: _DataTypeT, list_size: Literal[-1]) -> ListType[_DataTypeT]: ... # type: ignore[overload-overlap] +@overload +def list_(value_type: _DataTypeT, list_size: _Size) -> FixedSizeListType[_DataTypeT, _Size]: ... +@overload +def large_list(value_type: Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... +@overload +def large_list(value_type: _DataTypeT) -> LargeListType[_DataTypeT]: ... +@overload +def list_view(value_type: Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ... +@overload +def list_view(value_type: _DataTypeT) -> ListViewType[_DataTypeT]: ... +@overload +def large_list_view(value_type: Field[_DataTypeT]) -> LargeListViewType[_DataTypeT]: ... +@overload +def large_list_view(value_type: _DataTypeT) -> LargeListViewType[_DataTypeT]: ... +@overload +def map_(key_type: _K, item_type: _IndexT) -> MapType[_K, _IndexT, Literal[False]]: ... +@overload +def map_( + key_type: _K, item_type: _IndexT, key_sorted: _Ordered +) -> MapType[_K, _IndexT, _Ordered]: ... +def dictionary( + index_type: _IndexT, value_type: _ValueT, ordered: _Ordered +) -> DictionaryType[_IndexT, _ValueT, _Ordered]: ... +def struct( + fields: Iterable[Field | tuple[str, Field]] | Mapping[str, Field], +) -> StructType: ... +def sparse_union( + child_fields: list[Field], type_codes: list[int] | None = None +) -> SparseUnionType: ... +def dense_union( + child_fields: list[Field], type_codes: list[int] | None = None +) -> DenseUnionType: ... +@overload +def union( + child_fields: list[Field], mode: Literal["sparse"], type_codes: list[int] | None = None +) -> SparseUnionType: ... +@overload +def union( + child_fields: list[Field], mode: Literal["dense"], type_codes: list[int] | None = None +) -> DenseUnionType: ... 
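The overloads above are what let a checker distinguish the fixed-size factories from the variable-size ones; a brief illustrative sketch:

import pyarrow as pa

ts = pa.timestamp("ms", tz="UTC")  # TimestampType[Literal["ms"], str]
vbin = pa.binary()                 # BinaryType
fbin = pa.binary(16)               # FixedSizeBinaryType
vlist = pa.list_(pa.int32())       # ListType[Int32Type]
flist = pa.list_(pa.int32(), 4)    # FixedSizeListType
print(ts, vbin, fbin, vlist, flist)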
+def run_end_encoded( + run_end_type: _RunEndType, value_type: _ValueT +) -> RunEndEncodedType[_RunEndType, _ValueT]: ... +def fixed_shape_tensor( + value_type: _ValueT, + shape: tuple[list[int], ...], + dim_names: tuple[list[str], ...] | None = None, + permutation: tuple[list[int], ...] | None = None, +) -> FixedShapeTensorType[_ValueT]: ... +@overload +def type_for_alias(name: Literal["null"]) -> NullType: ... +@overload +def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def type_for_alias(name: Literal["u1", "uint8"]) -> Uint8Type: ... +@overload +def type_for_alias(name: Literal["u2", "uint16"]) -> Uint16Type: ... +@overload +def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... +@overload +def type_for_alias(name: Literal["u8", "uint64"]) -> Uint64Type: ... +@overload +def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def type_for_alias(name: Literal["binary"]) -> BinaryType: ... +@overload +def type_for_alias( + name: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... +@overload +def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +@overload +def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... 
+@overload +def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... +@overload +def ensure_type(ty: Literal["null"]) -> NullType: ... +@overload +def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def ensure_type(ty: Literal["u1", "uint8"]) -> Uint8Type: ... +@overload +def ensure_type(ty: Literal["u2", "uint16"]) -> Uint16Type: ... +@overload +def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... +@overload +def ensure_type(ty: Literal["u8", "uint64"]) -> Uint64Type: ... +@overload +def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def ensure_type(ty: Literal["binary"]) -> BinaryType: ... +@overload +def ensure_type( + ty: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... +@overload +def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def schema( + fields: Iterable[Field | tuple[str, Field]] | Mapping[str, Field], + metadata: dict[bytes, bytes] | None = None, +) -> Schema: ... +def from_numpy_dtype(dtype: np.dtype) -> DataType: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... 
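The Literal-keyed overloads above mirror the runtime alias lookup; an illustrative sketch:

import numpy as np
import pyarrow as pa

print(pa.type_for_alias("i4"))                   # int32
print(pa.type_for_alias("timestamp[ms]"))        # timestamp[ms]
print(pa.from_numpy_dtype(np.dtype("float64")))  # double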
+ +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "Uint8Type", + "Int8Type", + "Uint16Type", + "Int16Type", + "Uint32Type", + "Int32Type", + "Uint64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "ensure_metadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "tzinfo_to_string", + "string_to_tzinfo", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "fixed_shape_tensor", + "type_for_alias", + "ensure_type", + "schema", + "from_numpy_dtype", + "is_boolean_value", + "is_integer_value", + "is_float_value", +] diff --git a/pyarrow-stubs/_azurefs.pyi b/pyarrow-stubs/_azurefs.pyi new file mode 100644 index 00000000000..acce68a29d1 --- /dev/null +++ b/pyarrow-stubs/_azurefs.pyi @@ -0,0 +1,14 @@ +from typing import Literal + +from ._fs import FileSystem + +class AzureFileSystem(FileSystem): + def __init__( + self, + account_name: str, + account_key: str | None = None, + blob_storage_authority: str | None = None, + dfs_storage_authority: str | None = None, + blob_storage_schema: Literal["http", "https"] = "https", + dfs_storage_schema: Literal["http", "https"] = "https", + ) -> None: ... diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index 0c957bea400..625cd6f05c4 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -1,669 +1,420 @@ -from typing import Any -from typing import Callable -from typing import ClassVar - -import pyarrow.lib - -from typing_extensions import Literal - -namedtuple: Callable - -class ArraySortOptions(_ArraySortOptions): - def __init__( +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + +class Kernel(lib._Weakrefable): ... + +class Function(lib._Weakrefable): + @property + def arity(self) -> int: ... + @property + def kind( self, - order: Literal["ascending", "descending"] = ..., - *, - null_placement: Literal["at_start", "at_end"] = ..., - ) -> None: ... - -class ArrowInvalid(ValueError, pyarrow.lib.ArrowException): ... 
+ ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... + @property + def name(self) -> str: ... + @property + def num_kernels(self) -> int: ... + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: ... -class AssumeTimezoneOptions(_AssumeTimezoneOptions): +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: ... + def list_functions(self) -> list[str]: ... + +class HashAggregateFunction(Function): ... +class HashAggregateKernel(Kernel): ... +class ScalarAggregateFunction(Function): ... +class ScalarAggregateKernel(Kernel): ... +class ScalarFunction(Function): ... +class ScalarKernel(Kernel): ... +class VectorFunction(Function): ... +class VectorKernel(Kernel): ... + +# ==================== _compute.pyx Option classes ==================== +class ArraySortOptions(FunctionOptions): def __init__( self, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = ..., - nonexistent: Literal["raise", "earliest", "latest"] = ..., + order: _Order = "ascending", + null_placement: _Placement = "at_end", ) -> None: ... -class CastOptions(_CastOptions): +class AssumeTimezoneOptions(FunctionOptions): def __init__( self, - target_type: pyarrow.lib.DataType | None = ..., + timezone: str, *, - allow_int_overflow: bool = ..., - allow_time_truncate: bool = ..., - allow_time_overflow: bool = ..., - allow_decimal_truncate: bool = ..., - allow_float_truncate: bool = ..., - allow_invalid_utf8: bool = ..., + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", ) -> None: ... - @staticmethod - def safe(target_type: pyarrow.lib.DataType | None = ...) -> CastOptions: ... - @staticmethod - def unsafe(target_type: pyarrow.lib.DataType | None = ...) -> CastOptions: ... - -class CountOptions(_CountOptions): - def __init__(self, mode: Literal["only_valid", "only_null", "all"] = ...) -> None: ... -class CumulativeSumOptions(_CumulativeSumOptions): - def __init__(self, start: float, *, skip_nulls: bool = ...) -> None: ... +class CastOptions(FunctionOptions): + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool -class DayOfWeekOptions(_DayOfWeekOptions): def __init__( self, + target_type: lib.DataType | None = None, *, - count_from_zero: bool = ..., - week_start: Literal[1, 2, 3, 4, 5, 6, 7] = ..., + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, ) -> None: ... - -class DictionaryEncodeOptions(_DictionaryEncodeOptions): - def __init__(self, null_encoding: Literal["mask", "encode"] = ...) -> None: ... - -class ElementWiseAggregateOptions(_ElementWiseAggregateOptions): - def __init__(self, *, skip_nulls: bool = ...) -> None: ... - -class Expression(pyarrow.lib._Weakrefable): - def __init__(self) -> None: ... - def _call(self, unicodefunction_name, listarguments, FunctionOptionsoptions=...) -> Any: ... 
@staticmethod - def _deserialize(buffer: pyarrow.lib.Buffer) -> Expression: ... + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... @staticmethod - def _field(name_or_idx: str | int) -> Expression: ... - @staticmethod - def _nested_field(self, names: list[str]) -> Expression: ... - def _scalar(self, value: pyarrow.lib.Scalar) -> Any: ... - def cast(self, type=..., safe=..., options=...) -> Any: ... - def equals(self, Expressionother) -> Any: ... - def is_null(self, boolnan_is_null=...) -> Any: ... - def is_valid(self) -> Any: ... - def isin(self, values) -> Any: ... - def __add__(self, other) -> Any: ... - def __and__(self, other) -> Any: ... - def __bool__(self) -> bool: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __invert__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __mul__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __or__(self, other) -> Any: ... - def __radd__(self, other) -> Any: ... - def __rand__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __rmul__(self, other) -> Any: ... - def __ror__(self, other) -> Any: ... - def __rsub__(self, other) -> Any: ... - def __rtruediv__(self, other) -> Any: ... - def __sub__(self, other) -> Any: ... - def __truediv__(self, other) -> Any: ... - -class ExtractRegexOptions(_ExtractRegexOptions): - def __init__(self, pattern) -> None: ... - -class FilterOptions(_FilterOptions): - def __init__(self, null_selection_behavior=...) -> None: ... - -class Function(pyarrow.lib._Weakrefable): - _kind_map: ClassVar[dict] = ... - _doc: Any - arity: Any - kind: Any - name: Any - num_kernels: Any - def __init__(self, *args, **kwargs) -> None: ... - def call( - self, args, FunctionOptionsoptions=..., MemoryPoolmemory_pool=..., length=... - ) -> Any: ... - def __reduce__(self) -> Any: ... - -class FunctionDoc(tuple): - _asdict: ClassVar[function] = ... - _field_defaults: ClassVar[dict] = ... - _fields: ClassVar[tuple] = ... - _replace: ClassVar[function] = ... - __getnewargs__: ClassVar[function] = ... - __match_args__: ClassVar[tuple] = ... - __slots__: ClassVar[tuple] = ... - arg_names: Any - description: Any - options_class: Any - options_required: Any - summary: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _make(cls, *args, **kwargs) -> Any: ... - -class FunctionOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - __slots__: ClassVar[tuple] = ... - def __init__(self, *args, **kwargs) -> None: ... - def deserialize(self, buf) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FunctionRegistry(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def get_function(self, name) -> Any: ... - def list_functions(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class HashAggregateFunction(Function): - kernels: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... 
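An illustrative sketch of the safe/unsafe constructors typed above, assuming the usual pyarrow.compute wrappers:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1.5, 2.7, None])
unsafe = pc.CastOptions.unsafe(pa.int64())  # permits lossy float -> int casts
print(pc.cast(arr, options=unsafe))         # [1, 2, null]
# pc.CastOptions.safe(pa.int64()) would raise on the truncating cast instead.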
-class HashAggregateKernel(Kernel): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class CountOptions(FunctionOptions): + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... -class IndexOptions(_IndexOptions): - def __init__(self, value) -> None: ... +class CumulativeOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... -class JoinOptions(_JoinOptions): - def __init__(self, null_handling=..., null_replacement=...) -> None: ... +class CumulativeSumOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... -class Kernel(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class DayOfWeekOptions(FunctionOptions): + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... -class MakeStructOptions(_MakeStructOptions): - def __init__(self, *args, **kwargs) -> None: ... +class DictionaryEncodeOptions(FunctionOptions): + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... -class MapLookupOptions(_MapLookupOptions): - def __init__(self, query_key, occurrence) -> None: ... +class RunEndEncodeOptions(FunctionOptions): + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... -class MatchSubstringOptions(_MatchSubstringOptions): - def __init__(self, *args, **kwargs) -> None: ... +class ElementWiseAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True) -> None: ... -class MetaFunction(Function): - kernels: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... +class ExtractRegexOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... -class ModeOptions(_ModeOptions): - def __init__(self, *args, **kwargs) -> None: ... +class FilterOptions(FunctionOptions): + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... -class NullOptions(_NullOptions): - def __init__(self, *args, **kwargs) -> None: ... +class IndexOptions(FunctionOptions): + def __init__(self, value: lib.Scalar) -> None: ... -class PadOptions(_PadOptions): - def __init__(self, width, padding=...) -> None: ... +class JoinOptions(FunctionOptions): + @overload + def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload + def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... -class PartitionNthOptions(_PartitionNthOptions): - def __init__(self, *args, **kwargs) -> None: ... +class ListSliceOptions(FunctionOptions): + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... -class QuantileOptions(_QuantileOptions): - def __init__(self, *args, **kwargs) -> None: ... +class ListFlattenOptions(FunctionOptions): + def __init__(self, recursive: bool = False) -> None: ... -class RandomOptions(_RandomOptions): - def __init__(self, *args, **kwargs) -> None: ... +class MakeStructOptions(FunctionOptions): + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... 
-class RankOptions(_RankOptions): - def __init__(self, *args, **kwargs) -> None: ... +class MapLookupOptions(FunctionOptions): + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... -class ReplaceSliceOptions(_ReplaceSliceOptions): - def __init__(self, start, stop, replacement) -> None: ... +class MatchSubstringOptions(FunctionOptions): + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... -class ReplaceSubstringOptions(_ReplaceSubstringOptions): - def __init__(self, *args, **kwargs) -> None: ... +class ModeOptions(FunctionOptions): + def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... -class RoundOptions(_RoundOptions): - def __init__(self, ndigits=..., round_mode=...) -> None: ... +class NullOptions(FunctionOptions): + def __init__(self, *, nan_is_null: bool = False) -> None: ... -class RoundTemporalOptions(_RoundTemporalOptions): - def __init__(self, *args, **kwargs) -> None: ... +class PadOptions(FunctionOptions): + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... -class RoundToMultipleOptions(_RoundToMultipleOptions): - def __init__(self, multiple=..., round_mode=...) -> None: ... +class PairwiseOptions(FunctionOptions): + def __init__(self, period: int = 1) -> None: ... -class ScalarAggregateFunction(Function): - kernels: Any - def __init__(self, *args, **kwargs) -> None: ... +class PartitionNthOptions(FunctionOptions): + def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... -class ScalarAggregateKernel(Kernel): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class QuantileOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... -class ScalarAggregateOptions(_ScalarAggregateOptions): - def __init__(self, *args, **kwargs) -> None: ... +class RandomOptions(FunctionOptions): + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... -class ScalarFunction(Function): - kernels: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... +class RankOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... -class ScalarKernel(Kernel): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class ReplaceSliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int, replacement: str) -> None: ... -class ScalarUdfContext(pyarrow.lib._Weakrefable): - batch_length: Any - memory_pool: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class ReplaceSubstringOptions(FunctionOptions): + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... -class SelectKOptions(_SelectKOptions): - def __init__(self, k, sort_keys) -> None: ... 
+_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + +class RoundBinaryOptions(FunctionOptions): + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... -class SetLookupOptions(_SetLookupOptions): - def __init__(self, *args, **kwargs) -> None: ... +class RoundOptions(FunctionOptions): + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... -class SliceOptions(_SliceOptions): - def __init__(self, start, stop=..., step=...) -> None: ... +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + +class RoundTemporalOptions(FunctionOptions): + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... -class SortOptions(_SortOptions): - def __init__(self, *args, **kwargs) -> None: ... +class RoundToMultipleOptions(FunctionOptions): + def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... -class SplitOptions(_SplitOptions): - def __init__(self, *args, **kwargs) -> None: ... +class ScalarAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... -class SplitPatternOptions(_SplitPatternOptions): - def __init__(self, *args, **kwargs) -> None: ... +class SelectKOptions(FunctionOptions): + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... -class StrftimeOptions(_StrftimeOptions): - def __init__(self, format=..., locale=...) -> None: ... +class SetLookupOptions(FunctionOptions): + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... -class StrptimeOptions(_StrptimeOptions): - def __init__(self, format, unit, error_is_null=...) -> None: ... +class SliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... -class StructFieldOptions(_StructFieldOptions): - def __init__(self, indices) -> None: ... +class SortOptions(FunctionOptions): + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... -class TDigestOptions(_TDigestOptions): - def __init__(self, *args, **kwargs) -> None: ... +class SplitOptions(FunctionOptions): + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... -class TakeOptions(_TakeOptions): - def __init__(self, *args, **kwargs) -> None: ... +class SplitPatternOptions(FunctionOptions): + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... -class TrimOptions(_TrimOptions): - def __init__(self, characters) -> None: ... +class StrftimeOptions(FunctionOptions): + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... -class Utf8NormalizeOptions(_Utf8NormalizeOptions): - def __init__(self, form) -> None: ... +class StrptimeOptions(FunctionOptions): + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... -class VarianceOptions(_VarianceOptions): - def __init__(self, *args, **kwargs) -> None: ... 
+class StructFieldOptions(FunctionOptions): + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... -class VectorFunction(Function): - kernels: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... +class TakeOptions(FunctionOptions): + def __init__(self, boundscheck: bool = True) -> None: ... -class VectorKernel(Kernel): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class TDigestOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... -class WeekOptions(_WeekOptions): - def __init__(self, *args, **kwargs) -> None: ... +class TrimOptions(FunctionOptions): + def __init__(self, characters: str) -> None: ... -class _ArraySortOptions(FunctionOptions): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, order, null_placement) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _AssumeTimezoneOptions(FunctionOptions): - _ambiguous_map: ClassVar[dict] = ... - _nonexistent_map: ClassVar[dict] = ... - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, timezone, ambiguous, nonexistent) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _CastOptions(FunctionOptions): - allow_decimal_truncate: Any - allow_float_truncate: Any - allow_int_overflow: Any - allow_invalid_utf8: Any - allow_time_overflow: Any - allow_time_truncate: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options( - self, - DataTypetarget_type, - allow_int_overflow, - allow_time_truncate, - allow_time_overflow, - allow_decimal_truncate, - allow_float_truncate, - allow_invalid_utf8, - ) -> Any: ... - def _set_safe(self) -> Any: ... - def _set_type(self, target_type=...) -> Any: ... - def _set_unsafe(self) -> Any: ... - def is_safe(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _CountOptions(FunctionOptions): - _mode_map: ClassVar[dict] = ... - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, mode) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class Utf8NormalizeOptions(FunctionOptions): + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... -class _CumulativeSumOptions(FunctionOptions): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, start, skip_nulls) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class VarianceOptions(FunctionOptions): + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... -class _DayOfWeekOptions(FunctionOptions): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, count_from_zero, week_start) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _DictionaryEncodeOptions(FunctionOptions): - _null_encoding_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, null_encoding) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
- -class _ElementWiseAggregateOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, skip_nulls) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ExtractRegexOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, pattern) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _FilterOptions(FunctionOptions): - _null_selection_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, null_selection_behavior) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _IndexOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, scalar) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _JoinOptions(FunctionOptions): - _null_handling_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, null_handling, null_replacement) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _MakeStructOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, field_names, field_nullability, field_metadata) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _MapLookupOptions(FunctionOptions): - _occurrence_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, query_key, occurrence) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _MatchSubstringOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, pattern, ignore_case) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ModeOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, n, skip_nulls, min_count) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _NullOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, nan_is_null) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _PadOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, width, padding) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _PartitionNthOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, pivot, null_placement) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _QuantileOptions(FunctionOptions): - _interp_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, quantiles, interp, skip_nulls, min_count) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RandomOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, initializer) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RankOptions(FunctionOptions): - _tiebreaker_map: ClassVar[dict] = ... 
- def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, sort_keys, null_placement, tiebreaker) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ReplaceSliceOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, start, stop, replacement) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ReplaceSubstringOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, pattern, replacement, max_replacements) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RoundOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, ndigits, round_mode) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RoundTemporalOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options( +class WeekOptions(FunctionOptions): + def __init__( self, - multiple, - unit, - week_starts_monday, - ceil_is_strictly_greater, - calendar_based_origin, - ) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RoundToMultipleOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, multiple, round_mode) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ScalarAggregateOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, skip_nulls, min_count) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SelectKOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, k, sort_keys) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SetLookupOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, value_set, boolskip_nulls) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SliceOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, start, stop, step) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SortOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, sort_keys, null_placement) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SplitOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, max_splits, reverse) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _SplitPatternOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, pattern, max_splits, reverse) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _StrftimeOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, format, locale) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _StrptimeOptions(FunctionOptions): - _unit_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... 
- def _set_options(self, format, unit, error_is_null) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _StructFieldOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, indices) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _TDigestOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, quantiles, delta, buffer_size, skip_nulls, min_count) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _TakeOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, boundscheck) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _TrimOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, characters) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _Utf8NormalizeOptions(FunctionOptions): - _form_map: ClassVar[dict] = ... - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, form) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _VarianceOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options(self, ddof, skip_nulls, min_count) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _WeekOptions(FunctionOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_options( - self, week_starts_monday, count_from_zero, first_week_is_fully_in_year - ) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ordered_dict: - __hash__: ClassVar[None] = ... # type: ignore - def __init__(self, *args, **kwargs) -> None: ... - def clear(self, *args, **kwargs) -> Any: ... - def copy(self) -> dict: ... - @classmethod - def fromkeys(cls, *args, **kwargs) -> Any: ... - def get(self, *args, **kwargs) -> Any: ... - def items(self, *args, **kwargs) -> Any: ... - def keys(self, *args, **kwargs) -> Any: ... - def pop(self, *args, **kwargs) -> Any: ... - def popitem(self, *args, **kwargs) -> Any: ... - def setdefault(self, *args, **kwargs) -> Any: ... - def update(self, *args, **kwargs) -> Any: ... - def values(self, *args, **kwargs) -> Any: ... - @classmethod - def __class_getitem__(cls, *args, **kwargs) -> Any: ... - def __contains__(self, other) -> Any: ... - def __delitem__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getitem__(self, y) -> Any: ... - def __gt__(self, other) -> Any: ... - def __ior__(self, other) -> Any: ... - def __iter__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __len__(self) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __or__(self, other) -> Any: ... - def __reversed__(self) -> Any: ... - def __ror__(self, other) -> Any: ... - def __setitem__(self, index, object) -> Any: ... - def __sizeof__(self) -> Any: ... - -def __pyx_unpickle_Kernel(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def _deserialize(Bufferbuffer) -> Any: ... -def _get_scalar_udf_context(memory_pool, batch_length) -> Any: ... -def _group_by(args, keys, aggregations) -> Any: ... -def _min_count_doc(*args, **kwargs) -> Any: ... 
-def _raise_invalid_function_option(*args, **kwargs) -> Any: ... -def _skip_nulls_doc() -> Any: ... -def call_function(name, args, options=..., memory_pool=..., length=...) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def function_registry() -> Any: ... -def get_function(name) -> Any: ... -def list_functions() -> Any: ... -def register_scalar_function(func, function_name, function_doc, in_types, out_type) -> Any: ... -def tobytes(o) -> Any: ... + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: ... +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: ... +def list_functions() -> list[str]: ... + +# ==================== _compute.pyx Udf ==================== + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: ... + +class _FunctionDoc(TypedDict): + summary: str + description: str + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +): ... +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +): ... +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +): ... + +class UdfContext: + @property + def batch_length(self) -> int: ... + @property + def memory_pool(self) -> lib.MemoryPool: ... + +# ==================== _compute.pyx Expression ==================== +class Expression(lib._Weakrefable): + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ... + def to_substrait( + self, schema: lib.Schema, allow_arrow_extensions: bool = False + ) -> lib.Buffer: ... + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: ... + def is_null(self, nan_is_null: bool = False) -> Expression: ... + def is_nan(self) -> Expression: ... + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: ... + def isin(self, values: lib.Array | Iterable) -> Expression: ... 
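A short usage sketch of the compute surface typed above (illustrative only): option instances are passed to call_function or the generated wrappers, and Expression builds lazy predicates for pyarrow.dataset filters:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([3, 1, None, 2])
n_valid = pc.call_function("count", [arr], options=pc.CountOptions(mode="only_valid"))
order = pc.call_function("array_sort_indices", [arr], options=pc.ArraySortOptions(order="descending"))
print(n_valid, order.to_pylist())  # 3 [0, 3, 1, 2]

expr = (pc.field("x") > 0) & pc.field("y").is_valid()
print(expr)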
+ +# ==================== _compute.py ==================== diff --git a/pyarrow-stubs/_compute_docstrings.pyi b/pyarrow-stubs/_compute_docstrings.pyi deleted file mode 100644 index 393ad543dc9..00000000000 --- a/pyarrow-stubs/_compute_docstrings.pyi +++ /dev/null @@ -1,7 +0,0 @@ -from typing_extensions import TypedDict - -class _FunctionDocAdditions(TypedDict): - filter: str - mode: str - -function_doc_additions: _FunctionDocAdditions diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 25faaa0ddf9..67fe4ba6567 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -1,172 +1,98 @@ -from typing import Any -from typing import ClassVar -from typing import overload +from dataclasses import dataclass, field +from pathlib import Path +from typing import IO, Callable, Literal -import pyarrow.lib +from . import lib -ISO8601: _ISO8601 -_stringify_path: function -namedtuple: function +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" -class CSVStreamingReader(pyarrow.lib.RecordBatchReader): - schema: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def validate(self) -> None: ... -class CSVWriter(pyarrow.lib._CRecordBatchWriter): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None -class ConvertOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - __slots__: ClassVar[tuple] = ... - auto_dict_encode: Any - auto_dict_max_cardinality: Any - check_utf8: Any - column_types: Any - decimal_point: Any - false_values: Any - include_columns: Any - include_missing_columns: Any - null_values: Any - quoted_strings_can_be_null: Any - strings_can_be_null: Any - timestamp_parsers: Any - true_values: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, ConvertOptionsother) -> Any: ... - def validate(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getstate__(self) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce_cython__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... + def validate(self) -> None: ... -class InvalidRow(_InvalidRow): - __slots__: ClassVar[tuple] = ... +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + check_utf8: bool = field(default=True, kw_only=False) + check_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." 
+ strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None -class ParseOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - __slots__: ClassVar[tuple] = ... - delimiter: Any - double_quote: Any - escape_char: Any - ignore_empty_lines: Any - invalid_row_handler: Any - newlines_in_values: Any - quote_char: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, ParseOptionsother) -> Any: ... - def validate(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getstate__(self) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce_cython__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... + def validate(self) -> None: ... -class ReadOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - __slots__: ClassVar[tuple] = ... - autogenerate_column_names: Any - block_size: Any - column_names: Any - encoding: Any - skip_rows: Any - skip_rows_after_names: Any - use_threads: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, ReadOptionsother) -> Any: ... - def validate(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getstate__(self) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce_cython__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" -class SignalStopHandler: - stop_token: Any - def __init__(self, *args, **kwargs) -> None: ... - def _init_signals(self) -> Any: ... - def __enter__(self) -> Any: ... - def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def validate(self) -> None: ... -class WriteOptions(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - batch_size: Any - delimiter: Any - include_header: Any - def __init__(self, *args, **kwargs) -> None: ... - def validate(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +@dataclass +class InvalidRow(lib._Weakrefable): + expected_columns: int + actual_columns: int + number: int | None + text: str -class _ISO8601(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - __slots__: ClassVar[tuple] = ... - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
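A sketch of how the option classes typed in this hunk are passed to `pyarrow.csv`; the file path and column names are hypothetical, and note that on the pyarrow side the per-column type mapping is the `column_types` argument of `ConvertOptions`.

```python
import pyarrow as pa
from pyarrow import csv

read_opts = csv.ReadOptions(block_size=1 << 20, skip_rows=1, encoding="utf8")
parse_opts = csv.ParseOptions(delimiter=";", newlines_in_values=False)
convert_opts = csv.ConvertOptions(
    column_types={"id": pa.int64(), "price": pa.float64()},  # hypothetical columns
    null_values=["", "NA"],
    strings_can_be_null=True,
)

table = csv.read_csv(
    "data.csv",                                              # hypothetical path
    read_options=read_opts,
    parse_options=parse_opts,
    convert_options=convert_opts,
)

# Round-trip back to disk with explicit write options.
csv.write_csv(table, "out.csv", csv.WriteOptions(include_header=True, delimiter=","))
```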
+class CSVWriter(lib._CRecordBatchWriter): + def __init__( + self, + # TODO: OutputStream + sink: str | Path | IO, + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... -class _InvalidRow(tuple): - _asdict: ClassVar[function] = ... - _field_defaults: ClassVar[dict] = ... - _fields: ClassVar[tuple] = ... - _replace: ClassVar[function] = ... - __getnewargs__: ClassVar[function] = ... - __match_args__: ClassVar[tuple] = ... - __slots__: ClassVar[tuple] = ... - actual_columns: Any - expected_columns: Any - number: Any - text: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _make(cls, *args, **kwargs) -> Any: ... +class CSVStreamingReader(lib.RecordBatchReader): ... + +ISO8601: lib._Weakrefable -def __pyx_unpickle__ISO8601(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... def open_csv( - input_file, - read_options=..., - parse_options=..., - convert_options=..., - MemoryPoolmemory_pool=..., -) -> Any: ... -@overload + input_file: str | Path | IO, + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: ... def read_csv( - input_file, - read_options=..., - parse_options=..., - convert_options=..., - MemoryPoolmemory_pool=..., -) -> Any: ... -@overload -def read_csv(source) -> Any: ... -def tobytes(o) -> Any: ... -def write_csv(data, output_file, write_options=..., MemoryPoolmemory_pool=...) -> Any: ... + input_file: str | Path | IO, + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: ... +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: str | Path | lib.NativeFile | IO, + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: ... diff --git a/pyarrow-stubs/_cuda.pyi b/pyarrow-stubs/_cuda.pyi new file mode 100644 index 00000000000..80a911b6f92 --- /dev/null +++ b/pyarrow-stubs/_cuda.pyi @@ -0,0 +1,94 @@ +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + +class Context(lib._Weakrefable): + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: ... + def to_numba(self) -> _numba_driver.Context: ... + @staticmethod + def get_num_devices() -> int: ... + @property + def device_number(self) -> int: ... + @property + def handle(self) -> int: ... + def synchronize(self) -> None: ... + @property + def bytes_allocated(self) -> int: ... + def get_device_address(self, address: int) -> int: ... + def new_buffer(self, nbytes: int) -> CudaBuffer: ... + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: ... + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: ... + def buffer_from_object(self, obj: Any) -> CudaBuffer: ... 
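The `Context`/`CudaBuffer` surface above is only importable from a CUDA-enabled pyarrow build; a minimal sketch, assuming device 0 exists.

```python
from pyarrow import cuda  # requires a pyarrow build with CUDA support

ctx = cuda.Context(0)                   # device number 0, assumed present
scratch = ctx.new_buffer(64)            # raw 64-byte device allocation

# Copy host bytes to the device and back again.
host_data = b"hello, device"
dev_buf = ctx.buffer_from_data(host_data)
round_tripped = dev_buf.copy_to_host()  # returns an ordinary pyarrow Buffer
assert round_tripped.to_pybytes() == host_data
```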
+ +class IpcMemHandle(lib._Weakrefable): + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... + +class CudaBuffer(lib.Buffer): + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... + def to_numba(self) -> _numba_driver.MemoryPointer: ... + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: ... + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: ... + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: ... + def export_for_ipc(self) -> IpcMemHandle: ... + @property + def context(self) -> Context: ... + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... + def to_pybytes(self) -> bytes: ... + +class HostBuffer(lib.Buffer): + @property + def size(self) -> int: ... + +class BufferReader(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... + +class BufferWriter(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: ... + @property + def buffer_size(self) -> int: ... + @buffer_size.setter + def buffer_size(self, buffer_size: int): ... + @property + def num_bytes_buffered(self) -> int: ... + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: ... +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: ... diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 4780a3457d7..d91a48d787b 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -1,360 +1,523 @@ -import importlib._bootstrap # type: ignore +from pathlib import Path +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + Self, + TypeVar, + overload, +) + +from . import _csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + +class Dataset(lib._Weakrefable): + @property + def partition_expression(self) -> Expression: ... + def replace_schema(self, schema: lib.Schema) -> None: ... + def get_fragments(self, filter: Expression | None = None): ... + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... 
+ def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + def to_table( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + @property + def schema(self) -> lib.Schema: ... + def filter(self, expression: Expression) -> Self: ... + def sort_by(self, sorting: Order | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + def join( + self, + right_dataset: Dataset, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> InMemoryDataset: ... + def join_asof( + self, + right_dataset: Dataset, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> InMemoryDataset: ... -from typing import Any -from typing import ClassVar -from typing import overload +class InMemoryDataset(Dataset): ... -import pyarrow.lib +class UnionDataset(Dataset): + @property + def children(self) -> list[Dataset]: ... -_DEFAULT_BATCH_READAHEAD: int -_DEFAULT_BATCH_SIZE: int -_DEFAULT_FRAGMENT_READAHEAD: int -_dataset_pq: bool -_is_iterable: function -_is_path_like: function -_orc_fileformat: None -_orc_imported: bool -_stringify_path: function +class FileSystemDataset(Dataset): + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: FileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: FileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: ... + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: ... 
+ @property + def files(self) -> list[str]: ... + @property + def format(self) -> FileFormat: ... + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: str | Path | IO, filesystem: FileSystem | None = None + ) -> lib.Schema: ... + def make_fragment( + self, + file: str | Path | IO, + filesystem: FileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + +class Fragment(lib._Weakrefable): + @property + def physical_schema(self) -> lib.Schema: ... + @property + def partition_expression(self) -> Expression: ... + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + def to_table( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + +class FileFragment(Fragment): + def open(self) -> lib.NativeFile: ... + @property + def path(self) -> str: ... + @property + def filesystem(self) -> FileSystem: ... + @property + def buffer(self) -> lib.Buffer: ... + @property + def format(self) -> FileFormat: ... + +class FragmentScanOptions(lib._Weakrefable): + @property + def type_name(self) -> str: ... 
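A sketch of the `Dataset` scanning surface typed above, driven through the `pyarrow.dataset` factory; the directory and column names are hypothetical.

```python
import pyarrow.dataset as ds
import pyarrow.compute as pc

dataset = ds.dataset("data/", format="parquet")   # hypothetical directory of files

# Projection and filtering are pushed down; only matching columns/row groups are read.
table = dataset.to_table(
    columns=["id", "value"],                      # hypothetical columns
    filter=pc.field("value") > 0,
)

# Or stream RecordBatches instead of materializing a single Table.
for batch in dataset.to_batches(batch_size=65_536):
    ...  # process each RecordBatch incrementally
```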
+ +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... -class ArrowTypeError(TypeError, pyarrow.lib.ArrowException): ... +class FeatherFileFormat(IpcFileFormat): ... class CsvFileFormat(FileFormat): - __slots__: ClassVar[tuple] = ... - _read_options_py: Any - parse_options: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, CsvFileFormatother) -> Any: ... - def make_write_options(self, **kwargs) -> Any: ... - def __reduce__(self) -> Any: ... + def __init__( + self, + parse_options: _csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: _csv.ConvertOptions | None = None, + read_options: _csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> _csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + convert_options: _csv.ConvertOptions + read_options: _csv.ReadOptions + + def __init__( + self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... class CsvFileWriteOptions(FileWriteOptions): - write_options: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + write_options: _csv.WriteOptions -class CsvFragmentScanOptions(FragmentScanOptions): - __slots__: ClassVar[tuple] = ... - convert_options: Any - read_options: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, CsvFragmentScanOptionsother) -> Any: ... - def __reduce__(self) -> Any: ... - -class Dataset(pyarrow.lib._Weakrefable): - partition_expression: Any - schema: Any - def __init__(self, *args, **kwargs) -> None: ... - def count_rows(self, **kwargs) -> Any: ... - def get_fragments(self, Expressionfilter=...) -> Any: ... - def head(self, intnum_rows, **kwargs) -> Any: ... - def join( +class JsonFileFormat(FileFormat): + def __init__( self, - right_dataset, - keys, - right_keys=..., - join_type=..., - left_suffix=..., - right_suffix=..., - coalesce_keys=..., - use_threads=..., - ) -> Any: ... - def replace_schema(self, Schemaschema) -> Any: ... - @overload - def scanner(self, **kwargs) -> Any: ... - @overload - def scanner(self, columns=...) -> Any: ... - @overload - def scanner(self, filter=...) -> Any: ... - def take(self, indices, **kwargs) -> Any: ... - def to_batches(self, **kwargs) -> Any: ... - def to_table(self, **kwargs) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class DatasetFactory(pyarrow.lib._Weakrefable): - root_partition: Any - def __init__(self, *args, **kwargs) -> None: ... - def finish(self, Schemaschema=...) -> Any: ... - def inspect(self) -> Any: ... - def inspect_schemas(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
+ default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + +class JsonFragmentScanOptions(FragmentScanOptions): + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + def __init__( + self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: ... + @property + def schema(self) -> lib.Schema: ... + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[lib.Array | None]: ... class DirectoryPartitioning(KeyValuePartitioning): - def __init__(self, *args, **kwargs) -> None: ... + @staticmethod def discover( - self, - field_names=..., - infer_dictionary=..., - max_partition_dictionary_size=..., - schema=..., - segment_encoding=..., - ) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FeatherFileFormat(IpcFileFormat): - default_extname: Any - def __init__(self, *args, **kwargs) -> None: ... - -class FileFormat(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - default_extname: Any - default_fragment_scan_options: Any - def __init__(self, *args, **kwargs) -> None: ... - def inspect(self, file, filesystem=...) -> Any: ... - def make_fragment(self, file, filesystem=..., Expressionpartition_expression=...) -> Any: ... - def make_write_options(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileFragment(Fragment): - buffer: Any - filesystem: Any - format: Any - path: Any - def __init__(self, *args, **kwargs) -> None: ... - def open(self) -> Any: ... - def __reduce__(self) -> Any: ... - -class FileSystemDataset(Dataset): - files: Any - filesystem: Any - format: Any - partitioning: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def from_paths( - cls, - typecls, - paths, - schema=..., - format=..., - filesystem=..., - partitions=..., - root_partition=..., - ) -> Any: ... - def __reduce__(self) -> Any: ... + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... -class FileSystemDatasetFactory(DatasetFactory): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileSystemFactoryOptions(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - exclude_invalid_files: Any - partition_base_dir: Any - partitioning: Any - partitioning_factory: Any - selector_ignore_prefixes: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileWriteOptions(pyarrow.lib._Weakrefable): - format: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... 
- def __setstate__(self, state) -> Any: ... +class HivePartitioning(KeyValuePartitioning): + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... class FilenamePartitioning(KeyValuePartitioning): - def __init__(self, *args, **kwargs) -> None: ... + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod def discover( - self, field_names=..., infer_dictionary=..., schema=..., segment_encoding=... - ) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class Fragment(pyarrow.lib._Weakrefable): - partition_expression: Any - physical_schema: Any - def __init__(self, *args, **kwargs) -> None: ... - def count_rows(self, **kwargs) -> Any: ... - def head(self, intnum_rows, **kwargs) -> Any: ... - def scanner(self, Schemaschema=..., **kwargs) -> Any: ... - def take(self, indices, **kwargs) -> Any: ... - def to_batches(self, Schemaschema=..., **kwargs) -> Any: ... - def to_table(self, Schemaschema=..., **kwargs) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FragmentScanOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - type_name: Any - def __init__(self, *args, **kwargs) -> None: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + +class DatasetFactory(lib._Weakrefable): + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: ... + def inspect(self) -> lib.Schema: ... + def inspect_schemas(self) -> list[lib.Schema]: ... + +class FileSystemFactoryOptions(lib._Weakrefable): + partitioning: Partitioning + partitioning_factory: PartitioningFactory + partition_base_dir: str + exclude_invalid_files: bool + selector_ignore_prefixes: list[str] + + def __init__( + self, + artition_base_dir: str | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + exclude_invalid_files: bool = True, + selector_ignore_prefixes: list[str] | None = None, + ) -> None: ... -class HivePartitioning(KeyValuePartitioning): - def __init__(self, *args, **kwargs) -> None: ... - def discover( +class FileSystemDatasetFactory(DatasetFactory): + def __init__( self, - infer_dictionary=..., - max_partition_dictionary_size=..., - null_fallback=..., - schema=..., - segment_encoding=..., - ) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class InMemoryDataset(Dataset): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
+ filesystem: FileSystem, + paths_or_selector: FileSelector, + format: FileFormat, + options: FileSystemFactoryOptions | None = None, + ) -> None: ... -class IpcFileFormat(FileFormat): - default_extname: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, IpcFileFormatother) -> Any: ... - def __reduce__(self) -> Any: ... +class UnionDatasetFactory(DatasetFactory): + def __init__(self, factories: list[DatasetFactory]) -> None: ... -class IpcFileWriteOptions(FileWriteOptions): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) -class KeyValuePartitioning(Partitioning): - dictionaries: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class Partitioning(pyarrow.lib._Weakrefable): - schema: Any - def __init__(self, *args, **kwargs) -> None: ... - def parse(self, path) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class PartitioningFactory(pyarrow.lib._Weakrefable): - type_name: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class RecordBatchIterator(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def __iter__(self) -> Any: ... - def __next__(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class Scanner(pyarrow.lib._Weakrefable): - dataset_schema: Any - projected_schema: Any - def __init__(self, *args, **kwargs) -> None: ... - def count_rows(self) -> Any: ... - def from_batches( - self, - source, - Schemaschema=..., - booluse_threads=..., - use_async=..., - MemoryPoolmemory_pool=..., - columns=..., - Expressionfilter=..., - intbatch_size=..., - FragmentScanOptionsfragment_scan_options=..., - ) -> Any: ... +class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + def __iter__(self) -> Self: ... + def __next__(self) -> _RecordBatchT: ... + +class TaggedRecordBatch(NamedTuple): + record_batch: lib.RecordBatch + fragment: Fragment + +class TaggedRecordBatchIterator(lib._Weakrefable): + def __iter__(self) -> Self: ... + def __next__(self) -> TaggedRecordBatch: ... + +class Scanner(lib._Weakrefable): + @staticmethod def from_dataset( - self, - Datasetdataset, - booluse_threads=..., - use_async=..., - MemoryPoolmemory_pool=..., - columns=..., - Expressionfilter=..., - intbatch_size=..., - intbatch_readahead=..., - intfragment_readahead=..., - FragmentScanOptionsfragment_scan_options=..., - ) -> Any: ... + dataset: Dataset, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @staticmethod def from_fragment( - self, - Fragmentfragment, - Schemaschema=..., - booluse_threads=..., - use_async=..., - MemoryPoolmemory_pool=..., - columns=..., - Expressionfilter=..., - intbatch_size=..., - intbatch_readahead=..., - FragmentScanOptionsfragment_scan_options=..., - ) -> Any: ... - def head(self, intnum_rows) -> Any: ... - def scan_batches(self) -> Any: ... - def take(self, indices) -> Any: ... - def to_batches(self) -> Any: ... 
- def to_reader(self) -> Any: ... - def to_table(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class TaggedRecordBatch(importlib._bootstrap.TaggedRecordBatch): ... - -class TaggedRecordBatchIterator(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def __iter__(self) -> Any: ... - def __next__(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class UnionDataset(Dataset): - children: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @overload + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch], + *, + schema: lib.Schema, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @overload + @staticmethod + def from_batches( + source: RecordBatchReader, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @property + def dataset_schema(self) -> lib.Schema: ... + @property + def projected_schema(self) -> lib.Schema: ... + def to_batches(self) -> Iterator[lib.RecordBatch]: ... + def scan_batches(self) -> TaggedRecordBatchIterator: ... + def to_table(self) -> lib.Table: ... + def take(self, indices: Indices) -> lib.Table: ... + def head(self, num_rows: int) -> lib.Table: ... + def count_rows(self) -> int: ... + def to_reader(self) -> RecordBatchReader: ... + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... + +class WrittenFile(lib._Weakrefable): + def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... -class UnionDatasetFactory(DatasetFactory): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class WrittenFile(pyarrow.lib._Weakrefable): - metadata: Any - path: Any - size: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def __pyx_unpickle_WrittenFile(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... def _filesystemdataset_write( - Scannerdata, - base_dir, - unicodebasename_template, - FileSystemfilesystem, - Partitioningpartitioning, - FileWriteOptionsfile_options, - intmax_partitions, - file_visitor, - unicodeexisting_data_behavior, - intmax_open_files, - intmax_rows_per_file, - intmin_rows_per_group, - intmax_rows_per_group, - boolcreate_dir, -) -> Any: ... -def _forbid_instantiation(klass, subclasses_instead=...) -> Any: ... -def _get_orc_fileformat() -> Any: ... -def _get_parquet_classes() -> Any: ... 
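A sketch tying together the `Scanner` construction typed above with the dataset write path whose `WrittenFile` result type is stubbed above, using the public `ds.write_dataset` helper; the paths and the partition column are hypothetical.

```python
import pyarrow.dataset as ds
import pyarrow.compute as pc

dataset = ds.dataset("data/", format="parquet")            # hypothetical source

# Explicit scanner construction mirrors Scanner.from_dataset above.
scanner = ds.Scanner.from_dataset(
    dataset,
    columns=["id", "year", "value"],                       # hypothetical columns
    filter=pc.field("year") >= 2020,
    batch_size=131_072,
)
table = scanner.to_table()

# Re-write the selection as a Hive-partitioned tree (year=2020/, year=2021/, ...).
ds.write_dataset(
    table,
    "partitioned/",                                        # hypothetical output dir
    format="parquet",
    partitioning=ds.partitioning(table.select(["year"]).schema, flavor="hive"),
    existing_data_behavior="overwrite_or_ignore",
)
```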
-def _get_parquet_symbol(name) -> Any: ... -def _get_partition_keys(Expressionpartition_expression) -> Any: ... -def _pc() -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... + data: Scanner, + base_dir: str | Path, + basename_template: str, + filesystem: FileSystem, + partitioning: Partitioning, + file_options: FileWriteOptions, + max_partitions: int, + file_visitor: Callable[[str], None], + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], + max_open_files: int, + max_rows_per_file: int, + min_rows_per_group: int, + max_rows_per_group: int, + create_dir: bool, +): ... + +class _ScanNodeOptions(ExecNodeOptions): + def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + +class ScanNodeOptions(_ScanNodeOptions): + def __init__(self, dataset: Dataset, **kwargs) -> None: ... diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi index a22e63d6208..9c4ac04198f 100644 --- a/pyarrow-stubs/_dataset_orc.pyi +++ b/pyarrow-stubs/_dataset_orc.pyi @@ -1,9 +1,6 @@ -from typing import Any +from ._dataset import FileFormat -import pyarrow._dataset - -class OrcFileFormat(pyarrow._dataset.FileFormat): - default_extname: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, OrcFileFormatother) -> Any: ... - def __reduce__(self) -> Any: ... +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index 481bad265b6..ce1a8403476 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -1,96 +1,124 @@ -from typing import Any -from typing import ClassVar +from dataclasses import dataclass +from pathlib import Path +from typing import IO, Any, Iterable, TypedDict -import pyarrow._dataset -import pyarrow.lib +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import FileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from .lib import CacheOptions, Schema, _Weakrefable -_is_path_like: function -_stringify_path: function +parquet_encryption_enabled: bool -class ParquetDatasetFactory(pyarrow._dataset.DatasetFactory): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ParquetFactoryOptions(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - partition_base_dir: Any - partitioning: Any - partitioning_factory: Any - validate_column_chunk_paths: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ParquetFileFormat(pyarrow._dataset.FileFormat): - default_extname: Any - read_options: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, ParquetFileFormatother) -> Any: ... +class ParquetFileFormat(FileFormat): + def __init__( + self, + read_options: ParquetReadOptions, + default_fragment_scan_options: ParquetFragmentScanOptions, + **kwargs, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options(self) -> ParquetFileWriteOptions: ... 
# type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... def make_fragment( - self, file, filesystem=..., Expressionpartition_expression=..., row_groups=... - ) -> Any: ... - def make_write_options(self, **kwargs) -> Any: ... - def __reduce__(self) -> Any: ... + self, + file: IO | Path | str, + filesystem: FileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... -class ParquetFileFragment(pyarrow._dataset.FileFragment): - metadata: Any - num_row_groups: Any - row_groups: Any - def __init__(self, *args, **kwargs) -> None: ... - def ensure_complete_metadata(self) -> Any: ... - def split_by_row_group(self, Expressionfilter=..., Schemaschema=...) -> Any: ... - def subset(self, Expressionfilter=..., Schemaschema=..., row_group_ids=...) -> Any: ... - def __reduce__(self) -> Any: ... +class _NameStats(TypedDict): + min: Any + max: Any -class ParquetFileWriteOptions(pyarrow._dataset.FileWriteOptions): - def __init__(self, *args, **kwargs) -> None: ... - def _set_arrow_properties(self) -> Any: ... - def _set_properties(self) -> Any: ... - def update(self, **kwargs) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ParquetFragmentScanOptions(pyarrow._dataset.FragmentScanOptions): - __slots__: ClassVar[tuple] = ... - buffer_size: Any - pre_buffer: Any - thrift_container_size_limit: Any - thrift_string_size_limit: Any - use_buffered_stream: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _reconstruct(cls, typecls, kwargs) -> Any: ... - def equals(self, ParquetFragmentScanOptionsother) -> Any: ... - def __reduce__(self) -> Any: ... +class RowGroupInfo: + id: int + metadata: FileMetaData + schema: Schema -class ParquetReadOptions(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - _coerce_int96_timestamp_unit: Any - coerce_int96_timestamp_unit: Any - dictionary_columns: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, ParquetReadOptionsother) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def statistics(self) -> dict[str, _NameStats]: ... -class RowGroupInfo: - __hash__: ClassVar[None] = ... # type: ignore - def __init__(self, id, metadata, schema) -> None: ... - def __eq__(self, other) -> Any: ... +class ParquetFileFragment(FileFragment): + def ensure_complete_metadata(self) -> None: ... @property - def num_rows(self) -> Any: ... + def row_groups(self) -> list[RowGroupInfo]: ... @property - def statistics(self) -> Any: ... + def metadata(self) -> FileMetaData: ... + @property + def num_row_groups(self) -> int: ... + def split_by_row_group( + self, filter: Expression | None = None, schema: Schema | None = None + ) -> list[Fragment]: ... + def subset( + self, + filter: Expression | None = None, + schema: Schema | None = None, + row_group_ids: list[int] | None = None, + ) -> ParquetFileFormat: ... 
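A sketch of the Parquet-specific format, scan-option, and fragment classes typed above; the directory and the filter column are hypothetical.

```python
import pyarrow.dataset as ds
import pyarrow.compute as pc

parquet_format = ds.ParquetFileFormat(
    read_options=ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms"),
    default_fragment_scan_options=ds.ParquetFragmentScanOptions(pre_buffer=True),
)
dataset = ds.dataset("data/", format=parquet_format)       # hypothetical directory

# Inspect row-group metadata and split fragments along row-group boundaries.
for fragment in dataset.get_fragments(filter=pc.field("value") > 0):
    fragment.ensure_complete_metadata()
    print(fragment.path, fragment.num_row_groups)
    for piece in fragment.split_by_row_group():
        ...  # each piece scans exactly one row group
```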
+ +class ParquetReadOptions(_Weakrefable): + def __init__( + self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None + ) -> None: ... @property - def total_byte_size(self) -> Any: ... + def coerce_int96_timestamp_unit(self) -> str: ... + @coerce_int96_timestamp_unit.setter + def coerce_int96_timestamp_unit(self, unit: str) -> None: ... + def equals(self, other: ParquetReadOptions) -> bool: ... + +class ParquetFileWriteOptions(FileWriteOptions): + def update(self, **kwargs) -> None: ... + def _set_properties(self) -> None: ... + def _set_arrow_properties(self) -> None: ... + def _set_encryption_config(self) -> None: ... + +@dataclass(kw_only=True) +class ParquetFragmentScanOptions(FragmentScanOptions): + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False -def __pyx_unpickle_ParquetReadOptions(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class ParquetDatasetFactory(DatasetFactory): + def __init__( + self, + metadata_path: str, + filesystem: FileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/pyarrow-stubs/_dataset_parquet_encryption.pyi b/pyarrow-stubs/_dataset_parquet_encryption.pyi new file mode 100644 index 00000000000..2072333daf1 --- /dev/null +++ b/pyarrow-stubs/_dataset_parquet_encryption.pyi @@ -0,0 +1,33 @@ +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig +from .lib import _Weakrefable + +class ParquetEncryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +class ParquetDecryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/pyarrow-stubs/_exec_plan.pyi b/pyarrow-stubs/_exec_plan.pyi deleted file mode 100644 index f20551c1ecb..00000000000 --- a/pyarrow-stubs/_exec_plan.pyi +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Any - -import pyarrow._dataset - -class InMemoryDataset(pyarrow._dataset.Dataset): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def _filter_table(table, expression, output_type=...) -> Any: ... 
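The encryption stubs above plug into the Parquet scan options roughly as sketched below; this assumes an encryption-enabled pyarrow build where `ParquetDecryptionConfig` is importable from `pyarrow.dataset`, the `kms_factory` callable is a user-supplied stand-in, and on the pyarrow side the decryption variant is built from a `DecryptionConfiguration`.

```python
import pyarrow.dataset as ds
import pyarrow.parquet.encryption as pe

def kms_factory(kms_connection_config):
    # Stand-in: a real deployment returns a pe.KmsClient implementation here.
    raise NotImplementedError

crypto_factory = pe.CryptoFactory(kms_factory)
kms_config = pe.KmsConnectionConfig()
decryption_config = ds.ParquetDecryptionConfig(
    crypto_factory, kms_config, pe.DecryptionConfiguration()
)

# Attach the decryption config to the fragment scan options, as typed above.
scan_options = ds.ParquetFragmentScanOptions(decryption_config=decryption_config)
fmt = ds.ParquetFileFormat(default_fragment_scan_options=scan_options)
```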
-def _perform_join( - join_type, - left_operand, - left_keys, - right_operand, - right_keys, - left_suffix=..., - right_suffix=..., - use_threads=..., - coalesce_keys=..., - output_type=..., -) -> Any: ... -def tobytes(o) -> Any: ... diff --git a/pyarrow-stubs/_feather.pyi b/pyarrow-stubs/_feather.pyi index 17f697dac72..4abc96b55ae 100644 --- a/pyarrow-stubs/_feather.pyi +++ b/pyarrow-stubs/_feather.pyi @@ -1,20 +1,28 @@ -from typing import Any +from pathlib import Path +from typing import IO -import pyarrow.lib +from .lib import Buffer, NativeFile, Table, _Weakrefable class FeatherError(Exception): ... -class FeatherReader(pyarrow.lib._Weakrefable): - version: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def read(self) -> Any: ... - def read_indices(self, indices) -> Any: ... - def read_names(self, names) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def tobytes(o) -> Any: ... def write_feather( - Tabletable, dest, compression=..., compression_level=..., chunksize=..., version=... -) -> Any: ... + table: Table, + dest: str | IO | Path | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: str | IO | Path | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index f2e4c20944e..ed5b38b619d 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -1,587 +1,435 @@ +import asyncio import enum -import importlib._bootstrap # type: ignore -import re -from typing import Any -from typing import ClassVar +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, Self, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: ... + +class CertKeyPair(NamedTuple): + cert: str + key: str -import pyarrow.lib +class FlightError(Exception): + extra_info: str + +class FlightInternalError(FlightError, ArrowException): ... +class FlightTimedOutError(FlightError, ArrowException): ... +class FlightCancelledError(FlightError, ArrowCancelled): ... +class FlightServerError(FlightError, ArrowException): ... +class FlightUnauthenticatedError(FlightError, ArrowException): ... +class FlightUnauthorizedError(FlightError, ArrowException): ... +class FlightUnavailableError(FlightError, ArrowException): ... + +class FlightWriteSizeExceededError(ArrowInvalid): + limit: int + actual: int + +class Action(_Weakrefable): + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: ... + @property + def type(self) -> str: ... + @property + def body(self) -> Buffer: ... + def serialize(self) -> bytes: ... 
+ @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... -from pyarrow.lib import Schema +class ActionType(NamedTuple): + type: str + description: str -_FLIGHT_SERVER_ERROR_REGEX: re.Pattern -_get_legacy_format_default: function + def make_action(self, buf: Buffer | bytes) -> Action: ... -class Action(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - body: Any - type: Any - def __init__(self, *args, **kwargs) -> None: ... +class Result(_Weakrefable): + def __init__(self, buf: Buffer | bytes) -> None: ... + @property + def body(self) -> Buffer: ... + def serialize(self) -> bytes: ... @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ActionType(importlib._bootstrap._ActionType): - def make_action(self, buf) -> Any: ... - -class ArrowCancelled(pyarrow.lib.ArrowException): - def __init__(self, message, signum=...) -> None: ... - -class ArrowException(Exception): ... -class ArrowInvalid(ValueError, pyarrow.lib.ArrowException): ... - -class BasicAuth(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - password: Any - username: Any - def __init__(self, *args, **kwargs) -> None: ... - def deserialize(self, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class CallInfo(importlib._bootstrap._CallInfo): ... -class CertKeyPair(importlib._bootstrap._CertKeyPair): ... - -class ClientAuthHandler(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def authenticate(self, outgoing, incoming) -> Any: ... - def get_token(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ClientAuthReader(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def read(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ClientAuthSender(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def write(self, message) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ClientMiddleware(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def call_completed(self, exception) -> Any: ... - def received_headers(self, headers) -> Any: ... - def sending_headers(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ClientMiddlewareFactory(pyarrow.lib._Weakrefable): - def start_call(self, info) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def deserialize(cls, serialized: bytes) -> Self: ... + +class BasicAuth(_Weakrefable): + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: ... + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... 
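A small sketch of the `Action`/`BasicAuth` plumbing typed above; the action type and credentials are illustrative.

```python
from pyarrow import flight

# Actions are opaque (type, body) pairs dispatched to a server's do_action handler.
action = flight.Action("refresh-cache", b"")       # hypothetical action type

# BasicAuth credentials round-trip through their wire representation.
auth = flight.BasicAuth("reader", "secret")        # hypothetical credentials
payload = auth.serialize()
restored = flight.BasicAuth.deserialize(payload)
```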
class DescriptorType(enum.Enum): - CMD: ClassVar[DescriptorType] = ... - PATH: ClassVar[DescriptorType] = ... - UNKNOWN: ClassVar[DescriptorType] = ... - -class FlightCallOptions(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightCancelledError(FlightError, pyarrow.lib.ArrowCancelled): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightClient(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def authenticate( - self, auth_handler, FlightCallOptionsoptions: FlightCallOptions = ... - ) -> Any: ... - def authenticate_basic_token( - self, username, password, FlightCallOptionsoptions: FlightCallOptions = ... - ) -> Any: ... - def close(self) -> Any: ... + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + @staticmethod + def for_path(*path: str | bytes): ... + @staticmethod + def for_command(command: str | bytes): ... + @property + def descriptor_type(self) -> DescriptorType: ... + @property + def path(self) -> list[bytes] | None: ... + @property + def command(self) -> bytes | None: ... + def serialize(self) -> bytes: ... @classmethod - def connect( - cls, - typecls, - location, - tls_root_certs=..., - cert_chain=..., - private_key=..., - override_hostname=..., - disable_server_verification=..., - ) -> Any: ... - def do_action(self, action, FlightCallOptionsoptions: FlightCallOptions = ...) -> Any: ... - def do_exchange( - self, - FlightDescriptordescriptor: FlightDescriptor, - FlightCallOptionsoptions: FlightCallOptions = ..., - ) -> Any: ... - def do_get( - self, Ticketticket: Ticket, FlightCallOptionsoptions: FlightCallOptions = ... - ) -> Any: ... - def do_put( - self, - FlightDescriptordescriptor: FlightDescriptor, - Schemaschema: Schema, - FlightCallOptionsoptions: FlightCallOptions = ..., - ) -> Any: ... - def get_flight_info( - self, - FlightDescriptordescriptor: FlightDescriptor, - FlightCallOptionsoptions: FlightCallOptions = ..., - ) -> Any: ... - def get_schema( - self, - FlightDescriptordescriptor: FlightDescriptor, - FlightCallOptionsoptions: FlightCallOptions = ..., - ) -> Any: ... - def list_actions(self, FlightCallOptionsoptions: FlightCallOptions = ...) -> Any: ... - def list_flights( - self, - bytescriteria: bytes = ..., - FlightCallOptionsoptions: FlightCallOptions = ..., - ) -> Any: ... - def wait_for_available(self, timeout=...) -> Any: ... - def __del__(self) -> Any: ... - def __enter__(self) -> Any: ... - def __exit__(self, exc_type, exc_value, traceback) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightDataStream(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightDescriptor(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - command: bytes | None - descriptor_type: DescriptorType - path: list[bytes] | None - def __init__(self, *args, **kwargs) -> None: ... + def deserialize(cls, serialized: bytes) -> Self: ... 
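`FlightDescriptor` above identifies a dataset either by an opaque command or by a path; a minimal round-trip sketch with an illustrative command payload.

```python
from pyarrow import flight

cmd_descriptor = flight.FlightDescriptor.for_command(b'{"query": "SELECT 1"}')
path_descriptor = flight.FlightDescriptor.for_path("datasets", "example.parquet")

# Descriptors serialize to bytes and back, e.g. for caching or persistence.
wire = cmd_descriptor.serialize()
restored = flight.FlightDescriptor.deserialize(wire)
assert restored.command == cmd_descriptor.command
```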
+ +class Ticket(_Weakrefable): + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... @classmethod - def deserialize(cls, serialized: str | bytes) -> FlightDescriptor: ... + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... @staticmethod - def for_command(command: str | bytes) -> FlightDescriptor: ... + def for_grpc_tls(host: str | bytes, port: int) -> Location: ... @staticmethod - def for_path(*path: str | bytes) -> FlightDescriptor: ... + def for_grpc_unix(path: str | bytes) -> Location: ... + +class FlightEndpoint(_Weakrefable): + def __init__(self, ticket: Ticket | str | bytes, locations: list[str]): ... + @property + def ticket(self) -> Ticket: ... + @property + def locations(self) -> list[Location]: ... def serialize(self) -> bytes: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightEndpoint(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - locations: Any - ticket: Any - def __init__(self, *args, **kwargs) -> None: ... @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def deserialize(cls, serialized: bytes) -> Self: ... -class FlightError(Exception): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightInfo(pyarrow.lib._Weakrefable): - descriptor: Any - endpoints: Any - schema: Any - total_bytes: Any - total_records: Any - def __init__(self, *args, **kwargs) -> None: ... +class SchemaResult(_Weakrefable): + def __init__(self, schema: Schema) -> None: ... + @property + def schema(self) -> Schema: ... + def serialize(self) -> bytes: ... @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def deserialize(cls, serialized: bytes) -> Self: ... -class FlightInternalError(FlightError, pyarrow.lib.ArrowException): +class FlightInfo(_Weakrefable): + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int, + total_bytes: int, + ) -> None: ... + @property + def schema(self) -> Schema: ... + @property + def descriptor(self) -> FlightDescriptor: ... + @property + def endpoints(self) -> list[FlightEndpoint]: ... + @property + def total_records(self) -> int: ... + @property + def total_bytes(self) -> int: ... + def serialize(self) -> bytes: ... @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... 
- def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightMetadataReader(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def read(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightMetadataWriter(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def write(self, message) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightStreamChunk(_Weakrefable): + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: ... + def read_all(self) -> Table: ... + def read_chunk(self) -> FlightStreamChunk: ... + def to_reader(self) -> RecordBatchReader: ... + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): ... -class FlightMethod(enum.Enum): - DO_ACTION: ClassVar[FlightMethod] = ... - DO_EXCHANGE: ClassVar[FlightMethod] = ... - DO_GET: ClassVar[FlightMethod] = ... - DO_PUT: ClassVar[FlightMethod] = ... - GET_FLIGHT_INFO: ClassVar[FlightMethod] = ... - GET_SCHEMA: ClassVar[FlightMethod] = ... - HANDSHAKE: ClassVar[FlightMethod] = ... - INVALID: ClassVar[FlightMethod] = ... - LIST_ACTIONS: ClassVar[FlightMethod] = ... - LIST_FLIGHTS: ClassVar[FlightMethod] = ... - -class FlightServerBase(pyarrow.lib._Weakrefable): - port: Any - def __init__(self, *args, **kwargs) -> None: ... - def do_action(self, context, action) -> Any: ... - def do_exchange(self, context, descriptor, reader, writer) -> Any: ... - def do_get(self, context, ticket) -> Any: ... +class FlightStreamReader(MetadataRecordBatchReader): + def cancel(self): ... + def read_all(self) -> Table: ... + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... + def write_metadata(self, buf: Buffer) -> None: ... + def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: ... + def close(self) -> None: ... + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: ... + +class FlightStreamWriter(MetadataRecordBatchWriter): + def done_writing(self) -> None: ... + +class FlightMetadataReader(_Weakrefable): + def read(self) -> Buffer | None: ... + +class FlightMetadataWriter(_Weakrefable): + def write(self, message: Buffer) -> None: ... + +class AsyncioCall(Generic[_T]): + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... 
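A minimal sketch of the asyncio entry point annotated above, assuming a Flight server is reachable at grpc://localhost:8815 and that it understands the placeholder command:

import asyncio
import pyarrow.flight as flight

async def fetch_info() -> flight.FlightInfo:
    client = flight.connect("grpc://localhost:8815")
    if not client.supports_async:
        raise RuntimeError("this gRPC build has no async support")
    descriptor = flight.FlightDescriptor.for_command(b"example-query")
    # as_async() wraps the client; get_flight_info is awaitable on the wrapper.
    return await client.as_async().get_flight_info(descriptor)

info = asyncio.run(fetch_info())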
+ +class FlightClient(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: ... + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." + ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: ... + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: ... + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: ... + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: ... + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: ... + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: ... + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: ... + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: ... + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: ... def do_put( self, - context, - descriptor, - MetadataRecordBatchReaderreader: MetadataRecordBatchReader, - FlightMetadataWriterwriter: FlightMetadataWriter, - ) -> Any: ... - def get_flight_info(self, context, descriptor) -> Any: ... - def get_schema(self, context, descriptor) -> Any: ... - def list_actions(self, context) -> Any: ... - def list_flights(self, context, criteria) -> Any: ... - def run(self) -> Any: ... - def serve(self) -> Any: ... - def shutdown(self) -> Any: ... - def wait(self) -> Any: ... - def __enter__(self) -> Any: ... - def __exit__(self, exc_type, exc_value, traceback) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightServerError(FlightError, pyarrow.lib.ArrowException): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightStreamChunk(pyarrow.lib._Weakrefable): - app_metadata: Any - data: Any - - def __init__(self, *args, **kwargs) -> None: ... - def __iter__(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + def close(self) -> None: ... + def __enter__(self) -> Self: ... 
+ def __exit__(self, exc_type, exc_value, traceback) -> None: ... -class FlightStreamReader(MetadataRecordBatchReader): - def __init__(self, *args, **kwargs) -> None: ... - def cancel(self) -> Any: ... - def read_all(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class FlightDataStream(_Weakrefable): ... -class FlightStreamWriter(MetadataRecordBatchWriter): - def __init__(self, *args, **kwargs) -> None: ... - def done_writing(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FlightTimedOutError(FlightError, pyarrow.lib.ArrowException): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightUnauthenticatedError(FlightError, pyarrow.lib.ArrowException): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightUnauthorizedError(FlightError, pyarrow.lib.ArrowException): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightUnavailableError(FlightError, pyarrow.lib.ArrowException): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce_cython__(self) -> Any: ... - def __setstate_cython__(self, __pyx_state) -> Any: ... - -class FlightWriteSizeExceededError(pyarrow.lib.ArrowInvalid): - def __init__(self, message, limit, actual) -> None: ... +class RecordBatchStream(FlightDataStream): + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: ... class GeneratorStream(FlightDataStream): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class Location(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - uri: Any - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, Locationother) -> Any: ... - def for_grpc_tcp(self, host, port) -> Any: ... - def for_grpc_tls(self, host, port) -> Any: ... - def for_grpc_unix(self, path) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class MetadataRecordBatchReader(_MetadataRecordBatchReader): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class MetadataRecordBatchWriter(pyarrow.lib._CRecordBatchWriter): - def __init__(self, *args, **kwargs) -> None: ... - def begin(self, schema: Schema, options=...) -> Any: ... - def close(self) -> Any: ... - def write_metadata(self, buf) -> Any: ... - def write_with_metadata(self, RecordBatchbatch, buf) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: ... -class RecordBatchStream(FlightDataStream): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
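For reference, a typical synchronous round-trip through the client annotated above; the location and command are placeholders:

import pyarrow.flight as flight

with flight.connect("grpc://localhost:8815") as client:
    info = client.get_flight_info(flight.FlightDescriptor.for_command(b"example-query"))
    # Each FlightEndpoint carries a Ticket that do_get() exchanges for a stream.
    reader = client.do_get(info.endpoints[0].ticket)
    table = reader.read_all()  # FlightStreamReader.read_all() -> pyarrow.Table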
- -class Result(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - body: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class SchemaResult(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - schema: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerAuthHandler(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def authenticate(self, outgoing, incoming) -> Any: ... - def is_valid(self, token) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerAuthReader(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def read(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerAuthSender(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def write(self, message) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerCallContext(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def get_middleware(self, key) -> Any: ... - def is_cancelled(self) -> Any: ... - def peer(self) -> Any: ... - def peer_identity(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerMiddleware(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def call_completed(self, exception) -> Any: ... - def sending_headers(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ServerMiddlewareFactory(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def start_call(self, info, headers) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class SignalStopHandler: - stop_token: Any - def __init__(self, *args, **kwargs) -> None: ... - def _init_signals(self) -> Any: ... - def __enter__(self) -> Any: ... - def __exit__(self, exc_type, exc_value, exc_tb) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class Ticket(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - ticket: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def deserialize(cls, typecls, serialized) -> Any: ... - def serialize(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... 
- def __setstate__(self, state) -> Any: ... +class ServerCallContext(_Weakrefable): + def peer_identity(self) -> bytes: ... + def peer(self) -> str: ... + def is_cancelled(self) -> bool: ... + def add_header(self, key: str, value: str) -> None: ... + def add_trailer(self, key: str, value: str) -> None: ... + def get_middleware(self, key: str) -> ServerMiddleware | None: ... -class TracingServerMiddleware(ServerMiddleware): - __slots__: ClassVar[list] = ... - trace_context: Any - def __init__(self, trace_context) -> None: ... - -class TracingServerMiddlewareFactory(ServerMiddlewareFactory): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _ActionType(tuple): - _asdict: ClassVar[function] = ... - _field_defaults: ClassVar[dict] = ... - _fields: ClassVar[tuple] = ... - _replace: ClassVar[function] = ... - __getnewargs__: ClassVar[function] = ... - __match_args__: ClassVar[tuple] = ... - __slots__: ClassVar[tuple] = ... - description: Any - type: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _make(cls, *args, **kwargs) -> Any: ... - -class _CallInfo(tuple): - _asdict: ClassVar[function] = ... - _field_defaults: ClassVar[dict] = ... - _fields: ClassVar[tuple] = ... - _replace: ClassVar[function] = ... - __getnewargs__: ClassVar[function] = ... - __match_args__: ClassVar[tuple] = ... - __slots__: ClassVar[tuple] = ... - method: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _make(cls, *args, **kwargs) -> Any: ... - -class _CertKeyPair(tuple): - _asdict: ClassVar[function] = ... - _field_defaults: ClassVar[dict] = ... - _fields: ClassVar[tuple] = ... - _replace: ClassVar[function] = ... - __getnewargs__: ClassVar[function] = ... - __match_args__: ClassVar[tuple] = ... - __slots__: ClassVar[tuple] = ... - cert: Any - key: Any - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _make(cls, *args, **kwargs) -> Any: ... +class ServerAuthReader(_Weakrefable): + def read(self) -> str: ... -class _FlightServerFinalizer(pyarrow.lib._Weakrefable): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def finalize(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class ServerAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... -class _MetadataRecordBatchReader(pyarrow.lib._Weakrefable, pyarrow.lib._ReadPandasMixin): - schema: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def read_all(self) -> Any: ... - def read_chunk(self) -> Any: ... - def to_reader(self) -> Any: ... - def __iter__(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class ClientAuthReader(_Weakrefable): + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... + def is_valid(self, token: str) -> bool: ... + +class ClientAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... + def get_token(self) -> str: ... + +class CallInfo(NamedTuple): + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + def start_call(self, info: CallInfo) -> ClientMiddleware: ... + +class ClientMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... 
+ def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... + def call_completed(self, exception: ArrowException): ... + +class ServerMiddlewareFactory(_Weakrefable): + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware: ... -class _ReadPandasMixin: - def read_pandas(self, **options) -> Any: ... +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): ... + +class ServerMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... + def call_completed(self, exception: ArrowException): ... + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): - def __init__(self, *args, **kwargs) -> None: ... - def start_call(self, info, headers) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... class _ServerMiddlewareWrapper(ServerMiddleware): - def __init__(self, *args, **kwargs) -> None: ... - def call_completed(self, exception) -> Any: ... - def sending_headers(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def __pyx_unpickle_ClientAuthHandler(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_ClientMiddleware(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_ClientMiddlewareFactory(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightCancelledError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightDataStream(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightInternalError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightServerError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightTimedOutError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightUnauthenticatedError( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle_FlightUnauthorizedError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_FlightUnavailableError(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_ServerAuthHandler(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_ServerMiddleware(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_ServerMiddlewareFactory(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle_TracingServerMiddlewareFactory( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle__ServerMiddlewareFactoryWrapper( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def __pyx_unpickle__ServerMiddlewareWrapper( - __pyx_type, long__pyx_checksum, __pyx_state -) -> Any: ... -def _munge_grpc_python_error(message) -> Any: ... -def as_buffer(o) -> Any: ... -def connect(location, **kwargs) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... 
+ def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + +class _FlightServerFinalizer(_Weakrefable): + def finalize(self) -> None: ... + +class FlightServerBase(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + @property + def port(self) -> int: ... + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: ... + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: ... + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: ... + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: ... + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: ... + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: ... + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ... + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: ... + def serve(self) -> None: ... + def run(self) -> None: ... + def shutdown(self) -> None: ... + def wait(self) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, +) -> FlightClient: ... diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 9f165e4121c..3f87e3fe40a 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -1,226 +1,134 @@ -import _abc # type: ignore -import abc -import datetime +import datetime as dt import enum -import importlib._bootstrap # type: ignore -from typing import Any -from typing import Callable -from typing import ClassVar - -import pyarrow.lib - -Directory: importlib._bootstrap.FileType -File: importlib._bootstrap.FileType -NotFound: importlib._bootstrap.FileType -Unknown: importlib._bootstrap.FileType -_stringify_path: function -abstractmethod: function - -class ABC: - _abc_impl: ClassVar[_abc._abc_data] = ... - __abstractmethods__: ClassVar[frozenset] = ... - __slots__: ClassVar[tuple] = ... - -class FileInfo(pyarrow.lib._Weakrefable): - base_name: Any - extension: Any - is_file: Any - mtime: Any - mtime_ns: Any - path: Any - size: Any - type: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileSelector(pyarrow.lib._Weakrefable): - allow_not_found: Any - base_dir: Any - recursive: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... 
- def __setstate__(self, state) -> Any: ... - -class FileSystem(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - type_name: Any - def __init__(self, *args, **kwargs) -> None: ... - def _wrap_input_stream(self, stream, path, compression, buffer_size) -> Any: ... - def _wrap_output_stream(self, stream, path, compression, buffer_size) -> Any: ... - def copy_file(self, src, dest) -> Any: ... - def create_dir(self, *args, **kwargs) -> Any: ... - def delete_dir(self, path) -> Any: ... - def delete_dir_contents(self, *args, **kwargs) -> Any: ... - def delete_file(self, path) -> Any: ... - def equals(self, FileSystemother) -> Any: ... +from abc import ABC, abstractmethod +from typing import Self, overload + +from .lib import NativeFile, _Weakrefable + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + +class FileInfo(_Weakrefable): + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + @property + def type(self) -> FileType: ... + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: ... + @property + def base_name(self) -> str: ... + @property + def size(self) -> int: ... + @property + def extension(self) -> str: ... + @property + def mtime(self) -> dt.datetime | None: ... + @property + def mtime_ns(self) -> int | None: ... + +class FileSelector(_Weakrefable): + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + +class FileSystem(_Weakrefable): @classmethod - def from_uri(cls, uri) -> FileSystem: ... - def get_file_info(self, paths_or_selector) -> Any: ... - def move(self, src, dest) -> Any: ... - def normalize_path(self, path) -> Any: ... - def open_append_stream(self, path, compression=..., buffer_size=..., metadata=...) -> Any: ... - def open_input_file(self, path) -> Any: ... - def open_input_stream(self, path, compression=..., buffer_size=...) -> Any: ... - def open_output_stream(self, path, compression=..., buffer_size=..., metadata=...) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileSystemHandler(abc.ABC): - def copy_file(self, src: str, dest: str) -> None: ... - def create_dir(self, path: str, recursive: bool) -> None: ... + def from_uri(cls, uri: str) -> tuple[Self, str]: ... + def equals(self, other: FileSystem) -> bool: ... + @property + def type_name(self) -> str: ... + @overload + def get_file_info(self, paths_or_selector: str) -> FileInfo: ... + @overload + def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... + def create_dir(self, path: str, *, recursive: bool = True) -> None: ... def delete_dir(self, path: str) -> None: ... - def delete_dir_contents(self, path: str, missing_dir_ok: bool = ...) -> None: ... + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: ... + def move(self, src: str, dest: str) -> None: ... + def copy_file(self, src: str, dest: str) -> None: ... def delete_file(self, path: str) -> None: ... 
- def delete_root_dir_contents(self) -> None: ... - def get_file_info(self, paths: list[str]) -> list[FileInfo]: ... - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... - def get_type_name(self) -> Any: ... - def move(self, src, dest) -> Any: ... - def normalize_path(self, path) -> Any: ... - def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ... - def open_input_file(self, path: str) -> Any: ... - def open_input_stream(self, path: str) -> Any: ... - def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ... - -class FileType(enum.IntEnum): - class _member_type_: - denominator: Any - imag: Any - numerator: Any - real: Any - def __init__(self, *args, **kwargs) -> None: ... - def as_integer_ratio(self) -> Any: ... - def bit_count(self) -> Any: ... - def bit_length(self) -> Any: ... - def conjugate(self, *args, **kwargs) -> Any: ... - @classmethod - def from_bytes(cls, *args, **kwargs) -> Any: ... - def to_bytes(self, *args, **kwargs) -> Any: ... - def __abs__(self) -> Any: ... - def __add__(self, other) -> Any: ... - def __and__(self, other) -> Any: ... - def __bool__(self) -> Any: ... - def __ceil__(self, *args, **kwargs) -> Any: ... - def __divmod__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __float__(self) -> Any: ... - def __floor__(self, *args, **kwargs) -> Any: ... - def __floordiv__(self, other) -> Any: ... - def __format__(self, *args, **kwargs) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getnewargs__(self, *args, **kwargs) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __index__(self) -> Any: ... - def __int__(self) -> Any: ... - def __invert__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lshift__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __mod__(self, other) -> Any: ... - def __mul__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __neg__(self) -> Any: ... - def __or__(self, other) -> Any: ... - def __pos__(self) -> Any: ... - def __pow__(self, other) -> Any: ... - def __radd__(self, other) -> Any: ... - def __rand__(self, other) -> Any: ... - def __rdivmod__(self, other) -> Any: ... - def __rfloordiv__(self, other) -> Any: ... - def __rlshift__(self, other) -> Any: ... - def __rmod__(self, other) -> Any: ... - def __rmul__(self, other) -> Any: ... - def __ror__(self, other) -> Any: ... - def __round__(self) -> Any: ... - def __rpow__(self, other) -> Any: ... - def __rrshift__(self, other) -> Any: ... - def __rshift__(self, other) -> Any: ... - def __rsub__(self, other) -> Any: ... - def __rtruediv__(self, other) -> Any: ... - def __rxor__(self, other) -> Any: ... - def __sizeof__(self) -> Any: ... - def __sub__(self, other) -> Any: ... - def __truediv__(self, other) -> Any: ... - def __trunc__(self) -> Any: ... - def __xor__(self, other) -> Any: ... - - __new__: ClassVar[Callable] = ... - Directory: ClassVar[importlib._bootstrap.FileType] = ... - File: ClassVar[importlib._bootstrap.FileType] = ... - NotFound: ClassVar[importlib._bootstrap.FileType] = ... - Unknown: ClassVar[importlib._bootstrap.FileType] = ... - _generate_next_value_: ClassVar[Callable] = ... - _member_map_: ClassVar[dict] = ... - _member_names_: ClassVar[list] = ... - _value2member_map_: ClassVar[dict] = ... + def open_input_file(self, path: str) -> NativeFile: ... + def open_input_stream( + self, path: str, compression: str | None = "detect", buffer_size: int | None = None + ) -> NativeFile: ... 
+ def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, list[str]] | None = None, + ) -> NativeFile: ... + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, list[str]] | None = None, + ): ... + def normalize_path(self, path: str) -> str: ... class LocalFileSystem(FileSystem): - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _reconstruct(cls, typecls, kwargs) -> Any: ... - def __reduce__(self) -> Any: ... - -class PyFileSystem(FileSystem): - handler: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... + def __init__(self, *, use_mmap: bool = False) -> None: ... class SubTreeFileSystem(FileSystem): - base_fs: Any - base_path: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... class _MockFileSystem(FileSystem): - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class timezone(datetime.tzinfo): - max: ClassVar[datetime.timezone] = ... - min: ClassVar[datetime.timezone] = ... - utc: ClassVar[datetime.timezone] = ... - def __init__(self, *args, **kwargs) -> None: ... - def dst(self, *args, **kwargs) -> Any: ... - def fromutc(self, *args, **kwargs) -> Any: ... - def tzname(self, *args, **kwargs) -> Any: ... - def utcoffset(self, *args, **kwargs) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getinitargs__(self) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... + def __init__(self, current_time: dt.datetime | None = None) -> None: ... -def __pyx_unpickle___Pyx_EnumMeta(*args, **kwargs) -> Any: ... -def _copy_files( - FileSystemsource_fs, - unicodesource_path, - FileSystemdestination_fs, - unicodedestination_path, - int64_tchunk_size, - booluse_threads, -) -> Any: ... -def _copy_files_selector( - FileSystemsource_fs, - FileSelectorsource_sel, - FileSystemdestination_fs, - unicodedestination_base_dir, - int64_tchunk_size, - booluse_threads, -) -> Any: ... -def _detect_compression(path) -> Any: ... -def _file_type_to_string(ty) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class PyFileSystem(FileSystem): + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: ... + +class FileSystemHandler(ABC): + @abstractmethod + def get_type_name(self) -> str: ... + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: ... + @abstractmethod + def delete_dir(self, path: str) -> None: ... + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + @abstractmethod + def delete_root_dir_contents(self) -> None: ... + @abstractmethod + def delete_file(self, path: str) -> None: ... + @abstractmethod + def move(self, src: str, dest: str) -> None: ... 
+ @abstractmethod + def copy_file(self, src: str, dest: str) -> None: ... + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: ... + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: ... + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, list[str]]) -> NativeFile: ... + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, list[str]]) -> NativeFile: ... + @abstractmethod + def normalize_path(self, path: str) -> str: ... diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi index 682f7e59c9b..f94370c51c1 100644 --- a/pyarrow-stubs/_gcsfs.pyi +++ b/pyarrow-stubs/_gcsfs.pyi @@ -1,97 +1,24 @@ -import collections.abc -import datetime +import datetime as dt -from typing import Any -from typing import ClassVar +from ._fs import FileSystem +from .lib import KeyValueMetadata -import pyarrow._fs -import pyarrow.lib - -class GcsFileSystem(pyarrow._fs.FileSystem): - default_bucket_location: Any - def __init__(self, *args, **kwargs) -> None: ... - def _expiration_datetime_from_options(self) -> Any: ... - @classmethod - def _reconstruct(cls, typecls, kwargs) -> Any: ... - def __reduce__(self) -> Any: ... - -class KeyValueMetadata(pyarrow.lib._Metadata, collections.abc.Mapping): - __hash__: ClassVar[None] = ... # type: ignore - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, KeyValueMetadataother) -> Any: ... - def get_all(self, key) -> Any: ... - def items(self) -> Any: ... - def key(self, i) -> Any: ... - def keys(self) -> Any: ... - def to_dict(self) -> Any: ... - def value(self, i) -> Any: ... - def values(self) -> Any: ... - def __contains__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getitem__(self, index) -> Any: ... - def __gt__(self, other) -> Any: ... - def __iter__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __len__(self) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - -class timedelta: - max: ClassVar[datetime.timedelta] = ... - min: ClassVar[datetime.timedelta] = ... - resolution: ClassVar[datetime.timedelta] = ... - days: Any - microseconds: Any - seconds: Any - def __init__(self, *args, **kwargs) -> None: ... - def total_seconds(self, *args, **kwargs) -> Any: ... - def __abs__(self) -> Any: ... - def __add__(self, other) -> Any: ... - def __bool__(self) -> Any: ... - def __divmod__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __floordiv__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __mod__(self, other) -> Any: ... - def __mul__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __neg__(self) -> Any: ... - def __pos__(self) -> Any: ... - def __radd__(self, other) -> Any: ... - def __rdivmod__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __rfloordiv__(self, other) -> Any: ... - def __rmod__(self, other) -> Any: ... - def __rmul__(self, other) -> Any: ... - def __rsub__(self, other) -> Any: ... - def __rtruediv__(self, other) -> Any: ... - def __sub__(self, other) -> Any: ... - def __truediv__(self, other) -> Any: ... - -class timezone(datetime.tzinfo): - max: ClassVar[datetime.timezone] = ... - min: ClassVar[datetime.timezone] = ... - utc: ClassVar[datetime.timezone] = ... 
- def __init__(self, *args, **kwargs) -> None: ... - def dst(self, *args, **kwargs) -> Any: ... - def fromutc(self, *args, **kwargs) -> Any: ... - def tzname(self, *args, **kwargs) -> Any: ... - def utcoffset(self, *args, **kwargs) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getinitargs__(self) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - -def ensure_metadata(meta, boolallow_none=...) -> KeyValueMetadata: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class GcsFileSystem(FileSystem): + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: ... + @property + def project_id(self) -> str: ... diff --git a/pyarrow-stubs/_generated_version.pyi b/pyarrow-stubs/_generated_version.pyi deleted file mode 100644 index 2e057ab7ef4..00000000000 --- a/pyarrow-stubs/_generated_version.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from _typeshed import Incomplete - -version: str -__version_tuple__: Incomplete -version_tuple: Incomplete diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index 584285128eb..97af569908a 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -1,17 +1,19 @@ -from typing import Any -from typing import Callable +from pathlib import Path -import pyarrow._fs +from ._fs import FileSystem -_stringify_path: Callable - -class HadoopFileSystem(pyarrow._fs.FileSystem): - def __init__(self, *args, **kwargs) -> None: ... - @classmethod - def _reconstruct(cls, typecls, kwargs) -> Any: ... - @classmethod - def from_uri(cls, uri) -> HadoopFileSystem: ... - def __reduce__(self) -> Any: ... - -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class HadoopFileSystem(FileSystem): + def __init__( + self, + host: str, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: str | Path | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str) -> HadoopFileSystem: ... # type: ignore[override] diff --git a/pyarrow-stubs/_hdfsio.pyi b/pyarrow-stubs/_hdfsio.pyi deleted file mode 100644 index 367c727bf34..00000000000 --- a/pyarrow-stubs/_hdfsio.pyi +++ /dev/null @@ -1,69 +0,0 @@ -import re - -from typing import Any -from typing import overload - -import pyarrow.lib - -from typing_extensions import Literal - -_HDFS_PATH_RE: re.Pattern - -class HadoopFileSystem(pyarrow.lib._Weakrefable): - extra_conf: dict - host: Any - is_open: bool - kerb_ticket: Any - port: int - user: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def chmod(self, path: str, mode: int) -> Any: ... - def chown(self, path: str, owner: str = ..., group: str = ...) -> Any: ... - def close(self) -> Any: ... - @classmethod - def connect(cls, *args, **kwargs) -> Any: ... - def delete(self, path: str, recursive: bool = ...) -> Any: ... - def df(self) -> int: ... 
- def download(self, path: str, stream, buffer_size: int | None = ...) -> Any: ... - def exists(self, path: str) -> bool: ... - def get_capacity(self) -> int: ... - def get_space_used(self) -> int: ... - def info(self, path: str) -> dict: ... - def isdir(self, path: str) -> bool: ... - def isfile(self, path: str) -> bool: ... - @overload - def ls(self, path: str, full_info: Literal[True]) -> list[dict]: ... - @overload - def ls(self, path: str, full_info: Literal[False]) -> list[str]: ... - def mkdir(self, path: str) -> None: ... - def open( - self, - path: str, - mode: Literal["rb", "wb", "ab"] = ..., - buffer_size: int | None = ..., - replication: int | None = ..., - default_block_size: int | None = ..., - ) -> HdfsFile: ... - def rename(self, path: str, new_path: str) -> None: ... - def stat(self, path: str) -> dict[str, Any]: ... - def upload(self, path: str, stream, buffer_size: int | None = ...) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class HdfsFile(pyarrow.lib.NativeFile): - buffer_size: int - parent: _HdfsFileNanny | None - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _HdfsFileNanny(pyarrow.lib._Weakrefable): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def have_libhdfs() -> bool: ... -def strip_hdfs_abspath(path: str) -> str: ... diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index 9a11b8f3f15..418f0ed8595 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -1,23 +1,29 @@ -from typing import Any -from typing import ClassVar +from pathlib import Path +from typing import IO, Literal -import pyarrow.lib +from .lib import MemoryPool, Schema, Table, _Weakrefable -class ParseOptions(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - explicit_schema: Any - newlines_in_values: Any - unexpected_field_behavior: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... +class ReadOptions(_Weakrefable): + use_threads: bool + block_size: int + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: ... -class ReadOptions(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - block_size: Any - use_threads: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... +class ParseOptions(_Weakrefable): + explicit_schema: Schema + newlines_in_values: bool + unexpected_field_behavior: Literal["ignore", "error", "infer"] + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: ... def read_json( - input_file, read_options=..., parse_options=..., MemoryPoolmemory_pool=... -) -> Any: ... + input_file: str | Path | IO, + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: ... 
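Example use of the JSON reader annotated above; "events.jsonl" is a placeholder path:

from pyarrow import json

# unexpected_field_behavior="infer" is the default; spelled out here for clarity.
opts = json.ParseOptions(unexpected_field_behavior="infer")
table = json.read_json("events.jsonl", parse_options=opts)
print(table.schema)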
diff --git a/pyarrow-stubs/_orc.pyi b/pyarrow-stubs/_orc.pyi index 13c41bdb1bd..71bf0dde9ba 100644 --- a/pyarrow-stubs/_orc.pyi +++ b/pyarrow-stubs/_orc.pyi @@ -1,44 +1,56 @@ -from typing import Any +from typing import IO, Literal -import pyarrow.lib +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) -_stringify_path: function +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... -class ORCReader(pyarrow.lib._Weakrefable): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def compression(self) -> Any: ... - def compression_size(self) -> Any: ... - def content_length(self) -> Any: ... - def file_footer_length(self) -> Any: ... - def file_length(self) -> Any: ... - def file_postscript_length(self) -> Any: ... - def file_version(self) -> Any: ... - def metadata(self) -> Any: ... - def nrows(self) -> Any: ... - def nstripe_statistics(self) -> Any: ... - def nstripes(self) -> Any: ... - def open(self, source, booluse_memory_map=...) -> Any: ... - def read(self, columns=...) -> Any: ... - def read_stripe(self, n, columns=...) -> Any: ... - def row_index_stride(self) -> Any: ... - def schema(self) -> Any: ... - def serialized_file_tail(self) -> Any: ... - def software_version(self) -> Any: ... - def stripe_statistics_length(self) -> Any: ... - def writer(self) -> Any: ... - def writer_version(self) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class ORCWriter(pyarrow.lib._Weakrefable): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def close(self) -> Any: ... - def open(self, *args, **kwargs) -> Any: ... - def write(self, Tabletable) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... 
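A low-level sketch of the reader annotated above; "data.orc" is a placeholder path, and application code normally goes through the pyarrow.orc.ORCFile wrapper built on this class:

from pyarrow._orc import ORCReader

reader = ORCReader()
reader.open("data.orc")
print(reader.schema(), reader.nrows(), reader.compression())
table = reader.read()  # pass columns=[...] to project a subset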
+class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, + compression_block_size: int | None = None, + compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index d1568982746..7b7796dca12 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -1,27 +1,166 @@ -from typing import Any -from typing import ClassVar -from typing import Generator +from pathlib import Path +from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict -import pyarrow.lib +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) -from typing_extensions import Literal +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] -_stringify_path: function -indent: function +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType -class ArrowException(Exception): ... +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def hash_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... + @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... 
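The statistics annotated above surface through the public pyarrow.parquet metadata objects; a small sketch with "data.parquet" as a placeholder path:

import pyarrow.parquet as pq

meta = pq.ParquetFile("data.parquet").metadata
col = meta.row_group(0).column(0)  # ColumnChunkMetaData, annotated below
if col.is_stats_set and col.statistics.has_min_max:
    print(col.statistics.min, col.statistics.max, col.statistics.null_count)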
-class ColumnChunkMetaData(pyarrow.lib._Weakrefable): - def __init__(self) -> None: ... +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... def equals(self, other: ColumnChunkMetaData) -> bool: ... - def to_dict(self) -> Any: ... - def __eq__(self, other) -> bool: ... @property def file_offset(self) -> int: ... @property def file_path(self) -> str | None: ... @property - def physical_type(self) -> str: ... + def physical_type(self) -> _PhysicalType: ... @property def num_values(self) -> int: ... @property @@ -29,25 +168,11 @@ class ColumnChunkMetaData(pyarrow.lib._Weakrefable): @property def is_stats_set(self) -> bool: ... @property - def statistics(self) -> Statistics: ... + def statistics(self) -> Statistics | None: ... @property - def compression( - self, - ) -> Literal["UNCOMPRESSED", "SNAPPY", "GZIP", "LZO", "BROTLI", "LZ4", "ZSTD", "UNKNOWN"]: ... + def compression(self) -> _Compression: ... @property - def encodings( - self, - ) -> tuple[ - Literal[ - "PLAIN", - "BIT_PACKED", - "RLE", - "BYTE_STREAM_SPLIT", - "DELTA_BINARY_PACKED", - "DELTA_BYTE_ARRAY", - ], - ..., - ]: ... + def encodings(self) -> tuple[_Encoding, ...]: ... @property def has_dictionary_page(self) -> bool: ... @property @@ -62,11 +187,108 @@ class ColumnChunkMetaData(pyarrow.lib._Weakrefable): def total_compressed_size(self) -> int: ... @property def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] -class ColumnSchema(pyarrow.lib._Weakrefable): +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... 
+ @property + def sorting_columns(self) -> list[SortingColumn]: ... + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: str | Path | Buffer | NativeFile | IO) -> None: ... + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnChunkMetaData: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + +class ColumnSchema(_Weakrefable): def __init__(self, schema: ParquetSchema, index: int) -> None: ... - def equals(self, other: ColumnSchema) -> Any: ... - def __eq__(self, other) -> Any: ... + def equals(self, other: ColumnSchema) -> bool: ... @property def name(self) -> str: ... @property @@ -76,11 +298,11 @@ class ColumnSchema(pyarrow.lib._Weakrefable): @property def max_repetition_level(self) -> int: ... @property - def physical_type(self) -> str: ... + def physical_type(self) -> _PhysicalType: ... @property def logical_type(self) -> ParquetLogicalType: ... @property - def converted_type(self) -> str | None: ... + def converted_type(self) -> _ConvertedType | None: ... @property def length(self) -> int | None: ... @property @@ -88,195 +310,133 @@ class ColumnSchema(pyarrow.lib._Weakrefable): @property def scale(self) -> int | None: ... -class ParquetLogicalType(pyarrow.lib._Weakrefable): - type: Any - def to_json(self) -> str: ... - -class ParquetReader(pyarrow.lib._Weakrefable): - _column_idx_map: dict[bytes, int] | None - closed: bool - column_paths: Any - metadata: FileMetaData | None - num_row_groups: int - schema_arrow: pyarrow.lib.Schema - @classmethod - def __init__(self, memory_pool: pyarrow.lib.MemoryPool) -> None: ... - def close(self) -> None: ... - def column_name_idx(self, column_name: str) -> int: ... +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open( + self, + source: str | Path | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... 
+ @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... def iter_batches( self, batch_size: int, row_groups: list[int], - column_indices: list[int] | None = ..., - use_threads: bool = ..., - ) -> Generator[pyarrow.lib.RecordBatch, None, None]: ... - def open( - self, - source, - *, - use_memory_map: bool = ..., - read_dictionary: list[str | int] | None = ..., - metadata: FileMetaData = ..., - buffer_size: int = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: str | None = ..., - decryption_properties: FileDecryptionProperties = ..., - thrift_string_size_limit: int = ..., - thrift_container_size_limit: int = ..., - ) -> pyarrow.lib.Table: ... - def read_all( - self, column_indices: list[int] | None = ..., use_threads: bool = ... - ) -> pyarrow.lib.Table: ... - def read_column(self, column_index: int) -> pyarrow.lib.Array: ... + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Iterator[RecordBatch]: ... def read_row_group( - self, i: int, column_indices: list[int] | None = ..., use_threads: bool = ... - ) -> pyarrow.lib.Table: ... + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... def read_row_groups( self, row_groups: list[int], - column_indices: list[int] | None = ..., - use_threads: bool = ..., - ) -> pyarrow.lib.Table: ... - def scan_contents( - self, column_indices: list[int] | None = ..., batch_size: int = ... - ) -> int: ... - def set_batch_size(self, batch_size: int) -> None: ... - def set_use_threads(self, use_threads: bool) -> None: ... - -class ParquetSchema(pyarrow.lib._Weakrefable): - names: list[str] - def __init__(self, container: FileMetaData) -> None: ... - def column(self, i: int) -> ColumnSchema: ... - def equals(self, other: ParquetSchema) -> bool: ... - def to_arrow_schema(self) -> pyarrow.lib.Schema: ... - def __eq__(self, other) -> bool: ... - def __getitem__(self, i: int) -> ColumnSchema: ... - def __len__(self) -> int: ... + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Table: ... + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... 
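The ParquetReader surface typed above is usually driven through the public pyarrow.parquet.ParquetFile wrapper; a minimal, hedged sketch of batch-wise reading (the file path is a placeholder):

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")   # placeholder path
print(pf.schema_arrow)                   # Arrow Schema, cf. schema_arrow above
print(pf.metadata.num_row_groups)

# Stream RecordBatches instead of materializing the whole table at once.
for batch in pf.iter_batches(batch_size=1024):
    ...  # each item is a pyarrow.RecordBatch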
-class ParquetWriter(pyarrow.lib._Weakrefable): - allow_truncated_timestamps: Any - coerce_timestamps: Any - column_encoding: Any - compression: str | dict[str, str] - compression_level: Any - data_page_size: Any - data_page_version: Any - dictionary_pagesize_limit: Any - encryption_properties: Any - metadata: FileMetaData - row_group_size: Any - use_byte_stream_split: Any - use_compliant_nested_type: Any - use_deprecated_int96_timestamps: Any - use_dictionary: bool | list[str] - version: Any - write_batch_size: Any - write_statistics: Any - writer_engine_version: Any +class ParquetWriter(_Weakrefable): def __init__( - cls, - where, - schema: pyarrow.lib.Schema, - use_dictionary: bool | list[str] | None = ..., - compression: str | dict[str, str] = ..., - version: str | None = ..., - write_statistics: bool | list[str] | None = ..., - memory_pool: pyarrow.lib.MemoryPool = ..., - use_deprecated_int96_timestamps: bool = ..., - coerce_timestamps: Literal["ms", "us"] | None = ..., - data_page_size: int | None = ..., - allow_truncated_timestamps: bool = ..., - compression_level: int | dict[str, int] | None = ..., - use_byte_stream_split: bool | list[str] = ..., - column_encoding: str | dict[str, str] | None = ..., - writer_engine_version: Literal["V1", "V2"] | None = ..., - data_page_version: Literal["1.0", "2.0"] | None = ..., - use_compliant_nested_type: bool = ..., - encryption_properties: FileDecryptionProperties | None = ..., - write_batch_size: int | None = ..., - dictionary_pagesize_limit: int | None = ..., - ) -> None: ... + self, + where: str | NativeFile | IO, + schema: Schema, + use_dictionary: bool | list[str] | None = None, + compression: _Compression | dict[str, _Compression] | None = None, + version: str | None = None, + write_statistics: bool | list[str] | None = None, + memory_pool: MemoryPool | None = None, + use_deprecated_int96_timestamps: bool = False, + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = False, + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = False, + column_encoding: _Encoding | dict[str, _Encoding] | None = None, + writer_engine_version: str | None = None, + data_page_version: str | None = None, + use_compliant_nested_type: bool = True, + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: tuple[SortingColumn, ...] | None = None, + store_decimal_as_integer: bool = False, + ): ... def close(self) -> None: ... - def write_table(self, table: pyarrow.lib.Table, row_group_size: int | None = ...) -> None: ... - -class RowGroupMetaData(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - num_columns: Any - num_rows: Any - total_byte_size: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def column(self, inti) -> Any: ... - def equals(self, RowGroupMetaDataother) -> Any: ... - def to_dict(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - -class Statistics(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... 
# type: ignore - converted_type: Any - distinct_count: Any - has_distinct_count: Any - has_min_max: Any - has_null_count: Any - logical_type: Any - max: Any - max_raw: Any - min: Any - min_raw: Any - null_count: Any - num_values: Any - physical_type: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def equals(self, Statisticsother) -> Any: ... - def to_dict(self) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class FileDecryptionProperties: - def __init__(self, *args, **kwargs) -> None: ... - -class FileEncryptionProperties: - def __init__(self, *args, **kwargs) -> None: ... - -class FileMetaData(pyarrow.lib._Weakrefable): - __hash__: ClassVar[None] = ... # type: ignore - created_by: Any - format_version: Any - metadata: Any - num_columns: Any - num_row_groups: Any - num_rows: Any - schema: Any - serialized_size: Any - def __init__(self, *args, **kwargs) -> None: ... - def append_row_groups(self, FileMetaDataother) -> Any: ... - def equals(self, FileMetaDataother) -> Any: ... - def row_group(self, inti) -> Any: ... - def set_file_path(self, path) -> Any: ... - def to_dict(self) -> Any: ... - def write_metadata_file(self, where) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def use_dictionary(self) -> bool | list[str] | None: ... + @property + def use_deprecated_int96_timestamps(self) -> bool: ... + @property + def use_byte_stream_split(self) -> bool | list[str]: ... + @property + def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... + @property + def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... + @property + def allow_truncated_timestamps(self) -> bool: ... + @property + def compression(self) -> _Compression | dict[str, _Compression] | None: ... + @property + def compression_level(self) -> int | dict[str, int] | None: ... + @property + def data_page_version(self) -> str | None: ... + @property + def use_compliant_nested_type(self) -> bool: ... + @property + def version(self) -> str | None: ... + @property + def write_statistics(self) -> bool | list[str] | None: ... + @property + def writer_engine_version(self) -> str: ... + @property + def row_group_size(self) -> int: ... + @property + def data_page_size(self) -> int: ... + @property + def encryption_properties(self) -> FileDecryptionProperties: ... + @property + def write_batch_size(self) -> int: ... + @property + def dictionary_pagesize_limit(self) -> int: ... + @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... -def _datetime_from_int(int64_tvalue, TimeUnitunit, tzinfo=...) -> Any: ... -def _reconstruct_filemetadata(Bufferserialized) -> Any: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class FileEncryptionProperties: ... +class FileDecryptionProperties: ... 
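A hedged sketch of how the metadata classes typed in this stub (FileMetaData, RowGroupMetaData, ColumnChunkMetaData, Statistics) are typically inspected; the path is a placeholder:

import pyarrow.parquet as pq

md = pq.ParquetFile("example.parquet").metadata   # FileMetaData
rg = md.row_group(0)                              # RowGroupMetaData
col = rg.column(0)                                # ColumnChunkMetaData

print(col.physical_type, col.compression, col.encodings)
stats = col.statistics                            # Statistics | None
if stats is not None and stats.has_min_max:
    print(stats.min, stats.max, stats.null_count)

# to_dict() returns the TypedDict mirror of the properties above.
print(col.to_dict()["total_compressed_size"])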
diff --git a/pyarrow-stubs/_parquet_encryption.pyi b/pyarrow-stubs/_parquet_encryption.pyi index df0688c792d..c707edb844a 100644 --- a/pyarrow-stubs/_parquet_encryption.pyi +++ b/pyarrow-stubs/_parquet_encryption.pyi @@ -1,105 +1,67 @@ -import datetime +import datetime as dt -from typing import Any -from typing import ClassVar +from typing import Callable -import pyarrow.lib +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable -class ArrowException(Exception): ... +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int -class CryptoFactory(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - def __init__(self, *args, **kwargs) -> None: ... - def file_decryption_properties( - self, - KmsConnectionConfigkms_connection_config, - DecryptionConfigurationdecryption_config=..., - ) -> Any: ... - def file_encryption_properties( + def __init__( self, - KmsConnectionConfigkms_connection_config, - EncryptionConfigurationencryption_config, - ) -> Any: ... - def remove_cache_entries_for_all_tokens(self) -> Any: ... - def remove_cache_entries_for_token(self, access_token) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class DecryptionConfiguration(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - cache_lifetime: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class EncryptionConfiguration(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - cache_lifetime: Any - column_keys: Any - data_key_length_bits: Any - double_wrapping: Any - encryption_algorithm: Any - footer_key: Any - internal_key_material: Any - plaintext_footer: Any - def __init__(self, *args, **kwargs) -> None: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... -class KmsClient(pyarrow.lib._Weakrefable): - def __init__(self, *args, **kwargs) -> None: ... - def unwrap_key(self, wrapped_key, master_key_identifier) -> Any: ... - def wrap_key(self, key_bytes, master_key_identifier) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... -class KmsConnectionConfig(pyarrow.lib._Weakrefable): - __slots__: ClassVar[tuple] = ... - custom_kms_conf: Any - key_access_token: Any - kms_instance_id: Any - kms_instance_url: Any - def __init__(self, *args, **kwargs) -> None: ... - def refresh_key_access_token(self, value) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... 
+class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... -class timedelta: - max: ClassVar[datetime.timedelta] = ... - min: ClassVar[datetime.timedelta] = ... - resolution: ClassVar[datetime.timedelta] = ... - days: Any - microseconds: Any - seconds: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def total_seconds(self, *args, **kwargs) -> Any: ... - def __abs__(self) -> Any: ... - def __add__(self, other) -> Any: ... - def __bool__(self) -> Any: ... - def __divmod__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __floordiv__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __gt__(self, other) -> Any: ... - def __hash__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __lt__(self, other) -> Any: ... - def __mod__(self, other) -> Any: ... - def __mul__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __neg__(self) -> Any: ... - def __pos__(self) -> Any: ... - def __radd__(self, other) -> Any: ... - def __rdivmod__(self, other) -> Any: ... - def __reduce__(self) -> Any: ... - def __rfloordiv__(self, other) -> Any: ... - def __rmod__(self, other) -> Any: ... - def __rmul__(self, other) -> Any: ... - def __rsub__(self, other) -> Any: ... - def __rtruediv__(self, other) -> Any: ... - def __sub__(self, other) -> Any: ... - def __truediv__(self, other) -> Any: ... +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... -def frombytes(*args, **kwargs) -> Any: ... -def tobytes(o) -> Any: ... +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/pyarrow-stubs/_plasma.pyi b/pyarrow-stubs/_plasma.pyi deleted file mode 100644 index f85704e3b4a..00000000000 --- a/pyarrow-stubs/_plasma.pyi +++ /dev/null @@ -1,105 +0,0 @@ -import socket - -from typing import Any -from typing import overload - -import pyarrow.lib - -from typing_extensions import Literal -from typing_extensions import TypedDict - -PLASMA_WAIT_TIMEOUT: int - -class _ListResult(TypedDict): - data_size: int - metadata_size: int - ref_count: int - create_time: float - construct_duration: int - state: Literal["created", "sealed"] - -class ArrowException(Exception): ... - -class ObjectID(pyarrow.lib._Weakrefable): - def __init__(self, object_id: bytes) -> None: ... - def binary(self) -> bytes: ... - @staticmethod - def from_random() -> ObjectID: ... - def __eq__(self, other) -> bool: ... - def __hash__(self) -> int: ... - -class ObjectNotAvailable(pyarrow.lib._Weakrefable): ... 
-class PlasmaBuffer(pyarrow.lib.Buffer): ... - -class PlasmaClient(pyarrow.lib._Weakrefable): - store_socket_name: str - def __init__(self) -> None: ... - def _release(self, object_id: ObjectID) -> None: ... - def contains(self, object_id: ObjectID) -> bool: ... - def create( - self, object_id: ObjectID, data_size: int, metadata: bytes = ... - ) -> pyarrow.lib.Buffer: ... - def create_and_seal(self, object_id: ObjectID, data: bytes, metadata: bytes = ...) -> None: ... - def debug_string(self) -> str: ... - def decode_notifications(self, buf: pyarrow.lib.Buffer) -> tuple[list[ObjectID], int, int]: ... - def delete(self, object_ids: list[ObjectID]) -> None: ... - def disconnect(self) -> None: ... - def evict(self, num_bytes: int) -> None: ... - @overload - def get( - self, - object_ids: ObjectID, - timeout_ms: int = ..., - serialization_context: pyarrow.lib.SerializationContext = ..., - ) -> Any: ... - @overload - def get( - self, - object_ids: list[ObjectID], - timeout_ms: int = ..., - serialization_context: pyarrow.lib.SerializationContext = ..., - ) -> list[Any]: ... - def get_buffers( - self, - object_ids: list[ObjectID], - timeout_ms: int = ..., - with_meta: bool = ..., - ) -> list[PlasmaBuffer | None | tuple[PlasmaBuffer | None, bytes]]: ... - def get_metadata( - self, object_ids: list[ObjectID], timeout_ms: int = ... - ) -> list[PlasmaBuffer | None]: ... - def get_next_notification(self) -> list[tuple[ObjectID, int, int]]: ... - def get_notification_socket(self) -> socket.socket: ... - def hash(self, object_id: ObjectID) -> bytes: ... - def list(self) -> _ListResult: ... - def put( - self, - value: Any, - object_id: ObjectID | None = ..., - memcopy_threads: int = ..., - serialization_context: pyarrow.lib.SerializationContext = ..., - ) -> ObjectID: ... - def put_raw_buffer( - self, - value: memoryview, - object_id: ObjectID | None = ..., - metadata: bytes = ..., - memcopy_threads: int = ..., - ) -> ObjectID: ... - def seal(self, object_id: ObjectID) -> None: ... - def set_client_options(self, client_name: str, limit_output_memory: int) -> None: ... - def store_capacity(self) -> int: ... - def subscribe(self) -> None: ... - def to_capsule(self) -> Any: ... - -class PlasmaObjectExists(pyarrow.lib.ArrowException): ... -class PlasmaObjectNotFound(pyarrow.lib.ArrowException): ... -class PlasmaStoreFull(pyarrow.lib.ArrowException): ... - -def connect(store_socket_name: str, num_retries: int = ...) -> PlasmaClient: ... -def get_socket_from_fd( - fileno: int | None, - family: socket.AddressFamily | int, - type: socket.SocketKind | int, -) -> socket.socket: ... -def make_object_id(object_id: bytes) -> ObjectID: ... diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi index 66daeb5b4f9..fc13c498bd9 100644 --- a/pyarrow-stubs/_s3fs.pyi +++ b/pyarrow-stubs/_s3fs.pyi @@ -1,63 +1,74 @@ import enum -import importlib._bootstrap # type: ignore -from typing import Any -from typing import ClassVar +from typing import Literal, NotRequired, Required, TypedDict -import pyarrow._fs -import pyarrow.lib +from ._fs import FileSystem +from .lib import KeyValueMetadata -Debug: importlib._bootstrap.S3LogLevel -Error: importlib._bootstrap.S3LogLevel -Fatal: importlib._bootstrap.S3LogLevel -Info: importlib._bootstrap.S3LogLevel -Off: importlib._bootstrap.S3LogLevel -Trace: importlib._bootstrap.S3LogLevel -Warn: importlib._bootstrap.S3LogLevel - -class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... -class AwsStandardS3RetryStrategy(S3RetryStrategy): ... 
- -class S3FileSystem(pyarrow._fs.FileSystem): - region: str - def __init__( - self, - *, - access_key: str | None = ..., - secret_key: str | None = ..., - session_token: str | None = ..., - anonymous: bool = ..., - role_arn: str | None = ..., - session_name: str | None = ..., - external_id: str | None = ..., - load_frequency: int = ..., - region: str = ..., - request_timeout: float | None = ..., - connect_timeout: float | None = ..., - scheme: str = ..., - endpoint_override: str | None = ..., - background_writes: bool = ..., - default_metadata: dict | pyarrow.lib.KeyValueMetadata = ..., - proxy_options: dict | str | None = ..., - allow_bucket_creation: bool = ..., - allow_bucket_deletion: bool = ..., - retry_strategy: S3RetryStrategy = ..., - ) -> None: ... - @classmethod - def _reconstruct(cls, kwargs: Any) -> S3FileSystem: ... +class _ProxyOptions(TypedDict): + schema: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] class S3LogLevel(enum.IntEnum): - Debug: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Error: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Fatal: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Info: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Off: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Trace: ClassVar[importlib._bootstrap.S3LogLevel] = ... - Warn: ClassVar[importlib._bootstrap.S3LogLevel] = ... + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() -class S3RetryStrategy: - def __init__(self, max_attempts: int = ...) -> None: ... +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... def finalize_s3() -> None: ... -def initialize_s3(log_level: S3LogLevel = ...) -> Any: ... +def ensure_s3_finalized() -> None: ... def resolve_s3_region(bucket: str) -> str: ... + +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + check_directory_existence_before_creation: bool = False, + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... 
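A small sketch against the S3FileSystem constructor typed above; the region, bucket, and key are hypothetical, and anonymous access is assumed so no credentials are needed:

from pyarrow.fs import S3FileSystem

fs = S3FileSystem(region="us-east-2", anonymous=True)           # placeholder region
info = fs.get_file_info("some-bucket/path/to/data.parquet")     # placeholder object
print(info.type, info.size)
with fs.open_input_stream("some-bucket/path/to/data.parquet") as f:
    head = f.read(4)   # b"PAR1" for a Parquet file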
diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi new file mode 100644 index 00000000000..f2f28a77494 --- /dev/null +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -0,0 +1,68 @@ +from typing import Any, Collection, Literal, Protocol, TypeAlias + +import numpy as np + +from numpy.typing import NDArray + +from .__lib_pxi.array import BooleanArray, IntegerArray + +ArrayLike: TypeAlias = Any +Order: TypeAlias = Literal["ascending", "descending"] +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] +Compression: TypeAlias = Literal[ + "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" +] +NullEncoding: TypeAlias = Literal["mask", "encode"] +NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] +Mask: TypeAlias = list[bool | None] | NDArray[np.bool_] | BooleanArray +Indices: TypeAlias = list[int] | NDArray[np.integer] | IntegerArray + +class SupportEq(Protocol): + def __eq__(self, other) -> bool: ... + +class SupportLt(Protocol): + def __lt__(self, other) -> bool: ... + +class SupportGt(Protocol): + def __gt__(self, other) -> bool: ... + +class SupportLe(Protocol): + def __le__(self, other) -> bool: ... + +class SupportGe(Protocol): + def __ge__(self, other) -> bool: ... + +FilterTuple: TypeAlias = ( + tuple[str, Literal["=", "==", "!="], SupportEq] + | tuple[str, Literal["<"], SupportLt] + | tuple[str, Literal[">"], SupportGt] + | tuple[str, Literal["<="], SupportLe] + | tuple[str, Literal[">="], SupportGe] + | tuple[str, Literal["in", "not in"], Collection] +) + +class Buffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +SupportPyBuffer: TypeAlias = Any + +class SupportArrowStream(Protocol): + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + +class SupportArrowArray(Protocol): + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + +class SupportArrowDeviceArray(Protocol): + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + +class SupportArrowSchema(Protocol): + def __arrow_c_schema(self) -> Any: ... diff --git a/pyarrow-stubs/_substrait.pyi b/pyarrow-stubs/_substrait.pyi index 38bd13ad5e0..46de8d4110b 100644 --- a/pyarrow-stubs/_substrait.pyi +++ b/pyarrow-stubs/_substrait.pyi @@ -1,12 +1,28 @@ from typing import Callable -from typing import NamedTuple -from pyarrow.lib import Buffer -from pyarrow.lib import RecordBatchReader -from pyarrow.lib import Table +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable -def _parse_json_plan(plan: bytes) -> Buffer: ... -def get_supported_functions() -> list[str]: ... def run_query( - plan: Buffer | bytes, table_provider: Callable[[NamedTuple], Table] | None = ... + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, ) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... 
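The run_query stub above types the table provider as Callable[[list[str], Schema], Table]; a hedged sketch of that shape (the Substrait plan itself is elided, so the call is left commented out):

import pyarrow as pa
import pyarrow.substrait as substrait

catalog = {"orders": pa.table({"id": [1, 2], "amount": [10.0, 20.0]})}

def table_provider(names: list[str], schema: pa.Schema) -> pa.Table:
    # `names` is the (possibly qualified) table name referenced by the plan,
    # `schema` the schema the plan expects the returned table to match.
    return catalog[names[-1]]

# plan = ...  # bytes/Buffer produced by a Substrait producer (elided)
# reader = substrait.run_query(plan, table_provider=table_provider)
# table = reader.read_all()

print(len(substrait.get_supported_functions()))  # Substrait functions Acero supports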
diff --git a/pyarrow-stubs/acero.pyi b/pyarrow-stubs/acero.pyi new file mode 100644 index 00000000000..8b26f40b04f --- /dev/null +++ b/pyarrow-stubs/acero.pyi @@ -0,0 +1,75 @@ +from typing import Literal, Self, TypeAlias + +from . import lib +from .compute import Expression, FunctionOptions + +_StrOrExpr: TypeAlias = str | Expression + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: list[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + +class ExecNodeOptions(lib._Weakrefable): ... + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table) -> None: ... + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression) -> None: ... + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: list[tuple[list[str], str, FunctionOptions, str]], + keys: list[_StrOrExpr] | None = None, + ) -> None: ... + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... + +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... 
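A minimal pipeline built from the Declaration/ExecNodeOptions stubs above (table contents and column names are illustrative):

import pyarrow as pa
import pyarrow.acero as acero
import pyarrow.compute as pc

table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})

decl = acero.Declaration.from_sequence(
    [
        acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
        acero.Declaration("filter", acero.FilterNodeOptions(pc.field("a") > 1)),
        acero.Declaration("project", acero.ProjectNodeOptions([pc.field("b")], names=["b"])),
    ]
)
print(decl.to_table())  # rows where a > 1, projected to column "b"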
diff --git a/pyarrow-stubs/benchmark.pyi b/pyarrow-stubs/benchmark.pyi deleted file mode 100644 index 0d2a20d9ae7..00000000000 --- a/pyarrow-stubs/benchmark.pyi +++ /dev/null @@ -1 +0,0 @@ -from pyarrow.lib import benchmark_PandasObjectIsNull as benchmark_PandasObjectIsNull diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index ea778d56549..593d9f614cc 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1,14 +1,17 @@ -from typing import TypeVar +from typing import Literal, Sequence, TypeVar, overload -from numpy.typing import ArrayLike +# Option classes from pyarrow._compute import ArraySortOptions as ArraySortOptions from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions from pyarrow._compute import CastOptions as CastOptions from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions from pyarrow._compute import Expression as Expression from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions from pyarrow._compute import FilterOptions as FilterOptions @@ -20,27 +23,31 @@ from pyarrow._compute import HashAggregateKernel as HashAggregateKernel from pyarrow._compute import IndexOptions as IndexOptions from pyarrow._compute import JoinOptions as JoinOptions from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions from pyarrow._compute import MakeStructOptions as MakeStructOptions from pyarrow._compute import MapLookupOptions as MapLookupOptions from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions from pyarrow._compute import ModeOptions as ModeOptions from pyarrow._compute import NullOptions as NullOptions from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PairwiseOptions as PairwiseOptions from pyarrow._compute import PartitionNthOptions as PartitionNthOptions from pyarrow._compute import QuantileOptions as QuantileOptions from pyarrow._compute import RandomOptions as RandomOptions from pyarrow._compute import RankOptions as RankOptions from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions from pyarrow._compute import RoundOptions as RoundOptions from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions from pyarrow._compute import ScalarFunction as ScalarFunction from pyarrow._compute import ScalarKernel as ScalarKernel -from pyarrow._compute import ScalarUdfContext as ScalarUdfContext from pyarrow._compute import SelectKOptions as SelectKOptions from pyarrow._compute import SetLookupOptions 
as SetLookupOptions from pyarrow._compute import SliceOptions as SliceOptions @@ -53,74 +60,91 @@ from pyarrow._compute import StructFieldOptions as StructFieldOptions from pyarrow._compute import TakeOptions as TakeOptions from pyarrow._compute import TDigestOptions as TDigestOptions from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions from pyarrow._compute import VarianceOptions as VarianceOptions from pyarrow._compute import VectorFunction as VectorFunction from pyarrow._compute import VectorKernel as VectorKernel from pyarrow._compute import WeekOptions as WeekOptions + +# Functions from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function from pyarrow._compute import function_registry as function_registry from pyarrow._compute import get_function as get_function from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function from pyarrow._compute import register_scalar_function as register_scalar_function -from pyarrow.lib import Array -from pyarrow.lib import ChunkedArray -from pyarrow.lib import DataType -from pyarrow.lib import MemoryPool -from pyarrow.lib import RecordBatch -from pyarrow.lib import Scalar -from pyarrow.lib import Table -from pyarrow.vendored import docscrape as docscrape +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function +from pyarrow._stubs_typing import Indices + +from . import lib def cast( - arr: ArrayLike, - target_type: DataType | str | None = ..., - safe: bool | None = ..., - options: CastOptions | None = ..., -) -> Array: ... + arr: lib.Array, + target_type: str | lib.DataType, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... def index( - data: ArrayLike, - value: Scalar, - start: int | None = ..., - end: int | None = ..., + data: lib.Array, + value: lib.Scalar, + start: int | None = None, + end: int | None = None, *, - memory_pool: MemoryPool | None = ..., + memory_pool: lib.MemoryPool | None = None, ) -> int: ... -_TakeData = TypeVar("_TakeData", Array, ChunkedArray, RecordBatch, Table) +_DataT = TypeVar("_DataT", bound=lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table) def take( - data: _TakeData, - indices: Array | ChunkedArray, + data: _DataT, + indices: Indices, *, - boundscheck: bool = ..., - memory_pool: MemoryPool | None = ..., -) -> _TakeData: ... - -_FillValues = TypeVar("_FillValues", bound=Array | ChunkedArray | Scalar) -_FillValue = TypeVar("_FillValue", bound=Array | ChunkedArray | Scalar) - -def fill_null(values: _FillValues, fill_value: _FillValue) -> _FillValues: ... + boundscheck: bool = True, + memory_pool: lib.MemoryPool | None = None, +) -> _DataT: ... +def fill_null(values: _DataT, fill_value: lib.Array | lib.ChunkedArray | lib.Scalar) -> _DataT: ... +@overload def top_k_unstable( - values: Array | ChunkedArray | RecordBatch | Table, + values: lib.Array | lib.ChunkedArray | lib.RecordBatch, + k: int, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... +@overload +def top_k_unstable( + values: lib.Table, + k: int, + sort_keys: Sequence[str], + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... 
+@overload +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch, k: int, - sort_keys: list[str] | None = ..., *, - memory_pool: MemoryPool | None = ..., -) -> Array: ... + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... +@overload def bottom_k_unstable( - values: Array | ChunkedArray | RecordBatch | Table, + values: lib.Table, k: int, - sort_keys: list[str] | None = ..., + sort_keys: Sequence[str], *, - memory_pool: MemoryPool | None = ..., -) -> Array: ... + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... def random( n: int, *, - initializer: int | str = ..., - options: RandomOptions | None = ..., - memory_pool: MemoryPool | None = ..., -) -> Array: ... -def field(*name_or_index: int | str | tuple[int | str]): ... -def scalar(value: bool | int | float | str) -> Expression: ... + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... +def scalar(value: bool | float | str) -> Expression: ... diff --git a/pyarrow-stubs/csv.pyi b/pyarrow-stubs/csv.pyi deleted file mode 100644 index d1d481b9eaa..00000000000 --- a/pyarrow-stubs/csv.pyi +++ /dev/null @@ -1,11 +0,0 @@ -from pyarrow._csv import ISO8601 as ISO8601 -from pyarrow._csv import ConvertOptions as ConvertOptions -from pyarrow._csv import CSVStreamingReader as CSVStreamingReader -from pyarrow._csv import CSVWriter as CSVWriter -from pyarrow._csv import InvalidRow as InvalidRow -from pyarrow._csv import ParseOptions as ParseOptions -from pyarrow._csv import ReadOptions as ReadOptions -from pyarrow._csv import WriteOptions as WriteOptions -from pyarrow._csv import open_csv as open_csv -from pyarrow._csv import read_csv as read_csv -from pyarrow._csv import write_csv as write_csv diff --git a/pyarrow-stubs/cuda.pyi b/pyarrow-stubs/cuda.pyi deleted file mode 100644 index 2fd7051ae40..00000000000 --- a/pyarrow-stubs/cuda.pyi +++ /dev/null @@ -1,10 +0,0 @@ -from pyarrow._cuda import BufferReader as BufferReader -from pyarrow._cuda import BufferWriter as BufferWriter -from pyarrow._cuda import Context as Context -from pyarrow._cuda import CudaBuffer as CudaBuffer -from pyarrow._cuda import HostBuffer as HostBuffer -from pyarrow._cuda import IpcMemHandle as IpcMemHandle -from pyarrow._cuda import new_host_buffer as new_host_buffer -from pyarrow._cuda import read_message as read_message -from pyarrow._cuda import read_record_batch as read_record_batch -from pyarrow._cuda import serialize_record_batch as serialize_record_batch diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 453906a722d..1fcc4361c4a 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -1,97 +1,214 @@ -from os import PathLike -from typing import Callable -from typing import Iterable +from pathlib import Path +from typing import Callable, Iterable, Literal, TypeAlias, overload -from pyarrow._dataset import CsvFileFormat as CsvFileFormat -from pyarrow._dataset import CsvFragmentScanOptions as CsvFragmentScanOptions -from pyarrow._dataset import Dataset as Dataset -from pyarrow._dataset import DatasetFactory as DatasetFactory -from pyarrow._dataset import DirectoryPartitioning as DirectoryPartitioning -from pyarrow._dataset import FeatherFileFormat as FeatherFileFormat -from pyarrow._dataset import FileFormat as FileFormat -from pyarrow._dataset import FileFragment as FileFragment -from pyarrow._dataset 
import FilenamePartitioning as FilenamePartitioning -from pyarrow._dataset import FileSystemDataset as FileSystemDataset -from pyarrow._dataset import FileSystemDatasetFactory as FileSystemDatasetFactory -from pyarrow._dataset import FileSystemFactoryOptions as FileSystemFactoryOptions -from pyarrow._dataset import FileWriteOptions as FileWriteOptions -from pyarrow._dataset import Fragment as Fragment -from pyarrow._dataset import FragmentScanOptions as FragmentScanOptions -from pyarrow._dataset import HivePartitioning as HivePartitioning -from pyarrow._dataset import InMemoryDataset as InMemoryDataset -from pyarrow._dataset import IpcFileFormat as IpcFileFormat -from pyarrow._dataset import IpcFileWriteOptions as IpcFileWriteOptions -from pyarrow._dataset import Partitioning as Partitioning -from pyarrow._dataset import PartitioningFactory as PartitioningFactory -from pyarrow._dataset import Scanner as Scanner -from pyarrow._dataset import TaggedRecordBatch as TaggedRecordBatch -from pyarrow._dataset import UnionDataset as UnionDataset -from pyarrow._dataset import UnionDatasetFactory as UnionDatasetFactory -from pyarrow._dataset import WrittenFile as WrittenFile -from pyarrow._dataset_orc import OrcFileFormat as OrcFileFormat -from pyarrow._dataset_parquet import ParquetDatasetFactory as ParquetDatasetFactory -from pyarrow._dataset_parquet import ParquetFactoryOptions as ParquetFactoryOptions -from pyarrow._dataset_parquet import ParquetFileFormat as ParquetFileFormat -from pyarrow._dataset_parquet import ParquetFileFragment as ParquetFileFragment -from pyarrow._dataset_parquet import ParquetFileWriteOptions as ParquetFileWriteOptions -from pyarrow._dataset_parquet import ParquetFragmentScanOptions as ParquetFragmentScanOptions -from pyarrow._dataset_parquet import ParquetReadOptions as ParquetReadOptions -from pyarrow._dataset_parquet import RowGroupInfo as RowGroupInfo -from pyarrow.compute import Expression as Expression -from pyarrow.compute import field as field -from pyarrow.compute import scalar as scalar -from pyarrow.dataset import Dataset -from pyarrow.filesystem import FileSystem -from pyarrow.lib import Array -from pyarrow.lib import RecordBatch -from pyarrow.lib import RecordBatchReader -from pyarrow.lib import Schema -from pyarrow.lib import Table -from typing_extensions import Literal +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table -def __getattr__(name: str) -> None: ... 
+from ._fs import FileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + field_names: list[str], + *, + flavor: Literal["filename"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + *, + flavor: Literal["hive"], +) -> PartitioningFactory: ... +@overload def partitioning( - schema: Schema | None = ..., - field_names: list[str] | None = ..., - flavor: str | None = ..., - dictionaries: dict[str, Array] | None = ..., -) -> Partitioning | PartitioningFactory: ... + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... def parquet_dataset( - metadata_path: str | PathLike, - schema: Schema | None = ..., - filesystem: FileSystem | str | None = ..., - format: ParquetFileFormat | str | None = ..., - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = ..., - partition_base_dir: str | None = ..., + metadata_path: str | Path, + schema: Schema | None = None, + filesystem: FileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: str | list[str] | Path | list[Path], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: FileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, ) -> FileSystemDataset: ... 
+@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: FileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... +@overload def dataset( - source: str | Dataset | Iterable[str | Dataset | RecordBatch | RecordBatchReader], - schema: Schema | None = ..., - format: FileFormat | str | None = ..., - filesystem: FileSystem | str | None = ..., - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = ..., - partition_base_dir: str | None = ..., - exclude_invalid_files: bool | None = ..., - ignore_prefixes: list[str] | None = ..., -) -> Dataset: ... + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: FileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... def write_dataset( - data: Dataset | Table | RecordBatch | RecordBatchReader | Iterable[Table | RecordBatch], + data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], base_dir: str, *, - basename_template: str | None = ..., - format: FileFormat | str | None = ..., - partitioning: Partitioning | list[str] | None = ..., - partitioning_flavor: str | None = ..., - schema: Schema | None = ..., - filesystem: FileSystem | None = ..., - file_options: FileWriteOptions | None = ..., - use_threads: bool = ..., - max_partitions: int | None = ..., - max_open_files: int | None = ..., - max_rows_per_file: int | None = ..., - min_rows_per_group: int | None = ..., - max_rows_per_group: int | None = ..., - file_visitor: Callable[[WrittenFile], None] | None = ..., - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = ..., - create_dir: bool = ..., -) -> None: ... + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: FileSystem | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, +): ... 
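To round out the dataset()/write_dataset() overloads typed above, a short hedged sketch (directory name and column names are placeholders):

import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"year": [2022, 2023], "value": [1.5, 2.5]})

# Write a directory-partitioned dataset; a list of column names selects
# the default (directory) partitioning flavor.
ds.write_dataset(table, "out_dir", format="parquet", partitioning=["year"])

# Re-discover it; the str/Path overload resolves to a FileSystemDataset.
dataset = ds.dataset("out_dir", format="parquet", partitioning=["year"])
print(dataset.to_table(filter=ds.field("value") > 2.0).num_rows)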
diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi index 280bfd58f32..361d726ae0b 100644 --- a/pyarrow-stubs/feather.pyi +++ b/pyarrow-stubs/feather.pyi @@ -1,62 +1,49 @@ -from io import IOBase -from typing import overload +from typing import IO, Literal import pandas as pd -from pyarrow._feather import FeatherError as FeatherError -from pyarrow.lib import ChunkedArray -from pyarrow.lib import Codec as Codec -from pyarrow.lib import NativeFile -from pyarrow.lib import Schema -from pyarrow.lib import Table as Table -from pyarrow.lib import concat_tables as concat_tables -from pyarrow.lib import schema as schema -from pyarrow.vendored.version import Version as Version -from typing_extensions import Literal +from pyarrow._feather import FeatherError +from pyarrow.lib import Table + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] class FeatherDataset: - paths: list[str] + path_or_paths: str | list[str] validate_schema: bool - schema: Schema - def __init__(self, path_or_paths: list[str], validate_schema: bool = ...) -> None: ... - def read_table(self, columns: list[str] | None = ...) -> Table: ... - def validate_schemas(self, piece: str, table: Table) -> None: ... + + def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... def read_pandas( - self, columns: list[str] | None = ..., use_threads: bool = ... + self, columns: list[str] | None = None, use_threads: bool = True ) -> pd.DataFrame: ... -def check_chunked_overflow(name: str, col: ChunkedArray) -> None: ... +def check_chunked_overflow(name: str, col) -> None: ... def write_feather( - df: pd.DataFrame, + df: pd.DataFrame | Table, dest: str, - compression: Literal["zstd", "lz4", "uncompressed"] | None = ..., - compression_level: int | None = ..., - chunksize: int | None = ..., - version: int = ..., + compression: Literal["zstd", "lz4", "uncompressed"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, ) -> None: ... -@overload -def read_feather( - source: str, - columns: list[str] | None = ..., - use_threads: bool = ..., - memory_map: Literal[True] = ..., -) -> pd.DataFrame: ... -@overload def read_feather( - source: str | NativeFile | IOBase, - columns: list[str] | None = ..., - use_threads: bool = ..., + source: str | IO, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, ) -> pd.DataFrame: ... -@overload -def read_table( - source: str | NativeFile | IOBase, - columns: list[str] | None = ..., - use_threads: bool = ..., -) -> Table: ... -@overload def read_table( - source: str, - columns: list[str] | None = ..., - memory_map: Literal[True] = ..., - use_threads: bool = ..., + source: str | IO, + columns: list[str] | None = None, + memory_map: bool = False, + use_threads: bool = True, ) -> Table: ... diff --git a/pyarrow-stubs/filesystem.pyi b/pyarrow-stubs/filesystem.pyi deleted file mode 100644 index 286ab48ceb7..00000000000 --- a/pyarrow-stubs/filesystem.pyi +++ /dev/null @@ -1,49 +0,0 @@ -from os import PathLike -from typing import Generator - -from pyarrow import Table -from pyarrow import parquet -from pyarrow._gcsfs import GcsFileSystem -from pyarrow._s3fs import S3FileSystem - -class FileSystem: - def cat(self, path: str) -> bytes: ... 
- def ls(self, path: str) -> list[str]: ... - def delete(self, path: str, recursive: bool = ...) -> None: ... - def disk_usage(self, path: str) -> int: ... - def stat(self, path: str) -> dict: ... - def rm(self, path: str, recursive: bool = ...): ... - def mv(self, path: str, new_path: str): ... - def rename(self, path: str, new_path: str) -> None: ... - def mkdir(self, path: str, create_parents: bool = ...) -> None: ... - def exists(self, path: str) -> bool: ... - def isdir(self, path: str) -> bool: ... - def isfile(self, path: str) -> bool: ... - def read_parquet( - self, - path: str, - columns: list[str] | None = ..., - metadata: parquet.FileMetaData | None = ..., # type: ignore - schema: parquet.ParquetSchema | None = ..., # type: ignore - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> Table: ... - def open(self, path: str, mode: str = ...) -> None: ... - @property - def pathsep(self) -> str: ... - -class LocalFileSystem(FileSystem): - def __init__(self) -> None: ... - @classmethod - def get_instance(cls) -> LocalFileSystem: ... - def walk(self, path: str) -> Generator[tuple[str, list[str], list[str]], None, None]: ... - -class DaskFileSystem(FileSystem): - fs: S3FileSystem | GcsFileSystem - def __init__(self, fs: S3FileSystem | GcsFileSystem) -> None: ... - -class S3FSWrapper(DaskFileSystem): ... - -def resolve_filesystem_and_path( - where: str | PathLike, filesystem: FileSystem | None = ... -) -> tuple[FileSystem | None, str]: ... diff --git a/pyarrow-stubs/flight.pyi b/pyarrow-stubs/flight.pyi index 5377495d085..9b806ccf305 100644 --- a/pyarrow-stubs/flight.pyi +++ b/pyarrow-stubs/flight.pyi @@ -1,45 +1,95 @@ -from pyarrow._flight import Action as Action -from pyarrow._flight import ActionType as ActionType -from pyarrow._flight import BasicAuth as BasicAuth -from pyarrow._flight import CallInfo as CallInfo -from pyarrow._flight import CertKeyPair as CertKeyPair -from pyarrow._flight import ClientAuthHandler as ClientAuthHandler -from pyarrow._flight import ClientMiddleware as ClientMiddleware -from pyarrow._flight import ClientMiddlewareFactory as ClientMiddlewareFactory -from pyarrow._flight import DescriptorType as DescriptorType -from pyarrow._flight import FlightCallOptions as FlightCallOptions -from pyarrow._flight import FlightCancelledError as FlightCancelledError -from pyarrow._flight import FlightClient as FlightClient -from pyarrow._flight import FlightDataStream as FlightDataStream -from pyarrow._flight import FlightDescriptor as FlightDescriptor -from pyarrow._flight import FlightEndpoint as FlightEndpoint -from pyarrow._flight import FlightError as FlightError -from pyarrow._flight import FlightInfo as FlightInfo -from pyarrow._flight import FlightInternalError as FlightInternalError -from pyarrow._flight import FlightMetadataReader as FlightMetadataReader -from pyarrow._flight import FlightMetadataWriter as FlightMetadataWriter -from pyarrow._flight import FlightMethod as FlightMethod -from pyarrow._flight import FlightServerBase as FlightServerBase -from pyarrow._flight import FlightServerError as FlightServerError -from pyarrow._flight import FlightStreamChunk as FlightStreamChunk -from pyarrow._flight import FlightStreamReader as FlightStreamReader -from pyarrow._flight import FlightStreamWriter as FlightStreamWriter -from pyarrow._flight import FlightTimedOutError as FlightTimedOutError -from pyarrow._flight import FlightUnauthenticatedError as FlightUnauthenticatedError -from pyarrow._flight import FlightUnauthorizedError as 
FlightUnauthorizedError -from pyarrow._flight import FlightUnavailableError as FlightUnavailableError -from pyarrow._flight import FlightWriteSizeExceededError as FlightWriteSizeExceededError -from pyarrow._flight import GeneratorStream as GeneratorStream -from pyarrow._flight import Location as Location -from pyarrow._flight import MetadataRecordBatchReader as MetadataRecordBatchReader -from pyarrow._flight import MetadataRecordBatchWriter as MetadataRecordBatchWriter -from pyarrow._flight import RecordBatchStream as RecordBatchStream -from pyarrow._flight import Result as Result -from pyarrow._flight import SchemaResult as SchemaResult -from pyarrow._flight import ServerAuthHandler as ServerAuthHandler -from pyarrow._flight import ServerCallContext as ServerCallContext -from pyarrow._flight import ServerMiddleware as ServerMiddleware -from pyarrow._flight import ServerMiddlewareFactory as ServerMiddlewareFactory -from pyarrow._flight import Ticket as Ticket -from pyarrow._flight import TracingServerMiddlewareFactory as TracingServerMiddlewareFactory -from pyarrow._flight import connect as connect +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi index 02e5e9cfd14..a185cb640c1 100644 --- a/pyarrow-stubs/fs.pyi +++ b/pyarrow-stubs/fs.pyi @@ -1,54 +1,76 @@ -from _typeshed import Incomplete -from pyarrow import PythonFile -from pyarrow._fs import FileInfo as FileInfo -from pyarrow._fs import FileSelector as FileSelector -from pyarrow._fs import FileSystem as FileSystem -from pyarrow._fs import FileSystemHandler as FileSystemHandler -from pyarrow._fs import 
FileType as FileType -from pyarrow._fs import LocalFileSystem as LocalFileSystem -from pyarrow._fs import PyFileSystem as PyFileSystem -from pyarrow._fs import SubTreeFileSystem as SubTreeFileSystem -from pyarrow._gcsfs import GcsFileSystem as GcsFileSystem -from pyarrow._hdfs import HadoopFileSystem as HadoopFileSystem -from pyarrow._s3fs import AwsDefaultS3RetryStrategy as AwsDefaultS3RetryStrategy -from pyarrow._s3fs import AwsStandardS3RetryStrategy as AwsStandardS3RetryStrategy -from pyarrow._s3fs import S3FileSystem as S3FileSystem -from pyarrow._s3fs import S3LogLevel as S3LogLevel -from pyarrow._s3fs import S3RetryStrategy as S3RetryStrategy -from pyarrow._s3fs import finalize_s3 as finalize_s3 -from pyarrow._s3fs import initialize_s3 as initialize_s3 -from pyarrow._s3fs import resolve_s3_region as resolve_s3_region +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) FileStats = FileInfo -def __getattr__(name: str) -> None: ... def copy_files( source: str, destination: str, - source_filesystem: FileSystem | None = ..., - destination_filesystem: FileSystem | None = ..., + source_filesystem: FileSystem | None = None, + destination_filesystem: FileSystem | None = None, *, - chunk_size: int = ..., - use_threads: bool = ..., + chunk_size: int = 1024 * 1024, + use_threads: bool = True, ) -> None: ... -class FSSpecHandler(FileSystemHandler): - fs: Incomplete - def __init__(self, fs) -> None: ... - def __eq__(self, other) -> bool: ... - def __ne__(self, other) -> bool: ... - def get_type_name(self) -> str: ... - def normalize_path(self, path: str) -> str: ... - def get_file_info(self, paths: list[str]) -> list[FileInfo]: ... - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... - def create_dir(self, path: str, recursive: bool) -> None: ... - def delete_dir(self, path: str) -> None: ... - def delete_dir_contents(self, path: str, missing_dir_ok: bool) -> None: ... # type: ignore - def delete_root_dir_contents(self) -> None: ... - def delete_file(self, path: str) -> None: ... - def move(self, src: str, dest: str) -> None: ... - def copy_file(self, src: str, dest: str) -> None: ... - def open_input_stream(self, path: str) -> PythonFile: ... - def open_input_file(self, path: str) -> PythonFile: ... - def open_output_stream(self, path: str, metadata: dict[str, str]) -> PythonFile: ... - def open_append_stream(self, path: str, metadata: dict[str, str]) -> PythonFile: ... +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: FileSystem + def __init__(self, fs: FileSystem) -> None: ... 
+ +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/pyarrow-stubs/gandiva.pyi b/pyarrow-stubs/gandiva.pyi new file mode 100644 index 00000000000..a344f885b29 --- /dev/null +++ b/pyarrow-stubs/gandiva.pyi @@ -0,0 +1,65 @@ +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... 
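As a reference point for the gandiva.pyi signatures above, here is a minimal usage sketch of the Gandiva expression API at runtime. It assumes a pyarrow build that ships the optional Gandiva component; the column names and the "add" function are illustrative only.

import pyarrow as pa
import pyarrow.gandiva as gandiva

# Describe the input schema and build an expression tree for "a + b".
field_a = pa.field("a", pa.int64())
field_b = pa.field("b", pa.int64())
schema = pa.schema([field_a, field_b])

builder = gandiva.TreeExprBuilder()
node_a = builder.make_field(field_a)
node_b = builder.make_field(field_b)
sum_node = builder.make_function("add", [node_a, node_b], pa.int64())
expr = builder.make_expression(sum_node, pa.field("a_plus_b", pa.int64()))

# Compile the expression into a Projector and evaluate it against a RecordBatch.
projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3], type=pa.int64()), pa.array([10, 20, 30], type=pa.int64())],
    schema=schema,
)
(result,) = projector.evaluate(batch)  # Int64Array of [11, 22, 33]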
diff --git a/pyarrow-stubs/hdfs.pyi b/pyarrow-stubs/hdfs.pyi deleted file mode 100644 index 1e7f18b798f..00000000000 --- a/pyarrow-stubs/hdfs.pyi +++ /dev/null @@ -1,28 +0,0 @@ -from collections.abc import Generator - -import pyarrow._hdfsio as _hdfsio - -from _typeshed import Incomplete -from pyarrow.filesystem import FileSystem as FileSystem -from pyarrow.util import implements as implements - -class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): # type: ignore - def __init__( - self, - host: str = ..., - port: int = ..., - user: str | None = ..., - kerb_ticket: Incomplete | None = ..., - driver: str = ..., - extra_conf: Incomplete | None = ..., - ) -> None: ... - def __reduce__(self) -> tuple: ... - def walk(self, top_path: str) -> Generator[tuple[str, list[str], list[str]], None, None]: ... - -def connect( - host: str = ..., - port: int = ..., - user: Incomplete | None = ..., - kerb_ticket: Incomplete | None = ..., - extra_conf: Incomplete | None = ..., -): ... diff --git a/pyarrow-stubs/interchange/__init__.pyi b/pyarrow-stubs/interchange/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pyarrow-stubs/interchange/buffer.pyi b/pyarrow-stubs/interchange/buffer.pyi new file mode 100644 index 00000000000..50bbd3a1238 --- /dev/null +++ b/pyarrow-stubs/interchange/buffer.pyi @@ -0,0 +1,22 @@ +import enum + +from pyarrow.lib import Buffer + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class _PyArrowBuffer: + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: ... + @property + def ptr(self) -> int: ... + def __dlpack__(self): ... + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... diff --git a/pyarrow-stubs/interchange/column.pyi b/pyarrow-stubs/interchange/column.pyi new file mode 100644 index 00000000000..fd6600a604b --- /dev/null +++ b/pyarrow-stubs/interchange/column.pyi @@ -0,0 +1,62 @@ +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + +class ColumnNullType(enum.IntEnum): + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + +class NoBufferPresent(Exception): ... + +class _PyArrowColumn: + def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def size(self) -> int: ... + @property + def offset(self) -> int: ... + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: ... + @property + def describe_categorical(self) -> CategoricalDescription: ... + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: ... + @property + def null_count(self) -> int: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_chunks(self) -> int: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... 
+ def get_buffers(self) -> ColumnBuffers: ... diff --git a/pyarrow-stubs/interchange/dataframe.pyi b/pyarrow-stubs/interchange/dataframe.pyi new file mode 100644 index 00000000000..880b8b6e80a --- /dev/null +++ b/pyarrow-stubs/interchange/dataframe.pyi @@ -0,0 +1,24 @@ +from typing import Any, Iterable, Self, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + +class _PyArrowDataFrame: + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def num_chunks(self) -> int: ... + def column_names(self) -> Iterable[str]: ... + def get_column(self, i: int) -> _PyArrowColumn: ... + def get_column_by_name(self, name: str) -> _PyArrowColumn: ... + def get_columns(self) -> Iterable[_PyArrowColumn]: ... + def select_columns(self, indices: Sequence[int]) -> Self: ... + def select_columns_by_name(self, names: Sequence[str]) -> Self: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... diff --git a/pyarrow-stubs/interchange/from_dataframe.pyi b/pyarrow-stubs/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..352bead7f25 --- /dev/null +++ b/pyarrow-stubs/interchange/from_dataframe.pyi @@ -0,0 +1,49 @@ +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + +ColumnObject: TypeAlias = Any + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: ... +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: ... +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... 
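The interchange stubs above type pyarrow's side of the DataFrame interchange protocol (__dataframe__). A short sketch of how the protocol is typically consumed, assuming pandas is installed; from_dataframe accepts any object exposing __dataframe__ and produces a pyarrow Table.

import pandas as pd
import pyarrow as pa
from pyarrow.interchange import from_dataframe

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})

# Convert via the protocol rather than a pandas-specific code path.
table = from_dataframe(df)
assert isinstance(table, pa.Table)

# Arrow tables expose the same protocol, so other libraries can consume them
# through their own from_dataframe implementations.
interchange_obj = table.__dataframe__()
print(interchange_obj.num_columns(), interchange_obj.num_rows())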
diff --git a/pyarrow-stubs/ipc.pyi b/pyarrow-stubs/ipc.pyi index 095ad106e2e..bedcaecaa5b 100644 --- a/pyarrow-stubs/ipc.pyi +++ b/pyarrow-stubs/ipc.pyi @@ -3,97 +3,121 @@ from io import IOBase import pandas as pd import pyarrow.lib as lib -from pyarrow import ipc -from pyarrow.lib import Buffer -from pyarrow.lib import IpcReadOptions as IpcReadOptions -from pyarrow.lib import IpcWriteOptions as IpcWriteOptions -from pyarrow.lib import MemoryPool -from pyarrow.lib import Message as Message -from pyarrow.lib import MessageReader as MessageReader -from pyarrow.lib import MetadataVersion as MetadataVersion -from pyarrow.lib import NativeFile -from pyarrow.lib import ReadStats as ReadStats -from pyarrow.lib import RecordBatchReader as RecordBatchReader -from pyarrow.lib import Schema -from pyarrow.lib import WriteStats as WriteStats -from pyarrow.lib import get_record_batch_size as get_record_batch_size -from pyarrow.lib import get_tensor_size as get_tensor_size -from pyarrow.lib import read_message as read_message -from pyarrow.lib import read_record_batch as read_record_batch -from pyarrow.lib import read_schema as read_schema -from pyarrow.lib import read_tensor as read_tensor -from pyarrow.lib import write_tensor as write_tensor +from pyarrow.lib import ( + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) class RecordBatchStreamReader(lib._RecordBatchStreamReader): def __init__( self, - source: bytes | memoryview | Buffer | NativeFile | IOBase, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, *, - options: ipc.IpcReadOptions | None = ..., - memory_pool: MemoryPool | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> None: ... class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): def __init__( self, - sink: str | Buffer | NativeFile | IOBase, - schema: Schema, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, *, - use_legacy_format: bool | None = ..., - options: ipc.IpcWriteOptions | None = ..., + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, ) -> None: ... class RecordBatchFileReader(lib._RecordBatchFileReader): def __init__( self, - source: bytes | memoryview | Buffer | NativeFile | IOBase, - footer_offset: int | None = ..., + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, *, - options: ipc.IpcReadOptions | None = ..., - memory_pool: MemoryPool | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> None: ... class RecordBatchFileWriter(lib._RecordBatchFileWriter): def __init__( self, - sink: str | Buffer | NativeFile | IOBase, - schema: Schema, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, *, - use_legacy_format: bool | None = ..., - options: ipc.IpcWriteOptions | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> None: ... def new_stream( - sink: str | Buffer | NativeFile | IOBase, - schema: Schema, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, *, - use_legacy_format: bool | None = ..., - options: ipc.IpcWriteOptions | None = ..., + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, ) -> RecordBatchStreamWriter: ... 
def open_stream( - source: bytes | memoryview | Buffer | NativeFile | IOBase, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, *, - options: ipc.IpcReadOptions | None = ..., - memory_pool: MemoryPool | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchStreamReader: ... def new_file( - sink: str | NativeFile | IOBase, - schema: Schema, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, *, - use_legacy_format: bool | None = ..., - options: ipc.IpcWriteOptions | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchFileWriter: ... def open_file( - source: bytes | memoryview | Buffer | NativeFile | IOBase, - footer_offset: int | None = ..., + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, *, - options: ipc.IpcReadOptions | None = ..., - memory_pool: MemoryPool | None = ..., + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchFileReader: ... def serialize_pandas( - df: pd.DataFrame, - *, - nthreads: int | None = ..., - preserve_index: bool | None = ..., -) -> Buffer: ... -def deserialize_pandas(buf: memoryview | Buffer, *, use_threads: bool = ...) -> pd.DataFrame: ... + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... +def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/pyarrow-stubs/json.pyi b/pyarrow-stubs/json.pyi index 59f0939f480..0a1957e18af 100644 --- a/pyarrow-stubs/json.pyi +++ b/pyarrow-stubs/json.pyi @@ -1,3 +1,3 @@ -from pyarrow._json import ParseOptions as ParseOptions -from pyarrow._json import ReadOptions as ReadOptions -from pyarrow._json import read_json as read_json +from pyarrow._json import ParseOptions, ReadOptions, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json"] diff --git a/pyarrow-stubs/jvm.pyi b/pyarrow-stubs/jvm.pyi deleted file mode 100644 index 02372c5baa0..00000000000 --- a/pyarrow-stubs/jvm.pyi +++ /dev/null @@ -1,17 +0,0 @@ -from _typeshed import Incomplete -from pyarrow.lib import Array -from pyarrow.lib import Buffer -from pyarrow.lib import Field -from pyarrow.lib import RecordBatch -from pyarrow.lib import Schema - -class _JvmBufferNanny: - ref_manager: Incomplete - def __init__(self, jvm_buf) -> None: ... - def __del__(self) -> None: ... - -def jvm_buffer(jvm_buf) -> Buffer: ... -def field(jvm_field) -> Field: ... -def schema(jvm_schema) -> Schema: ... -def array(jvm_array) -> Array: ... -def record_batch(jvm_vector_schema_root) -> RecordBatch: ... 
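For the ipc.pyi signatures above, a minimal round-trip sketch using the Arrow IPC file format with an in-memory sink (plain pyarrow API; nothing specific to these stubs is assumed).

import pyarrow as pa
import pyarrow.ipc as ipc

table = pa.table({"x": [1, 2, 3]})

# Write the table to an in-memory buffer in the IPC file format.
sink = pa.BufferOutputStream()
with ipc.new_file(sink, table.schema) as writer:
    writer.write_table(table)
buf = sink.getvalue()

# Read it back with a RecordBatchFileReader.
reader = ipc.open_file(buf)
roundtripped = reader.read_all()
assert roundtripped.equals(table)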
diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index b0a7e3600f4..d6b6f7e51c7 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -1,2185 +1,70 @@ -import _io # type: ignore -import collections.abc -import datetime as dt -import enum -import importlib._bootstrap # type: ignore -import io - -from decimal import Decimal -from os import PathLike -from types import ModuleType -from typing import Any -from typing import Callable -from typing import ClassVar -from typing import Generator -from typing import Generic -from typing import ItemsView -from typing import Iterable -from typing import KeysView +# ruff: noqa: F403 from typing import NamedTuple -from typing import TypeVar -from typing import ValuesView -from typing import overload - -import numpy as np -import pandas as pd - -from numpy.typing import ArrayLike -from numpy.typing import DTypeLike -from numpy.typing import NDArray -from pyarrow.compute import CastOptions -from pyarrow.compute import FunctionOptions -from typing_extensions import Buffer as _Buffer -from typing_extensions import Literal -from typing_extensions import TypeAlias -from typing_extensions import TypeGuard - -_ArrowType: TypeAlias = int | DataType -_builtin_slice = slice -DEFAULT_BUFFER_SIZE: int -NA: NullScalar -Type_BINARY: _ArrowType -Type_BOOL: _ArrowType -Type_DATE32: _ArrowType -Type_DATE64: _ArrowType -Type_DECIMAL128: _ArrowType -Type_DECIMAL256: _ArrowType -Type_DENSE_UNION: _ArrowType -Type_DICTIONARY: _ArrowType -Type_DOUBLE: _ArrowType -Type_DURATION: _ArrowType -Type_FIXED_SIZE_BINARY: _ArrowType -Type_FIXED_SIZE_LIST: _ArrowType -Type_FLOAT: _ArrowType -Type_HALF_FLOAT: _ArrowType -Type_INT16: _ArrowType -Type_INT32: _ArrowType -Type_INT64: _ArrowType -Type_INT8: _ArrowType -Type_INTERVAL_MONTH_DAY_NANO: _ArrowType -Type_LARGE_BINARY: _ArrowType -Type_LARGE_LIST: _ArrowType -Type_LARGE_STRING: _ArrowType -Type_LIST: _ArrowType -Type_MAP: _ArrowType -Type_NA: _ArrowType -Type_SPARSE_UNION: _ArrowType -Type_STRING: _ArrowType -Type_STRUCT: _ArrowType -Type_TIME32: _ArrowType -Type_TIME64: _ArrowType -Type_TIMESTAMP: _ArrowType -Type_UINT16: _ArrowType -Type_UINT32: _ArrowType -Type_UINT64: _ArrowType -Type_UINT8: _ArrowType -UnionMode_DENSE: int -UnionMode_SPARSE: int -V1: importlib._bootstrap.MetadataVersion -V2: importlib._bootstrap.MetadataVersion -V3: importlib._bootstrap.MetadataVersion -V4: importlib._bootstrap.MetadataVersion -V5: importlib._bootstrap.MetadataVersion -_NULL: NullScalar -__pc: ModuleType | None -_break_traceback_cycle_from_frame: Callable -_default_context_initialized: bool -_default_serialization_context: SerializationContext -_is_path_like: Callable -_pandas_api: _PandasAPIShim -_python_extension_types_registry: list -_registry_nanny: _ExtensionRegistryNanny -_stringify_path: Callable -contextmanager: Callable -cpp_build_info: importlib._bootstrap.BuildInfo -cpp_version: str -cpp_version_info: importlib._bootstrap.VersionInfo -have_signal_refcycle: bool -namedtuple: Callable -builtin_pickle: Callable - -class PyCapsule: ... - -_Self = TypeVar("_Self") - -_Array = TypeVar("_Array", bound=Array) -_ChunkedArray = TypeVar("_ChunkedArray", bound=ChunkedArray) - -_T = TypeVar("_T") -_T2 = TypeVar("_T2") -_Scalar = TypeVar("_Scalar", bound=Scalar) - -class Array(_PandasConvertibleToSeries, Generic[_T, _Scalar]): - _name: Any - nbytes: int - null_count: int - offset: int - type: DataType[_T] - def __init__(self) -> None: ... - def _debug_print(self) -> Any: ... 
- @staticmethod - def _export_to_c(out_ptr: int, out_schema_ptr: int | None = ...) -> Array: ... - @staticmethod - def _import_from_c(in_ptr: int, type: DataType | int) -> Array: ... - def _to_pandas( - self, - options: dict[str, Any], - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype | None] | None = ..., - **kwargs, - ) -> pd.Series: ... - def buffers(self) -> list[Buffer | None]: ... - @overload - def cast( - self, - target_type: Literal["bool", "boolean"], - safe: bool = ..., - options: CastOptions = ..., - ) -> BooleanArray: ... - @overload - def cast( - self, - target_type: Literal["i1", "int8"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Int8Array: ... - @overload - def cast( - self, - target_type: Literal["i2", "int16"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Int16Array: ... - @overload - def cast( - self, - target_type: Literal["i4", "int32"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Int32Array: ... - @overload - def cast( - self, - target_type: Literal["i8", "int64"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Int64Array: ... - @overload - def cast( - self, - target_type: Literal["u1", "uint8"], - safe: bool = ..., - options: CastOptions = ..., - ) -> UInt8Array: ... - @overload - def cast( - self, - target_type: Literal["u2", "uint16"], - safe: bool = ..., - options: CastOptions = ..., - ) -> UInt16Array: ... - @overload - def cast( - self, - target_type: Literal["u4", "uint32"], - safe: bool = ..., - options: CastOptions = ..., - ) -> UInt32Array: ... - @overload - def cast( - self, - target_type: Literal["u8", "uint64"], - safe: bool = ..., - options: CastOptions = ..., - ) -> UInt64Array: ... - @overload - def cast( - self, - target_type: Literal["f2", "halffloat", "float16"], - safe: bool = ..., - options: CastOptions = ..., - ) -> HalfFloatArray: ... - @overload - def cast( - self, - target_type: Literal["f4", "float", "float32"], - safe: bool = ..., - options: CastOptions = ..., - ) -> FloatArray: ... - @overload - def cast( - self, - target_type: Literal["f8", "double", "float64"], - safe: bool = ..., - options: CastOptions = ..., - ) -> DoubleArray: ... - @overload - def cast( - self, - target_type: Literal["string", "str", "utf8"], - safe: bool = ..., - options: CastOptions = ..., - ) -> StringArray: ... - @overload - def cast( - self, - target_type: Literal["binary"], - safe: bool = ..., - options: CastOptions = ..., - ) -> BinaryArray: ... - @overload - def cast( - self, - target_type: Literal["large_string", "large_str", "large_utf8"], - safe: bool = ..., - options: CastOptions = ..., - ) -> LargeStringArray: ... - @overload - def cast( - self, - target_type: Literal["large_binary"], - safe: bool = ..., - options: CastOptions = ..., - ) -> LargeBinaryArray: ... - @overload - def cast( - self, - target_type: Literal["date32", "date32[day]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Date32Array: ... - @overload - def cast( - self, - target_type: Literal["date64", "date64[ms]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Date64Array: ... - @overload - def cast( - self, - target_type: Literal["time32[s]", "time32[ms]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Time32Array: ... - @overload - def cast( - self, - target_type: Literal["time64[us]", "time64[ns]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> Time64Array: ... 
- @overload - def cast( - self, - target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> TimestampArray: ... - @overload - def cast( - self, - target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], - safe: bool = ..., - options: CastOptions = ..., - ) -> DurationArray: ... - @overload - def cast( - self, - target_type: Literal["month_day_nano_interval"], - safe: bool = ..., - options: CastOptions = ..., - ) -> MonthDayNanoIntervalArray: ... - @overload - def cast( - self, - target_type: DataType[_T2] | None = ..., - safe: bool = ..., - options: CastOptions = ..., - ) -> Array[_T2, Scalar[_T2]]: ... - def dictionary_encode(self, null_encoding: str = ...) -> DictionaryArray: ... - def diff(self, other: Array) -> str: ... - def drop_null(self: _Array) -> _Array: ... - def equals(self, other: Array) -> bool: ... - def fill_null(self: _Array, fill_value: _T) -> _Array: ... - def filter( - self: _Array, - mask: list[bool] | BooleanArray, - *, - null_selection_behavior: Literal["drop", "emit_null"] = ..., - ) -> _Array: ... - def format(self, **kwargs) -> Any: ... - @staticmethod - def from_buffers( - type: DataType, - length: int, - buffers: list[Buffer], - null_count: int = ..., - offset: int = ..., - children: list[_Array] = ..., - ) -> _Array: ... - @staticmethod - def from_pandas( - obj: pd.Series | ArrayLike, - mask: BooleanArray = ..., - type: DataType[_T2] = ..., - safe: bool = ..., - memory_pool: MemoryPool = ..., - ) -> Array[_T2, Scalar[_T2]] | ChunkedArray[_T2, Scalar[_T2]]: ... - def get_total_buffer_size(self) -> int: ... - def index( - self, - value: Scalar | object, - start: int | None = ..., - end: int | None = ..., - *, - memory_pool: MemoryPool | None = ..., - ) -> Int64Scalar: ... - def is_null(self, *, nan_is_null: bool = ...) -> BooleanArray: ... - def is_valid(self) -> BooleanArray: ... - def slice(self: _Array, offset: int = ..., length: int | None = ...) -> _Array: ... - def sum(self, **kwargs) -> Any: ... - def take( - self: _Array, - indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], - ) -> _Array: ... - def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> NDArray: ... - def to_pylist(self) -> list[_T]: ... - def to_string( - self, - *, - indent: int = ..., - top_level_indent: int = ..., - window: int = ..., - container_window: int = ..., - skip_new_lines: bool = ..., - ) -> str: ... - def tolist(self) -> list[_T]: ... - def unique(self: _Array) -> _Array: ... - def validate(self, *, full: bool = ...) -> None: ... - def value_counts(self) -> StructArray: ... - @overload - def view(self, target_type: Literal["bool", "boolean"]) -> BooleanArray: ... - @overload - def view(self, target_type: Literal["i1", "int8"]) -> Int8Array: ... - @overload - def view(self, target_type: Literal["i2", "int16"]) -> Int16Array: ... - @overload - def view(self, target_type: Literal["i4", "int32"]) -> Int32Array: ... - @overload - def view(self, target_type: Literal["i8", "int64"]) -> Int64Array: ... - @overload - def view(self, target_type: Literal["u1", "uint8"]) -> UInt8Array: ... - @overload - def view(self, target_type: Literal["u2", "uint16"]) -> UInt16Array: ... - @overload - def view(self, target_type: Literal["u4", "uint32"]) -> UInt32Array: ... - @overload - def view(self, target_type: Literal["u8", "uint64"]) -> UInt64Array: ... 
- @overload - def view(self, target_type: Literal["f2", "halffloat", "float16"]) -> HalfFloatArray: ... - @overload - def view(self, target_type: Literal["f4", "float", "float32"]) -> FloatArray: ... - @overload - def view(self, target_type: Literal["f8", "double", "float64"]) -> DoubleArray: ... - @overload - def view(self, target_type: Literal["string", "str", "utf8"]) -> StringArray: ... - @overload - def view(self, target_type: Literal["binary"]) -> BinaryArray: ... - @overload - def view( - self, target_type: Literal["large_string", "large_str", "large_utf8"] - ) -> LargeStringArray: ... - @overload - def view(self, target_type: Literal["large_binary"]) -> LargeBinaryArray: ... - @overload - def view(self, target_type: Literal["date32", "date32[day]"]) -> Date32Array: ... - @overload - def view(self, target_type: Literal["date64", "date64[ms]"]) -> Date64Array: ... - @overload - def view(self, target_type: Literal["time32[s]", "time32[ms]"]) -> Time32Array: ... - @overload - def view(self, target_type: Literal["time64[us]", "time64[ns]"]) -> Time64Array: ... - @overload - def view( - self, - target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], - ) -> TimestampArray: ... - @overload - def view( - self, - target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], - ) -> DurationArray: ... - @overload - def view( - self, - target_type: Literal["month_day_nano_interval"], - ) -> MonthDayNanoIntervalArray: ... - @overload - def view(self, target_type: DataType[_T2]) -> Array[_T2, Scalar[_T2]]: ... - def __array__(self, dtype: DTypeLike = ...) -> NDArray: ... - def __eq__(self, other) -> bool: ... - @overload - def __getitem__(self, key: int) -> _Scalar: ... - @overload - def __getitem__(self: _Array, key: _builtin_slice) -> _Array: ... - def __iter__(self) -> Generator[_Scalar, None, None]: ... - def __len__(self) -> int: ... - def __sizeof__(self) -> int: ... - -class ArrowCancelled(ArrowException): - def __init__(self, message: str, signum: int = ...) -> None: ... - -class ArrowCapacityError(ArrowException): ... -class ArrowException(Exception): ... -class ArrowIndexError(IndexError, ArrowException): ... -class ArrowInvalid(ValueError, ArrowException): ... -class ArrowKeyError(KeyError, ArrowException): ... -class ArrowMemoryError(MemoryError, ArrowException): ... -class ArrowNotImplementedError(NotImplementedError, ArrowException): ... -class ArrowSerializationError(ArrowException): ... -class ArrowTypeError(TypeError, ArrowException): ... - -ArrowIOError = IOError - -class BaseExtensionType(DataType[_T]): - extension_name: str - storage_type: DataType[_T] - def __init__(self, *args, **kwargs) -> None: ... - @overload - def wrap_array( - self, storage: Array[_T2, _Scalar] - ) -> ExtensionArray[_T, ExtensionScalar[_T], Array[_T2, _Scalar]]: ... - @overload - def wrap_array( - self, storage: ChunkedArray[_T2, _Scalar] - ) -> ChunkedArray[_T, ExtensionScalar[_T]]: ... - -class BaseListArray(Array[list[_T], _Scalar]): - def __init__(self, *args, **kwargs) -> None: ... - def flatten(self) -> Array[_T, _Scalar]: ... - def value_lengths(self) -> Int32Array: ... - def value_parent_indices(self) -> Int64Array: ... - -class BinaryArray(Array[_T, BinaryScalar]): - total_values_length: int - -class BinaryScalar(Scalar[_T]): - def as_buffer(self) -> Buffer: ... - def as_py(self) -> _T: ... - -class BooleanArray(Array[bool, BooleanScalar]): - false_count: int - true_count: int - -class BooleanScalar(Scalar[bool]): ... 
- -class Buffer(_Weakrefable, _Buffer): - address: int - is_cpu: bool - is_mutable: bool - parent: Buffer | None - size: int - def equals(self, other) -> bool: ... - def hex(self) -> bytes: ... - def slice(self, offset: int = ..., length: int | None = ...) -> Buffer: ... - def to_pybytes(self) -> bytes: ... - def __eq__(self, other) -> bool: ... - @overload - def __getitem__(self, key: int) -> int: ... - @overload - def __getitem__(self, key: _builtin_slice) -> Buffer: ... - def __len__(self) -> int: ... - -class BufferOutputStream(NativeFile): - def getvalue(self) -> Buffer: ... - -class BufferReader(NativeFile): ... -class BufferedIOBase(_io._BufferedIOBase, io.IOBase): ... - -class BufferedInputStream(NativeFile): - def __init__( - self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = ... - ) -> None: ... - def detach(self) -> NativeFile: ... - -class BufferedOutputStream(NativeFile): - def __init__( - self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = ... - ) -> None: ... - def detach(self) -> NativeFile: ... - -class BuildInfo(NamedTuple): - build_type: str - compiler_flags: str - compiler_id: str - compiler_version: str - full_so_version: str - git_description: str - git_id: str - package_kind: str - so_version: str - version: str - version_info: str - -class ChunkedArray(_PandasConvertibleToSeries, Generic[_T, _Scalar]): - _name: str | None - chunks: list[Array[_T, _Scalar]] - nbytes: int - null_count: int - num_chunks: int - type: DataType[_T] - @property - def data(self: _ChunkedArray) -> _ChunkedArray: ... - def _to_pandas(self, options, types_mapper=..., **kwargs) -> Any: ... - def cast(self, target_type=..., safe=..., options=...) -> Any: ... - def chunk(self, i: int) -> Array[_T, _Scalar]: ... - def combine_chunks(self, memory_pool: MemoryPool | None = ...) -> Table: ... - def dictionary_encode(self: _ChunkedArray, null_encoding: str = ...) -> _ChunkedArray: ... - def drop_null(self: _ChunkedArray) -> _ChunkedArray: ... - def equals(self, other) -> bool: ... - def fill_null(self: _ChunkedArray, fill_value: _T) -> _ChunkedArray: ... - def filter( - self: _ChunkedArray, - mask: list[bool] | BooleanArray, - *, - null_selection_behavior: Literal["drop", "emit_null"] = ..., - ) -> _ChunkedArray: ... - def flatten(self: _ChunkedArray, memory_pool: MemoryPool | None = ...) -> _ChunkedArray: ... - def format(self, **kwargs) -> str: ... - def get_total_buffer_size(self) -> int: ... - def index( - self, - value: Scalar | object, - start: int | None = ..., - end: int | None = ..., - *, - memory_pool: MemoryPool | None = ..., - ) -> Int64Scalar: ... - def is_null(self) -> ChunkedArray[bool, BooleanScalar]: ... - def is_valid(self) -> ChunkedArray[bool, BooleanScalar]: ... - def iterchunks(self) -> Generator[Array[_T, _Scalar], None, None]: ... - def length(self) -> int: ... - def slice( - self: _ChunkedArray, offset: int = ..., length: int | None = ... - ) -> _ChunkedArray: ... - def take( - self: _ChunkedArray, - indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], - ) -> _ChunkedArray: ... - def to_numpy(self) -> NDArray: ... - def to_pylist(self) -> list[_T]: ... - def to_string( - self, - *, - indent: int = ..., - window: int = ..., - container_window: int = ..., - skip_new_lines: bool = ..., - ) -> str: ... - def unify_dictionaries( - self: _ChunkedArray, memory_pool: MemoryPool = ... - ) -> _ChunkedArray: ... - def unique(self) -> ChunkedArray[int, Int64Scalar]: ... 
- def validate(self, *, full: bool = ...) -> None: ... - def value_counts(self) -> StructArray: ... - def __array__(self, dtype: DTypeLike = ...) -> NDArray: ... - def __eq__(self, other) -> bool: ... - @overload - def __getitem__(self, key: int) -> _Scalar: ... - @overload - def __getitem__(self: _ChunkedArray, key: _builtin_slice) -> _ChunkedArray: ... - def __iter__(self) -> Generator[_Scalar, None, None]: ... - def __len__(self) -> int: ... - def __sizeof__(self) -> int: ... - -_COMPRESSION: TypeAlias = Literal[ - "gzip", "bz2", "brotli", "lz4" "lz4_frame", "lz4_raw", "zstd", "snappy" -] - -class Codec(_Weakrefable): - compression_level: int | None - name: str - def __init__( - self, - compression: _COMPRESSION, - compression_level: int | None = ..., - ) -> None: ... - @overload - def compress( - self, - buf: Buffer | bytes | memoryview, - memory_pool: MemoryPool | None = ..., - ) -> Buffer: ... - @overload - def compress( - self, - buf: Buffer | bytes | memoryview, - asbytes: Literal[True] = ..., - memory_pool: MemoryPool | None = ..., - ) -> bytes: ... - def decompress(self, buf, decompressed_size=..., asbytes=..., memory_pool=...) -> Any: ... - @staticmethod - def default_compression_level(compression: _COMPRESSION) -> int: ... - @staticmethod - def detect(path: str | PathLike) -> Codec: ... - @staticmethod - def is_available(compression: _COMPRESSION) -> bool: ... - @staticmethod - def maximum_compression_level(compression: _COMPRESSION) -> int: ... - @staticmethod - def minimum_compression_level(compression: _COMPRESSION) -> int: ... - @staticmethod - def supports_compression_level(compression: _COMPRESSION) -> bool: ... - -class CompressedInputStream(NativeFile): - def __init__( - self, - stream: str | PathLike | NativeFile | IOBase, - compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... - -class CompressedOutputStream(NativeFile): - def __init__( - self, - stream: str | PathLike | NativeFile | IOBase, - compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... - -class DataType(_Weakrefable, Generic[_T]): - bit_width: int - id: int - num_buffers: int - num_fields: int - def _export_to_c(self, out_ptr: int) -> None: ... - def _import_from_c(self, in_ptr: int) -> Any: ... - def equals(self, other) -> bool: ... - def field(self, i: int) -> Field: ... - def to_pandas_dtype(self) -> DTypeLike: ... - def __eq__(self, other) -> bool: ... - def __hash__(self) -> int: ... - -class Date32Array(NumericArray[dt.date, Date32Scalar]): ... -class Date32Scalar(Scalar[dt.date]): ... -class Date64Array(NumericArray[dt.date, Date64Scalar]): ... -class Date64Scalar(Scalar[dt.date]): ... -class Decimal128Array(FixedSizeBinaryArray): ... -class Decimal128Scalar(Scalar[Decimal]): ... - -class Decimal128Type(FixedSizeBinaryType): - precision: int - scale: int - -class Decimal256Array(FixedSizeBinaryArray): ... -class Decimal256Scalar(Scalar[Decimal]): ... - -class Decimal256Type(FixedSizeBinaryType): - precision: int - scale: int - -class DenseUnionType(UnionType): ... - -class DeserializationCallbackError(ArrowSerializationError): - def __init__(self, message: str, type_id) -> None: ... - -class DictionaryArray(Array[dict, DictionaryScalar]): - dictionary: Any - indices: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def dictionary_decode(self: _Array) -> _Array: ... - def dictionary_encode(self) -> DictionaryArray: ... 
# type: ignore - @staticmethod - def from_arrays( - indices: Array | NDArray | pd.Series, - dictionary: Array | NDArray | pd.Series, - mask: NDArray | pd.Series = ..., - ordered: bool = ..., - from_pandas: bool = ..., - safe: bool = ..., - memory_pool: MemoryPool = ..., - ) -> DictionaryArray: ... - @staticmethod - def from_buffers( # type: ignore - type: DataType, - length: int, - buffers: list[Buffer], - dictionary: Array | NDArray | pd.Series, - null_count: int = ..., - offset: int = ..., - ) -> DictionaryArray: ... - -class DictionaryMemo(_Weakrefable): ... - -class DictionaryScalar(Scalar[dict]): - dictionary: Any - index: Any - value: Any - -class DictionaryType(DataType): - index_type: Any - ordered: Any - value_type: Any - -class DoubleArray(FloatingPointArray[DoubleScalar]): ... -class DoubleScalar(Scalar[float]): ... -class DurationArray(NumericArray[dt.timedelta, DurationScalar]): ... - -class DurationScalar(Scalar[dt.timedelta]): - value: Any - -class DurationType(DataType[dt.timedelta]): - unit: Literal["s", "ms", "us", "ns"] - -_StorageArray = TypeVar("_StorageArray", bound=Array) - -class ExtensionArray(Array, Generic[_T, _Scalar, _StorageArray]): - storage: _StorageArray - @staticmethod - def from_storage( - typ: BaseExtensionType[_T], value: Array[_T, Scalar[_T]] - ) -> ExtensionArray[_T, _Scalar, Array[_T, Scalar[_T]]]: ... - -class ExtensionScalar(Scalar[_T]): - value: Scalar[_T] - @staticmethod - def from_storage(self, typ: BaseExtensionType[_T], value: object) -> ExtensionScalar[_T]: ... - -class ExtensionType(BaseExtensionType[_T]): - def __init__(self, storage_type: DataType[_T], extension_name: str) -> None: ... - def __arrow_ext_class__(self) -> type[ExtensionArray]: ... - @classmethod - def __arrow_ext_deserialize__( - cls, storage_type: DataType[_T], serialized - ) -> ExtensionType[_T]: ... - def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... - def __arrow_ext_serialize__(self) -> bytes: ... - def __eq__(self, other) -> bool: ... - -_Field = TypeVar("_Field", bound=Field) - -class Field(_Weakrefable, Generic[_T]): - metadata: dict - name: str - nullable: bool - type: DataType[_T] - def _export_to_c(self, out_ptr: int) -> None: ... - def _import_from_c(self, in_ptr: int) -> None: ... - def equals(self, other: Field, check_metadata: bool = ...) -> bool: ... - def flatten(self) -> list[Field]: ... - def remove_metadata(self: _Field) -> _Field: ... - def with_metadata(self: _Field, metadata: dict[str, str]) -> _Field: ... - def with_name(self: _Field, name: str) -> _Field: ... - def with_nullable(self: _Field, nullable) -> _Field: ... - def with_type(self, new_type: DataType[_T2]) -> Field[_T2]: ... - def __eq__(self, other) -> bool: ... - -class FixedSizeBinaryArray(Array[_T, FixedSizeBinaryScalar]): ... -class FixedSizeBinaryScalar(BinaryScalar[_T]): ... - -class FixedSizeBinaryType(DataType[_T]): - byte_width: int - -class FixedSizeBufferWriter(NativeFile): - def __init__(self, buffer: Buffer) -> None: ... - def set_memcopy_blocksize(self, blocksize: int) -> None: ... - def set_memcopy_threads(self, num_threads: int) -> None: ... - def set_memcopy_threshold(self, threshold: int) -> None: ... - -_Values = TypeVar("_Values", bound=Array) - -class FixedSizeListArray(BaseListArray, Generic[_T, _Scalar, _Values]): - values: _Values - @overload - @staticmethod - def from_arrays( - values: _Values, type: DataType[_T] | None = ... - ) -> FixedSizeListArray[_T, Scalar[_T], _Values]: ... 
- @overload - @staticmethod - def from_arrays( - values: Array[_T, Scalar[_T]], list_size: int | None = ... - ) -> FixedSizeListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ... - -class FixedSizeListScalar(ListScalar[_T]): ... - -class FixedSizeListType(DataType[list[_T]]): - list_size: int - value_field: Field[_T] - value_type: DataType[_T] - -class FloatArray(FloatingPointArray[FloatScalar]): ... -class FloatScalar(Scalar[float]): ... -class FloatingPointArray(NumericArray[float, _Scalar]): ... -class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): ... -class HalfFloatScalar(Scalar[float]): ... -class IOBase(_io._IOBase): ... -class Int16Array(IntegerArray[Int16Scalar]): ... -class Int16Scalar(Scalar[int]): ... -class Int32Array(IntegerArray[Int32Scalar]): ... -class Int32Scalar(Scalar[int]): ... -class Int64Array(IntegerArray[Int64Scalar]): ... -class Int64Scalar(Scalar[int]): ... -class Int8Array(IntegerArray[Int8Scalar]): ... -class Int8Scalar(Scalar[int]): ... -class IntegerArray(NumericArray[int, _Scalar]): ... - -class IpcReadOptions(_Weakrefable): - ensure_native_endian: bool - included_fields: list | None - use_threads: bool - def __init__( - self, - *, - use_threads: bool = ..., - ensure_native_endian: bool = ..., - include_fields: list | None = ..., - ) -> None: ... - -class IpcWriteOptions(_Weakrefable): - allow_64bit: bool - compression: str | Codec | None - emit_dictionary_deltas: bool - metadata_version: MetadataVersion - unify_dictionaries: bool - use_legacy_format: bool - use_threads: bool - def __init__( - self, - *, - metadata_version: MetadataVersion = ..., - allow_64bit: bool = ..., - use_legacy_format: bool = ..., - compression: str | Codec | None = ..., - use_threads: bool = ..., - emit_dictionary_details: bool = ..., - unify_dictionaries: bool = ..., - ) -> None: ... - -class KeyValueMetadata(_Metadata, collections.abc.Mapping): - def __init__(self, __arg0__: dict | None = ..., **kwargs) -> None: ... - def equals(self, other) -> bool: ... - def get_all(self, key: str) -> list: ... - def items(self) -> ItemsView[str, Any]: ... - def key(self, i: int) -> str: ... - def keys(self) -> KeysView[str]: ... - def to_dict(self) -> dict: ... - def value(self, i: int) -> Any: ... - def values(self) -> ValuesView[Any]: ... - def __contains__(self, other) -> bool: ... - def __eq__(self, other) -> bool: ... - def __getitem__(self, key: str) -> Any: ... - def __iter__(self) -> Generator[str, None, None]: ... - def __len__(self) -> int: ... - -class LargeBinaryArray(Array[bytes, LargeBinaryScalar]): - total_values_length: int - -class LargeBinaryScalar(BinaryScalar[bytes]): ... - -class LargeListArray(BaseListArray, Generic[_T, _Scalar, _Values]): - offsets: int - values: _Values - @staticmethod - @overload - def from_arrays( - offsets: Int64Array, - values: Array[_T, Scalar[_T]], - pool: MemoryPool | None = ..., - mask: bool | None = ..., - ) -> LargeListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ... - @staticmethod - @overload - def from_arrays( - offsets: Int64Array, - values: _Array, - type: DataType[_T], - pool: MemoryPool | None = ..., - mask: bool | None = ..., - ) -> LargeListArray[_T, Scalar[_T], _Array]: ... - -class LargeListScalar(ListScalar[_T]): ... 
- -class LargeListType(DataType[list[_T]]): - value_field: Field[_T] - value_type: DataType[_T] - -class LargeStringArray(Array[str, LargeStringScalar]): - @staticmethod - def from_buffers( # type: ignore - length: int, - value_offsets: Buffer, - data: Buffer, - null_bitmap: Buffer | None = ..., - null_count: int = ..., - offset: int = ..., - ) -> LargeStringArray: ... - -class LargeStringScalar(StringScalar): ... - -class ListArray(BaseListArray, Generic[_T, _Scalar, _Values]): - offsets: int - values: _Values - - @staticmethod - @overload - def from_arrays( - offsets: Int32Array, - values: Array[_T, Scalar[_T]], - pool: MemoryPool | None = ..., - mask: bool | None = ..., - ) -> ListArray[_T, Scalar[_T], Array[_T, Scalar[_T]]]: ... - @staticmethod - @overload - def from_arrays( - offsets: Int32Array, - values: _Array, - type: DataType[_T], - pool: MemoryPool | None = ..., - mask: bool | None = ..., - ) -> ListArray[_T, Scalar[_T], _Array]: ... - -class ListScalar(Scalar[list[_T]]): - values: list[_T] - -class ListType(DataType[list[_T]]): - value_field: Field[_T] - value_type: _T - -class LoggingMemoryPool(MemoryPool): ... - -_Key = TypeVar("_Key") -_Item = TypeVar("_Item") -class MapArray(ListArray[dict[_Key, _Item], MapScalar, StructArray], Generic[_Key, _Item]): - items: Array[_Item, Scalar[_Item]] - keys: Array[_Key, Scalar[_Key]] - - @staticmethod - def from_arrays( # type: ignore - offsets: Int32Array, - keys: Array[_Key, Scalar[_Key]] | list[_Key], - items: Array[_Item, Scalar[_Item]] | list[_Item], - pool: MemoryPool | None = ..., - ) -> MapArray[_Key, _Item]: ... - -class MapScalar(ListScalar[dict[_Key, _Item]]): ... - -class MapType(DataType[dict[_Key, _Item]]): - item_field: Field[_Item] - item_type: DataType[_Item] - key_field: Field[_Key] - key_type: DataType[_Key] - -class MemoryMappedFile(NativeFile): - def _open(self, path: str, mode: Literal["r", "r+", "w"] = ...) -> Any: ... - @staticmethod - def create(path: str, size: int) -> MemoryMappedFile: ... - def fileno(self) -> int: ... - def resize(self, new_size: int) -> None: ... - -class MemoryPool(_Weakrefable): - backend_name: str - def bytes_allocated(self) -> int: ... - def max_memory(self) -> int: ... - def release_unused(self) -> None: ... - -class Message(_Weakrefable): - body: Any - metadata: Any - metadata_version: MetadataVersion - type: str - def equals(self, other: Message) -> bool: ... - def serialize(self, alignment: int = ..., memory_pool: MemoryPool | None = ...) -> Any: ... - def serialize_to( - self, - sink: NativeFile, - alignment: int = ..., - memory_pool: MemoryPool | None = ..., - ) -> None: ... - -class MessageReader(_Weakrefable): - @staticmethod - def open_stream(source) -> MessageReader: ... - def read_next_message(self) -> Message: ... - def __iter__(self) -> Generator[Message, None, None]: ... - -class MetadataVersion(enum.IntEnum): - V1: ClassVar[importlib._bootstrap.MetadataVersion] = ... - V2: ClassVar[importlib._bootstrap.MetadataVersion] = ... - V3: ClassVar[importlib._bootstrap.MetadataVersion] = ... - V4: ClassVar[importlib._bootstrap.MetadataVersion] = ... - V5: ClassVar[importlib._bootstrap.MetadataVersion] = ... - -class MockOutputStream(NativeFile): - def size(self) -> int: ... 
+from .__lib_pxi.array import * +from .__lib_pxi.benchmark import * +from .__lib_pxi.builder import * +from .__lib_pxi.compat import * +from .__lib_pxi.config import * +from .__lib_pxi.device import * +from .__lib_pxi.error import * +from .__lib_pxi.io import * +from .__lib_pxi.ipc import * +from .__lib_pxi.memory import * +from .__lib_pxi.pandas_shim import * +from .__lib_pxi.scalar import * +from .__lib_pxi.table import * +from .__lib_pxi.tensor import * +from .__lib_pxi.types import * class MonthDayNano(NamedTuple): days: int months: int nanoseconds: int -class MonthDayNanoIntervalArray(Array[MonthDayNano, MonthDayNanoIntervalScalar]): ... - -class MonthDayNanoIntervalScalar(Scalar[MonthDayNano]): - value: MonthDayNano - -_NativeFile = TypeVar("_NativeFile", bound=NativeFile) - -class NativeFile(_Weakrefable): - _default_chunk_size: ClassVar[int] = ... - closed: bool - mode: Literal["rb", "wb", "rb+"] - def close(self) -> None: ... - def download( - self, stream_or_path: str | IOBase | NativeFile, buffer_size: int | None = ... - ) -> None: ... - def fileno(self) -> int: ... - def flush(self) -> None: ... - def get_stream(self: _NativeFile, file_offset: int, nbytes: int) -> _NativeFile: ... - def isatty(self) -> bool: ... - def metadata(self) -> dict: ... - def read(self, nbytes: int | None = ...) -> bytes: ... - def read1(self, nbytes: int | None = ...) -> bytes: ... - def read_at(self, nbytes: int, offset: int) -> bytes: ... - def read_buffer(self, nbytes: int | None = ...) -> Buffer: ... - def readable(self) -> bool: ... - def readall(self) -> bytes: ... - def readinto(self, b: Buffer | memoryview) -> int: ... - def readline(self, size: int = ...) -> bytes | None: ... - def readlines(self, hint: int = ...) -> list[bytes]: ... - def seek(self, position: int, whence: int = ...) -> None: ... - def seekable(self) -> bool: ... - def size(self) -> int: ... - def tell(self) -> int: ... - def truncate(self) -> None: ... - def upload(self, stream: IOBase | NativeFile, buffer_size: int = ...) -> None: ... - def writable(self) -> bool: ... - def write(self, data: bytes | memoryview | Buffer) -> int: ... - def writelines(self, lines: list[bytes]) -> None: ... - def __enter__(self: _NativeFile) -> _NativeFile: ... - def __exit__(self, exc_type, exc_value, tb) -> Any: ... - def __iter__(self: _NativeFile) -> _NativeFile: ... - def __next__(self) -> bytes: ... - -class NullArray(Array[None, NullScalar]): ... -class NullScalar(Scalar[None]): ... -class NumericArray(Array[_T, _Scalar]): ... -class OSFile(NativeFile): ... -class ProxyMemoryPool(MemoryPool): ... - -class PyExtensionType(ExtensionType[_T]): - def __init__(self, storage_type: DataType[_T]) -> None: ... - -class PythonFile(NativeFile): - def __init__( - self, handle: io.BytesIO, mode: Literal["rb", "wb", "rb+"] | None = ... - ) -> None: ... - -class ReadStats(importlib._bootstrap.ReadStats): ... - -class RecordBatch(_PandasConvertibleToDataFrame): - columns: list[Array] - nbytes: int - num_columns: int - num_rows: int - schema: Schema - def __init__(self, *args, **kwargs) -> None: ... - def column(self, i: int) -> Array: ... - def drop_null(self: _Self) -> _Self: ... - def equals(self, other: RecordBatch, check_metadata: bool = ...) -> bool: ... - def field(self, i: int) -> Field: ... - def filter( - self: _Self, - mask: list[bool] | BooleanArray, - null_selection_behavior: Literal["drop", "emit_null"] = ..., - ) -> _Self: ... 
- @staticmethod - @overload - def from_arrays( - arrays: list[Array], - *, - names: list[str], - metadata: dict | None = ..., - ) -> RecordBatch: ... - @staticmethod - @overload - def from_arrays( - arrays: list[Array], - *, - schema: list[Schema], - metadata: dict | None = ..., - ) -> RecordBatch: ... - @overload - @staticmethod - def from_pandas( - df: pd.DataFrame, - *, - preserve_index: bool | None = ..., - nthreads: int | None = ..., - ) -> RecordBatch: ... - @overload - @staticmethod - def from_pandas( - df: pd.DataFrame, - *, - schema: Schema, - preserve_index: bool | None = ..., - nthreads: int | None = ..., - ) -> RecordBatch: ... - @overload - @staticmethod - def from_pandas( - df: pd.DataFrame, - *, - columns: list[str], - preserve_index: bool | None = ..., - nthreads: int | None = ..., - ) -> RecordBatch: ... - @staticmethod - def from_pydict( - mapping: dict[str, Array | list], - schema: Schema | None = ..., - metadata: dict | None = ..., - ) -> RecordBatch: ... - @staticmethod - def from_struct_array(struct_array: StructArray) -> RecordBatch: ... - def get_total_buffer_size(self) -> int: ... - def replace_schema_metadata(self: _Self, metadata: dict | None = ...) -> _Self: ... - def serialize(self, memory_pool: MemoryPool | None = ...) -> Buffer: ... - def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... - def take( - self: _Self, - indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], - ) -> _Self: ... - def to_pydict(self) -> dict[str, list]: ... - def to_pylist(self) -> list[dict[str, Any]]: ... - def to_string(self, show_metadata: bool = ...) -> str: ... - def validate(self, *, full: bool = ...) -> None: ... - def __eq__(self, other) -> bool: ... - @overload - def __getitem__(self, key: str) -> Array: ... - @overload - def __getitem__(self: _Self, key: _builtin_slice) -> _Self: ... - def __len__(self) -> int: ... - def __sizeof__(self) -> int: ... - -class RecordBatchReader(_Weakrefable): - schema: Schema - def __init__(self, *args, **kwargs) -> None: ... - def _export_to_c(self, out_ptr: int) -> None: ... - @staticmethod - def _import_from_c(in_ptr: int) -> RecordBatchReader: ... - def close(self) -> None: ... - @staticmethod - def from_batches(schema: Schema, batches: Iterable[RecordBatch]) -> RecordBatchReader: ... - def read_all(self) -> Table: ... - def read_next_batch(self) -> RecordBatch: ... - def read_pandas(self, **options) -> pd.DataFrame: ... - def __enter__(self: _Self) -> _Self: ... - def __exit__(self, exc_type, exc_val, exc_tb) -> None: ... - def __iter__(self) -> Generator[RecordBatch, None, None]: ... - -class ResizableBuffer(Buffer): - def resize(self, new_size: int, shrink_to_fit: bool = ...) -> None: ... - -class RuntimeInfo(NamedTuple): - detected_simd_level: str - simd_level: str - -class Scalar(_Weakrefable, Generic[_T]): - is_valid: bool - type: DataType[_T] - def __init__(self) -> None: ... - def as_py(self) -> _T: ... - @overload - def cast(self, target_type: Literal["bool", "boolean"]) -> BooleanScalar: ... - @overload - def cast(self, target_type: Literal["i1", "int8"]) -> Int8Scalar: ... - @overload - def cast(self, target_type: Literal["i2", "int16"]) -> Int16Scalar: ... - @overload - def cast(self, target_type: Literal["i4", "int32"]) -> Int32Scalar: ... - @overload - def cast(self, target_type: Literal["i8", "int64"]) -> Int64Scalar: ... - @overload - def cast(self, target_type: Literal["u1", "uint8"]) -> UInt8Scalar: ... 
- @overload - def cast(self, target_type: Literal["u2", "uint16"]) -> UInt16Scalar: ... - @overload - def cast(self, target_type: Literal["u4", "uint32"]) -> UInt32Scalar: ... - @overload - def cast(self, target_type: Literal["u8", "uint64"]) -> UInt64Scalar: ... - @overload - def cast(self, target_type: Literal["f2", "halffloat", "float16"]) -> HalfFloatScalar: ... - @overload - def cast(self, target_type: Literal["f4", "float", "float32"]) -> FloatScalar: ... - @overload - def cast(self, target_type: Literal["f8", "double", "float64"]) -> DoubleScalar: ... - @overload - def cast(self, target_type: Literal["string", "str", "utf8"]) -> StringScalar: ... - @overload - def cast(self, target_type: Literal["binary"]) -> BinaryScalar: ... - @overload - def cast( - self, target_type: Literal["large_string", "large_str", "large_utf8"] - ) -> LargeStringScalar: ... - @overload - def cast(self, target_type: Literal["large_binary"]) -> LargeBinaryScalar: ... - @overload - def cast(self, target_type: Literal["date32", "date32[day]"]) -> Date32Scalar: ... - @overload - def cast(self, target_type: Literal["date64", "date64[ms]"]) -> Date64Scalar: ... - @overload - def cast(self, target_type: Literal["time32[s]", "time32[ms]"]) -> Time32Scalar: ... - @overload - def cast(self, target_type: Literal["time64[us]", "time64[ns]"]) -> Time64Scalar: ... - @overload - def cast( - self, - target_type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"], - ) -> TimestampScalar: ... - @overload - def cast( - self, - target_type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"], - ) -> DurationScalar: ... - @overload - def cast( - self, - target_type: Literal["month_day_nano_interval"], - ) -> MonthDayNanoIntervalScalar: ... - @overload - def cast(self, target_type: DataType) -> Scalar: ... - def equals(self, other: Scalar) -> bool: ... - def __eq__(self, other) -> bool: ... - -class Schema(_Weakrefable): - metadata: dict[bytes, bytes] | None - names: list[str] - pandas_metadata: dict[str, Any] | None - types: list[DataType] - def _export_to_c(self, out_ptr: int) -> None: ... - def _field(self, i: int) -> Field: ... - @staticmethod - def _import_from_c(in_ptr: int) -> Schema: ... - def add_metadata(self: _Self, metadata: dict[str | bytes, str | bytes]) -> _Self: ... - def append(self: _Self, field: Field) -> _Self: ... - def empty_table(self: _Self) -> _Self: ... - def equals(self, other: Schema, check_metadata: bool = ...) -> bool: ... - def field(self, i: int) -> Field: ... - def field_by_name(self, name: str) -> Field | None: ... - @classmethod - def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = ...) -> Schema: ... - def get_all_field_indices(self, name: str) -> list[int]: ... - def get_field_index(self, name: str) -> int: ... - def insert(self: _Self, i: int, field: Field) -> _Self: ... - def remove(self: _Self, i: int) -> _Self: ... - def remove_metadata(self: _Self) -> _Self: ... - def serialize(self, memory_pool: MemoryPool | None = ...) -> Buffer: ... - def set(self: _Self, i: int, field: Field) -> _Self: ... - def to_string( - self, - truncate_metadata: bool = ..., - show_field_metadata: bool = ..., - show_schema_metadata: bool = ..., - ) -> str: ... - def with_metadata(self: _Self, metadata: dict[str | bytes, str | bytes]) -> _Self: ... - def __eq__(self, other) -> bool: ... - def __getitem__(self, key: int) -> Field: ... - def __iter__(self) -> Generator[Field, None, None]: ... - def __len__(self) -> int: ... 
- def __sizeof__(self) -> int: ... - -class SerializationCallbackError(ArrowSerializationError): - def __init__(self, message: str, example_object) -> None: ... - -class SerializationContext(_Weakrefable): - def _deserialize_callback(self, serialized_obj: dict) -> Any: ... - def _serialize_callback(self, obj: Any) -> dict: ... - def clone(self: _Self) -> _Self: ... - def deserialize(self, what) -> Any: ... - def deserialize_components(self, what) -> Any: ... - def register_type( - self, - type_: type, - type_id: str, - pickle: bool = ..., - custom_serializer: Callable[[Any], bytes] | None = ..., - custom_deserializer: Callable[[bytes], Any] | None = ..., - ) -> Any: ... - def serialize(self, obj: Any) -> Any: ... - def serialize_to(self, value, sink) -> Any: ... - def set_pickle( - self, serializer: Callable[[Any], bytes], deserializer: Callable[[bytes], Any] - ) -> None: ... - -class SerializedPyObject(_Weakrefable): - base: Any - total_bytes: int - - def deserialize(self, context: SerializationContext | None = ...) -> Any: ... - @staticmethod - def from_components(components: dict[str, Any]) -> SerializedPyObject: ... - def to_buffer(self, nthreads: int = ...) -> Buffer: ... - def to_components(self, memory_pool: MemoryPool | None = ...) -> dict[str, Any]: ... - def write_to(self, sink) -> Any: ... - -class SignalStopHandler: - stop_token: StopToken - def _init_signals(self) -> Any: ... - def __enter__(self: _Self) -> _Self: ... - def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... - -class SparseCOOTensor(_Weakrefable, Generic[_T]): - dim_names: tuple[str, ...] - has_canonical_format: bool - is_mutable: bool - ndim: int - non_zero_length: int - shape: tuple[int, ...] - size: int - type: DataType[_T] - def dim_name(self, i: int) -> str: ... - def equals(self, other: SparseCOOTensor) -> bool: ... - @classmethod - def from_dense_numpy( - cls, obj: NDArray, dim_names: list[str] | None = ... - ) -> SparseCOOTensor: ... - @staticmethod - def from_numpy( - data: NDArray, coords: NDArray, shape: tuple, dim_names: list[str] | None = ... - ) -> SparseCOOTensor: ... - @staticmethod - def from_pydata_sparse(obj, dim_names: list[str] | None = ...) -> SparseCOOTensor: ... - @staticmethod - def from_scipy(obj, dim_names: list[str] | None = ...) -> SparseCOOTensor: ... - @staticmethod - def from_tensor(self, obj: Tensor[_T]) -> SparseCOOTensor[_T]: ... - def to_numpy(self) -> NDArray: ... - def to_pydata_sparse(self) -> Any: ... - def to_scipy(self) -> Any: ... - def to_tensor(self) -> Tensor[_T]: ... - def __eq__(self, other) -> bool: ... - -class SparseCSCMatrix(_Weakrefable, Generic[_T]): - dim_names: tuple[str, ...] - is_mutable: bool - ndim: int - non_zero_length: int - shape: tuple[int, ...] - size: int - type: DataType[_T] - def dim_name(self, i: int) -> str: ... - def equals(self, other: SparseCSCMatrix) -> bool: ... - @classmethod - def from_dense_numpy( - cls, obj: NDArray, dim_names: list[str] | None = ... - ) -> SparseCSCMatrix: ... - @staticmethod - def from_numpy( - data: NDArray, - indptr: NDArray, - indices: NDArray, - shape: tuple[int, ...], - dim_names: list[str] | None = ..., - ) -> SparseCSCMatrix: ... - def from_scipy(self, obj, dim_names: list[str] | None = ...) -> SparseCSCMatrix: ... - def from_tensor(self, obj: Tensor[_T]) -> SparseCSCMatrix[_T]: ... - def to_numpy(self) -> NDArray: ... - def to_scipy(self) -> Any: ... - def to_tensor(self) -> Tensor[_T]: ... - def __eq__(self, other) -> bool: ... 
- -class SparseCSFTensor(_Weakrefable, Generic[_T]): - dim_names: tuple[str, ...] - is_mutable: bool - ndim: int - non_zero_length: int - shape: tuple[int, ...] - size: int - type: DataType[_T] - def dim_name(self, i: int) -> str: ... - def equals(self, other: SparseCSFTensor) -> bool: ... - @staticmethod - def from_dense_numpy(obj: NDArray, dim_names: list[str] | None = ...) -> SparseCSFTensor: ... - @staticmethod - def from_numpy( - data: NDArray, - indptr: NDArray, - indices: NDArray, - shape: tuple[int, ...], - axis_order: list[str] | None = ..., - dim_names=..., - ) -> SparseCSFTensor: ... - @staticmethod - def from_tensor(obj: Tensor[_T]) -> SparseCSFTensor[_T]: ... - def to_numpy(self) -> NDArray: ... - def to_tensor(self) -> Tensor[_T]: ... - def __eq__(self, other) -> bool: ... - -class SparseCSRMatrix(_Weakrefable, Generic[_T]): - dim_names: tuple[str, ...] - is_mutable: bool - ndim: int - non_zero_length: int - shape: tuple[int, ...] - size: int - type: DataType[_T] - def dim_name(self, i: int) -> str: ... - def equals(self, other: SparseCSRMatrix) -> bool: ... - @classmethod - def from_dense_numpy( - cls, obj: NDArray, dim_names: list[str] | None = ... - ) -> SparseCSRMatrix: ... - @staticmethod - def from_numpy( - data: NDArray, - indptr: NDArray, - indices: NDArray, - shape: tuple[int, ...], - dim_names: list[str] | None = ..., - ) -> SparseCSRMatrix: ... - def from_scipy(self, obj, dim_names: list[str] | None = ...) -> SparseCSRMatrix: ... - def from_tensor(self, obj: Tensor[_T]) -> SparseCSRMatrix[_T]: ... - def to_numpy(self) -> NDArray: ... - def to_scipy(self) -> Any: ... - def to_tensor(self) -> Tensor[_T]: ... - def __eq__(self, other) -> bool: ... - -class SparseUnionType(UnionType): ... -class StopToken: ... - -class StringArray(Array[str, StringScalar]): - @staticmethod - def from_buffers( # type: ignore - length: int, - value_offsets: Buffer, - data: Buffer, - null_bitmap: Buffer | None = ..., - null_count: int = ..., - offset: int = ..., - ) -> StringArray: ... - -class StringBuilder(_Weakrefable): - null_count: int - def __init__(self, memory_pool: MemoryPool | None = ...) -> None: ... - def append(self, value: str | bytes) -> None: ... - def append_values(self, values: list[str | bytes]) -> None: ... - def finish(self) -> StringArray: ... - def __len__(self) -> int: ... - -class StringScalar(BinaryScalar[str]): ... - -class StructArray(Array[dict, StructScalar]): - def field(self, index: int | str) -> Int64Array: ... - def flatten(self, memory_pool: MemoryPool | None = ...) -> list[Array]: ... - @staticmethod - def from_arrays( - arrays: Array, - names: list[str] | None = ..., - fields: list[Field] | None = ..., - mask: BooleanArray | None = ..., - memory_pool: MemoryPool | None = ..., - ) -> StructArray: ... - -class StructScalar(Scalar, collections.abc.Mapping): - def _as_py_tuple(self) -> Any: ... - def as_py(self) -> dict: ... - def items(self) -> ItemsView[str, Any]: ... - def __contains__(self, other) -> bool: ... - def __getitem__(self, index) -> Scalar: ... - def __iter__(self) -> Generator[str, None, None]: ... - def __len__(self) -> int: ... - -class StructType(DataType): - def field(self, i: int) -> Field: ... - def get_all_field_indices(self, name: str) -> list[int]: ... - def get_field_index(self, name: str) -> int: ... - def __getitem__(self, index) -> Field: ... - def __iter__(self) -> Generator[Field, None, None]: ... - def __len__(self) -> int: ... 
- -class Table(_PandasConvertibleToDataFrame): - column_names: list[str] - columns: list[Array] - nbytes: int - num_columns: int - num_rows: int - schema: Schema - shape: tuple[int, ...] - def _column(self, i: int) -> Any: ... - def _ensure_integer_index(self, i) -> Any: ... - def _to_pandas( - self, options, categories=..., ignore_metadata=..., types_mapper=... - ) -> Any: ... - def add_column(self: _Self, i: int, field_: str | Field, column: Array) -> _Self: ... - def append_column(self: _Self, field_: str | Field, column: Array) -> _Self: ... - def cast( - self, - target_schema: Schema, - safe: bool | None = ..., - options: CastOptions | None = ..., - ) -> Table: ... - def column(self, i: int | str) -> ChunkedArray: ... - def combine_chunks(self: _Self, memory_pool: MemoryPool | None = ...) -> _Self: ... - def drop_null(self: _Self) -> _Self: ... - def equals(self, other: Table, check_metadata: bool = ...) -> bool: ... - def field(self, i: int) -> Field: ... - def filter( - self: _Self, - mask: list[bool] | BooleanArray, - null_selection_behavior: Literal["drop", "emit_null"] = ..., - ) -> _Self: ... - def flatten(self, memory_pool: MemoryPool | None = ...) -> Table: ... - @staticmethod - def from_arrays( - arrays: list[Array], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: dict[str | bytes, str | bytes] | None = None, - ) -> Table: ... - @staticmethod - def from_batches( - batches: collections.abc.Iterable[RecordBatch], schema: Schema | None = None - ) -> Table: ... - @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = ..., - preserve_index: bool | None = ..., - nthreads: int | None = ..., - columns: list[str] | None = ..., - safe: bool = ..., - ) -> Table: ... - @staticmethod - def from_pydict( - mapping: dict, - schema: Schema | None = ..., - metadata: dict[str | bytes, str | bytes] | None = ..., - ) -> Table: ... - @staticmethod - def from_pylist( - mapping: list[dict], - schema: Schema | None = ..., - metadata: dict[str | bytes, str | bytes] | None = ..., - ) -> Table: ... - def get_total_buffer_size(self) -> int: ... - def group_by(self, keys: list[str]) -> TableGroupBy: ... - def itercolumns(self) -> Generator[ChunkedArray, None, None]: ... - def join( - self, - right_table: Table, - keys: str | list[str], - right_keys: str | list[str] | None = ..., - join_type: Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", - ] = ..., - left_suffix: str | None = ..., - right_suffix: str | None = ..., - coalesce_keys: bool = ..., - use_threads: bool = ..., - ) -> Table: ... - def remove_column(self: _Self, i: int) -> _Self: ... - def replace_schema_metadata( - self: _Self, metadata: dict[str | bytes, str | bytes] | None = ... - ) -> _Self: ... - def select(self, columns: list[str]) -> Table: ... - def set_column(self: _Self, i: int, field_: str | Field, column: Array) -> _Self: ... - def slice(self: _Self, offset: int = ..., length: int | None = ...) -> _Self: ... - def sort_by( - self, - sorting: Literal["ascending", "descending"] - | list[tuple[str, Literal["ascending", "descending"]]], - ) -> Table: ... - def take( - self: _Self, - indices: list[int] | IntegerArray | NDArray[np.signedinteger | np.unsignedinteger], - ) -> _Self: ... - def to_batches(self, max_chunksize: int | None = ...) -> list[RecordBatch]: ... - def to_pylist(self) -> list[dict]: ... - def to_reader(self, max_chunksize: int | None = ...) -> RecordBatchReader: ... 
- def to_string(self, *, show_metadata: bool = ..., preview_cols: int = ...) -> str: ... - def unify_dictionaries(self: _Self, memory_pool: MemoryPool | None = ...) -> _Self: ... - def validate(self, *, full: bool = ...) -> None: ... - def __eq__(self, other) -> bool: ... - @overload - def __getitem__(self, index: int | str) -> ChunkedArray: ... - @overload - def __getitem__(self, index: _builtin_slice) -> Table: ... - def __len__(self) -> int: ... - def __sizeof__(self) -> int: ... - -class TableGroupBy: - def __init__(self, table: Table, keys: str | list[str]) -> None: ... - def aggregate( - self, aggregations: list[tuple[str, str] | tuple[str, str, FunctionOptions]] - ) -> Table: ... - -class Tensor(_Weakrefable, Generic[_T]): - dim_names: list[str] - is_contiguous: bool - is_mutable: bool - ndim: int - shape: tuple[int, ...] - size: int - strides: tuple[int, ...] - type: DataType[_T] - def dim_name(self, i: int) -> str: ... - def equals(self, other: Tensor) -> bool: ... - @staticmethod - def from_numpy(obj: NDArray, dim_names: list[str] | None = ...) -> Tensor: ... - def to_numpy(self) -> NDArray: ... - def __eq__(self, other) -> bool: ... - -class TextIOBase(_io._TextIOBase, io.IOBase): ... -class Time32Array(NumericArray[dt.time, Time32Scalar]): ... -class Time32Scalar(Scalar[dt.time]): ... - -class Time32Type(DataType[dt.time]): - unit: str - -class Time64Array(NumericArray[dt.time, Time64Scalar]): ... -class Time64Scalar(Scalar[dt.time]): ... - -class Time64Type(DataType[dt.time]): - unit: Any - -class TimestampArray(NumericArray[dt.datetime, TimestampScalar]): ... - -class TimestampScalar(Scalar[dt.datetime]): - value: int - -class TimestampType(DataType[dt.datetime]): - tz: Any - unit: str - def to_pandas_dtype(self) -> DTypeLike: ... - -class Transcoder: - def __init__(self, decoder, encoder) -> None: ... - def __call__(self, buf) -> Any: ... - -class TransformInputStream(NativeFile): ... -class UInt16Array(IntegerArray[UInt16Scalar]): ... -class UInt16Scalar(Scalar[int]): ... -class UInt32Array(IntegerArray[UInt32Scalar]): ... -class UInt32Scalar(Scalar[int]): ... -class UInt64Array(IntegerArray[UInt64Scalar]): ... -class UInt64Scalar(Scalar[int]): ... -class UInt8Array(IntegerArray[UInt8Scalar]): ... -class UInt8Scalar(Scalar[int]): ... - -class UnionArray(Array[Any, UnionScalar]): - offsets: Int32Array - type_codes: Int8Array - def child(self, pos: int) -> Array: ... - def field(self, pos: int) -> Array: ... - @staticmethod - def from_dense( - types: Int8Array, - value_offsets: Int32Array, - children: list, - field_names: list[str] | None = ..., - type_codes: list | None = ..., - ) -> UnionArray: ... - @staticmethod - def from_sparse( - types: Int8Array, - children: list, - field_names: list[str] | None = ..., - type_codes: list | None = ..., - ) -> UnionArray: ... - -class UnionScalar(Scalar): - type_code: Any - value: Any - -class UnionType(DataType): - mode: Any - type_codes: Any - def field(self, i) -> Field: ... - def __getitem__(self, index) -> Any: ... - def __iter__(self) -> Any: ... - def __len__(self) -> int: ... - -class UnknownExtensionType(PyExtensionType): - def __arrow_ext_serialize__(self) -> Any: ... - -class UnsupportedOperation(OSError, ValueError): ... - -class VersionInfo(NamedTuple): - major: str - minor: str - patch: str - -class WriteStats(importlib._bootstrap.WriteStats): - __slots__: ClassVar[tuple] = ... - -class _CRecordBatchWriter(_Weakrefable): - stats: Any - def close(self) -> None: ... 
- def write(self, table_or_batch: RecordBatch | Table) -> None: ... - def write_batch(self, batch: RecordBatch) -> None: ... - def write_table(self, table: Table, max_chunksize: int | None = ...) -> None: ... - def __enter__(self) -> _CRecordBatchWriter: ... - def __exit__(self, exc_type, exc_val, exc_tb) -> Any: ... - -class _ExtensionRegistryNanny(_Weakrefable): - def release_registry(self) -> None: ... - -class _Metadata(_Weakrefable): ... - -class _PandasAPIShim: - _array_like_types: Any - _categorical_type: Any - _compat_module: Any - _data_frame: Any - _datetimetz_type: Any - _extension_array: Any - _extension_dtype: Any - _index: Any - _is_extension_array_dtype: Any - _loose_version: Any - _pd: Any - _pd024: Any - _series: Any - _types_api: Any - _version: Any - categorical_type: Any - compat: Any - datetimetz_type: Any - extension_dtype: Any - has_sparse: Any - have_pandas: Any - loose_version: Any - pd: Any - version: Any - def __init__(self) -> None: ... - def assert_frame_equal(self, *args, **kwargs) -> Any: ... - def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... - def get_rangeindex_attribute(self, level, name) -> Any: ... - def get_values(self, obj) -> Any: ... - def infer_dtype(self, obj) -> Any: ... - def is_array_like(self, obj) -> bool: ... - def is_categorical(self, obj) -> TypeGuard[pd.Categorical]: ... - def is_data_frame(self, obj) -> TypeGuard[pd.DataFrame]: ... - def is_datetimetz(self, obj) -> bool: ... - def is_extension_array_dtype(self, obj) -> bool: ... - def is_index(self, obj) -> TypeGuard[pd.Index]: ... - def is_series(self, obj) -> TypeGuard[pd.Series]: ... - def is_sparse(self, obj) -> bool: ... - def pandas_dtype(self, dtype: DTypeLike) -> DTypeLike: ... - def series(self, *args, **kwargs) -> pd.Series: ... - -class _PandasConvertibleToDataFrame(_Weakrefable): - def to_pandas( - self, - memory_pool: MemoryPool | None = ..., - categories: list[pd.Categorical] | None = ..., - strings_to_categorical: bool | None = ..., - zero_copy_only: bool | None = ..., - integer_object_nulls: bool | None = ..., - date_as_object: bool | None = ..., - timestamp_as_object: bool | None = ..., - use_threads: bool | None = ..., - deduplicate_objects: bool | None = ..., - ignore_metadata: bool | None = ..., - safe: bool | None = ..., - split_blocks: bool | None = ..., - self_destruct: bool | None = ..., - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] | None = ..., - ) -> pd.DataFrame: ... - -class _PandasConvertibleToSeries(_Weakrefable): - def to_pandas( - self, - memory_pool: MemoryPool | None = ..., - categories: list[pd.Categorical] | None = ..., - strings_to_categorical: bool | None = ..., - zero_copy_only: bool | None = ..., - integer_object_nulls: bool | None = ..., - date_as_object: bool | None = ..., - timestamp_as_object: bool | None = ..., - use_threads: bool | None = ..., - deduplicate_objects: bool | None = ..., - ignore_metadata: bool | None = ..., - safe: bool | None = ..., - split_blocks: bool | None = ..., - self_destruct: bool | None = ..., - types_mapper: Callable[[DataType], pd.api.extensions.ExtensionDtype] | None = ..., - ) -> pd.Series: ... - -class _ReadPandasMixin: - def read_pandas(self, **options) -> Any: ... 
- -class _ReadStats(NamedTuple): - num_dictionary_batches: int - num_dictionary_deltas: int - num_messages: int - num_record_batches: int - num_replaced_dictionaries: int - -class _RecordBatchFileReader(_Weakrefable): - num_record_batches: Any - schema: Any - stats: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _open( - self, - source, - footer_offset=..., - IpcReadOptionsoptions=..., - MemoryPoolmemory_pool=..., - ) -> Any: ... - def get_batch(self, inti) -> Any: ... - def get_record_batch(self, *args, **kwargs) -> Any: ... - def read_all(self) -> Any: ... - def read_pandas(self, **options) -> Any: ... - def __enter__(self) -> Any: ... - def __exit__(self, exc_type, exc_value, traceback) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RecordBatchFileWriter(_RecordBatchStreamWriter): - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _open(self, sink, Schemaschema, IpcWriteOptionsoptions=...) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _RecordBatchStreamReader(RecordBatchReader): - stats: Any - def _open(self, source, IpcReadOptionsoptions=..., MemoryPoolmemory_pool=...) -> Any: ... - def __next__(self) -> RecordBatch: ... - -class _RecordBatchStreamWriter(_CRecordBatchWriter): - _metadata_version: Any - _use_legacy_format: Any - @classmethod - def __init__(self, *args, **kwargs) -> None: ... - def _open(self, sink, Schemaschema, IpcWriteOptionsoptions=...) -> Any: ... - def __reduce__(self) -> Any: ... - def __setstate__(self, state) -> Any: ... - -class _Weakrefable: ... - -class _WriteStats(NamedTuple): - num_dictionary_batches: int - num_dictionary_deltas: int - num_messages: int - num_record_batches: int - num_replaced_dictionaries: int - -class ordered_dict: - def __init__(self, *args, **kwargs) -> None: ... - def clear(self, *args, **kwargs) -> Any: ... - def copy(self) -> dict: ... - @classmethod - def fromkeys(cls, *args, **kwargs) -> Any: ... - def get(self, *args, **kwargs) -> Any: ... - def items(self, *args, **kwargs) -> Any: ... - def keys(self, *args, **kwargs) -> Any: ... - def pop(self, *args, **kwargs) -> Any: ... - def popitem(self, *args, **kwargs) -> Any: ... - def setdefault(self, *args, **kwargs) -> Any: ... - def update(self, *args, **kwargs) -> Any: ... - def values(self, *args, **kwargs) -> Any: ... - @classmethod - def __class_getitem__(cls, *args, **kwargs) -> Any: ... - def __contains__(self, other) -> Any: ... - def __delitem__(self, other) -> Any: ... - def __eq__(self, other) -> Any: ... - def __ge__(self, other) -> Any: ... - def __getitem__(self, y) -> Any: ... - def __gt__(self, other) -> Any: ... - def __ior__(self, other) -> Any: ... - def __iter__(self) -> Any: ... - def __le__(self, other) -> Any: ... - def __len__(self) -> Any: ... - def __lt__(self, other) -> Any: ... - def __ne__(self, other) -> Any: ... - def __or__(self, other) -> Any: ... - def __reversed__(self) -> Any: ... - def __ror__(self, other) -> Any: ... - def __setitem__(self, index, object) -> Any: ... - def __sizeof__(self) -> Any: ... - -def __pyx_unpickle_SerializationContext(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle__PandasAPIShim(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle__PandasConvertible(__pyx_type, long__pyx_checksum, __pyx_state) -> Any: ... -def __pyx_unpickle___Pyx_EnumMeta(*args, **kwargs) -> Any: ... 
-def _datetime_from_int(int64_tvalue, TimeUnitunit, tzinfo=...) -> Any: ... -def _deprecate_serialization(name) -> Any: ... -def _deserialize(obj, SerializationContextcontext=...) -> Any: ... -def _detect_compression(path) -> Any: ... -def _empty_array(DataTypetype) -> Any: ... -def _from_pydict(cls, mapping, schema, metadata) -> Any: ... -def _from_pylist(cls, mapping, schema, metadata) -> Any: ... -def _gdb_test_session() -> Any: ... -def _get_default_context() -> Any: ... -def _handle_arrow_array_protocol(obj, type, mask, size) -> Any: ... -def _is_primitive(Typetype) -> Any: ... -def _ndarray_to_arrow_type(values, DataTypetype) -> Any: ... -def _normalize_slice(arrow_obj, slicekey) -> Any: ... -def _pc() -> Any: ... -def _read_serialized(source, base=...) -> Any: ... -def _reconstruct_record_batch(columns, schema) -> Any: ... -def _reconstruct_table(arrays, schema) -> Any: ... -def _register_py_extension_type() -> Any: ... -def _restore_array(data) -> Any: ... -def _serialize(value, SerializationContextcontext=...) -> Any: ... -def _unregister_py_extension_types() -> Any: ... -@overload -def allocate_buffer( - size: int, - memory_pool: MemoryPool | None = ..., -) -> Buffer: ... -@overload -def allocate_buffer( - size: int, memory_pool: MemoryPool | None = ..., *, resizable: Literal[True] -) -> ResizableBuffer: ... -@overload -def allocate_buffer( - size: int, memory_pool: MemoryPool | None = ..., *, resizable: Literal[False] -) -> Buffer: ... -def array( - obj: Iterable | NDArray | pd.Series, - type: DataType | None = ..., - mask: list[bool] | BooleanArray | None = ..., - size: int | None = ..., - from_pandas: bool | None = ..., - safe: bool = ..., - memory_pool: MemoryPool | None = ..., -) -> Array | ChunkedArray: ... -def as_buffer(o) -> Buffer: ... -def asarray(values: Iterable, type: DataType | None = ...) -> Array: ... -def benchmark_PandasObjectIsNull(obj: list) -> Any: ... -def binary(length: int = ...) -> DataType[bytes]: ... -def bool_() -> DataType[bool]: ... -@overload -def chunked_array(arrays: Array[_T, _Scalar]) -> ChunkedArray[_T, _Scalar]: ... -@overload -def chunked_array(arrays: Array, type: DataType[_T]) -> ChunkedArray[_T, Scalar[_T]]: ... -@overload -def compress( - buf: Buffer | bytes | memoryview, - codec: str = ..., - *, - memory_pool: MemoryPool | None = ..., -) -> Buffer: ... -@overload -def compress( - buf: Buffer | bytes | memoryview, - codec: str = ..., - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = ..., -) -> bytes: ... -def concat_arrays(arrays: list[_Array], memory_pool: MemoryPool | None = ...) -> _Array: ... -def concat_tables( - tables: list[Table], promote: bool = ..., memory_pool: MemoryPool | None = ... -) -> Table: ... def cpu_count() -> int: ... -def create_memory_map(path: str | PathLike, size: int) -> MemoryMappedFile: ... -def date32() -> DataType[dt.date]: ... -def date64() -> DataType[dt.date]: ... -def decimal128(precision: int, scale: int | None = ...) -> DataType[Decimal]: ... -def decimal256(precision: int, scale: int | None = ...) -> DataType[Decimal]: ... -@overload -def decompress( - buf: Buffer | bytes | memoryview, - decompressed_size: int | None = ..., - codec: str = ..., - *, - memory_pool: MemoryPool | None = ..., -) -> Buffer: ... -@overload -def decompress( - buf: Buffer | bytes | memoryview, - decompressed_size: int | None = ..., - codec: str = ..., - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = ..., -) -> bytes: ... -def default_memory_pool() -> MemoryPool: ... 
-def dense_union( - child_fields: list[Field], type_codes: list[int] | None = ... -) -> DenseUnionType: ... -def deserialize(obj, context: SerializationContext = ...) -> object: ... -def deserialize_components(components: dict, context: SerializationContext = ...) -> object: ... -def deserialize_from( - source: NativeFile, base: object, context: SerializationContext = ... -) -> object: ... -def dictionary( - index_type: DataType, value_type: DataType, ordered: bool = ... -) -> DictionaryType: ... -def duration(unit: Literal["s", "ms", "us", "ns"]) -> DurationType: ... -def enable_signal_handlers(enable: bool) -> None: ... -def encode_file_path(path: str) -> bytes: ... -def ensure_metadata(meta: dict, allow_none: bool = ...) -> KeyValueMetadata: ... -def ensure_type(ty: DataType, allow_none=...) -> DataType: ... -def field( - name: str | bytes, - type: DataType[_T], - nullable: bool = ..., - metadata: dict | None = ..., -) -> Field[_T]: ... -def float16() -> DataType[float]: ... -def float32() -> DataType[float]: ... -def float64() -> DataType[float]: ... -def foreign_buffer(address: int, size: int, base: object | None = ...) -> Buffer: ... -def from_numpy_dtype(dtype: DTypeLike) -> DataType: ... -def frombytes(o: bytes, *, safe: bool = ...) -> str: ... -def get_record_batch_size(batch: RecordBatch) -> int: ... -def get_tensor_size(tensor: Tensor) -> int: ... -def infer_type( - values: Iterable, mask: list[bool] | BooleanArray = ..., from_pandas: bool = ... -) -> DataType: ... -def input_stream( - source: str | PathLike | Buffer | IOBase | NativeFile, - compression: str | None = ..., - buffer_size: int | None = ..., -) -> NativeFile: ... -def int16() -> DataType[int]: ... -def int32() -> DataType[int]: ... -def int64() -> DataType[int]: ... -def int8() -> DataType[int]: ... -def io_thread_count() -> int: ... -def is_boolean_value(obj: Any) -> bool: ... -def is_float_value(obj: Any) -> bool: ... -def is_integer_value(obj: Any) -> bool: ... -def is_named_tuple(cls: Any) -> bool: ... -def jemalloc_memory_pool() -> Any: ... -def jemalloc_set_decay_ms(decay_ms: int) -> None: ... -def large_binary() -> DataType[bytes]: ... -def large_list(value_type: DataType[_T] | Field[_T]) -> LargeListType[_T]: ... -def large_string() -> DataType[str]: ... -def large_utf8() -> DataType[str]: ... -def list_(value_type: DataType[_T] | Field[_T], list_size: int = ...) -> ListType[_T]: ... -def log_memory_allocations(enable: bool = ...) -> None: ... -def logging_memory_pool(parent: MemoryPool) -> MemoryPool: ... -def map_( - key_type: DataType[_Key], item_type: DataType[_Item], keys_sorted: bool = ... -) -> MapType[_Key, _Item]: ... -def memory_map(path: str, mode: Literal["r", "r+", "w"] = ...) -> MemoryMappedFile: ... -def mimalloc_memory_pool() -> MemoryPool: ... -def month_day_nano_interval() -> DataType[MonthDayNano]: ... -def null() -> DataType[None]: ... -def nulls( - size: int, type: DataType[_T] = ..., memory_pool: MemoryPool | None = ... -) -> Array[_T, Scalar[_T]]: ... -def output_stream( - source: str | PathLike | Buffer | IOBase | memoryview | NativeFile, - compression: str | None = ..., - buffer_size: int = ..., -) -> NativeFile: ... -def proxy_memory_pool(parent: MemoryPool) -> MemoryPool: ... -def py_buffer(obj: bytes | memoryview) -> Buffer: ... -def read_message( - source: NativeFile | IOBase | memoryview | Buffer, -) -> Message: ... -def read_record_batch( - obj: Message | Buffer | memoryview, - schema: Schema, - dictionary_memo: DictionaryMemo | None = ..., -) -> RecordBatch: ... 
-def read_schema( - obj: Buffer | Message | memoryview, dictionary_memo: DictionaryMemo | None = ... -) -> Schema: ... -def read_serialized(source: NativeFile, base: object | None = ...) -> object: ... -def read_tensor(source: NativeFile) -> Tensor: ... -@overload -def record_batch( - data: pd.DataFrame, - schema: Schema | None = ..., - metadata: dict | None = ..., -) -> RecordBatch: ... -@overload -def record_batch( - data: list[Array | ChunkedArray], - names: list[str], - metadata: dict | None = ..., -) -> RecordBatch: ... -@overload -def record_batch( - data: list[Array | ChunkedArray], - schema: Schema, - metadata: dict | None = ..., -) -> RecordBatch: ... -def register_extension_type(ext_type: BaseExtensionType) -> None: ... -def repeat(value, size: int, memory_pool: MemoryPool | None = ...) -> Array: ... -def runtime_info() -> RuntimeInfo: ... -def scalar( - value: Any, - type: DataType[_T], - *, - from_pandas: bool | None = ..., - memory_pool: MemoryPool | None = ..., -) -> Scalar[_T]: ... -def schema(fields: Iterable[Field], metadata: dict | None = ...) -> Schema: ... -def serialize(value: object, context: SerializationContext | None = ...) -> object: ... -def serialize_to( - value: object, sink: NativeFile | IOBase, context: SerializationContext | None = ... -) -> None: ... def set_cpu_count(count: int) -> None: ... -def set_io_thread_count(count: int) -> None: ... -def set_memory_pool(pool: MemoryPool) -> None: ... -def sparse_union( - child_fields: Iterable[Field], type_codes: list[int] = ... -) -> SparseUnionType: ... -def string() -> DataType[str]: ... -def string_to_tzinfo(name: str) -> dt.tzinfo: ... -def struct(fields: Iterable[Field]) -> StructType: ... -def supported_memory_backends() -> list[str]: ... -def system_memory_pool() -> MemoryPool: ... -@overload -def table(df: pd.DataFrame, schema: Schema | None = ..., nthreads: int | None = ...) -> Table: ... -@overload -def table(data: RecordBatch, schema: Schema | None = ..., nthreads: int | None = ...) -> Table: ... -@overload -def table( - arrays: list[Array], - schema: Schema, - metadata: dict | None = ..., - nthreads: int | None = ..., -) -> Table: ... -@overload -def table( - arrays: list[Array], - names: list[str], - metadata: dict | None = ..., - nthreads: int | None = ..., -) -> Table: ... -def table_to_blocks( - options: dict, table: Table, categories: list[str], extension_columns: list[str] -) -> list[dict]: ... -def time32(unit: Literal["s", "ms"]) -> DataType[dt.time]: ... -def time64(unit: Literal["us", "ns"]) -> DataType[dt.time]: ... -def timestamp(unit, tz=...) -> Any: ... -def tobytes(o: str | bytes) -> bytes: ... -def total_allocated_bytes() -> int: ... -def transcoding_input_stream(stream, src_encoding, dest_encoding) -> Any: ... -def type_for_alias(name: str) -> DataType: ... -def tzinfo_to_string(tz: dt.tzinfo) -> str: ... -def uint16() -> DataType[int]: ... -def uint32() -> DataType[int]: ... -def uint64() -> DataType[int]: ... -def uint8() -> DataType[int]: ... -def unify_schemas(schemas: list[Schema]) -> Schema: ... -def union( - child_fields: Iterable[Field], - mode: Literal["sparse", "dense"], - type_codes: list[int] | None = ..., -) -> UnionType: ... -def unregister_extension_type(type_name: str) -> None: ... -def utf8() -> DataType[str]: ... -def write_tensor(tensor: Tensor, dest: NativeFile) -> None: ... +def is_threading_enabled() -> bool: ... 
+ +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index e205c825b03..1b2d277214d 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,18 +1,12 @@ -from io import IOBase - -from pyarrow._orc import ORCReader as _ORCReader -from pyarrow._orc import ORCWriter as _ORCWriter -from pyarrow.lib import KeyValueMetadata -from pyarrow.lib import NativeFile -from pyarrow.lib import RecordBatch -from pyarrow.lib import Schema -from pyarrow.lib import Table +from typing import IO, Literal, Self +from . import _orc from ._fs import FileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: - reader: _ORCReader - def __init__(self, source: str | NativeFile | IOBase) -> None: ... + reader: _orc.ORCReader + def __init__(self, source: str | NativeFile | IO) -> None: ... @property def metadata(self) -> KeyValueMetadata: ... @property @@ -26,11 +20,11 @@ class ORCFile: @property def software_version(self) -> str: ... @property - def compression(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... @property def compression_size(self) -> int: ... @property - def writer(self) -> str | int: ... + def writer(self) -> str: ... @property def writer_version(self) -> str: ... @property @@ -47,53 +41,51 @@ class ORCFile: def file_postscript_length(self) -> int: ... @property def file_length(self) -> int: ... - def read_stripe(self, n: int, columns: list[str] | None = ...) -> RecordBatch: ... - def read(self, columns: list[str] | None = ...) -> Table: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... class ORCWriter: - __doc__: str + writer: _orc.ORCWriter is_open: bool - writer: _ORCWriter def __init__( self, - where: str | NativeFile | IOBase, + where: str | NativeFile | IO, *, - file_version: str = ..., - batch_size: int = ..., - stripe_size: int = ..., - compression: str = ..., - compression_block_size: int = ..., - compression_strategy: str = ..., - row_index_stride: int = ..., - padding_tolerance: float = ..., - dictionary_key_size_threshold: float = ..., - bloom_filter_columns: list[str] | None = ..., - bloom_filter_fpp: float = ..., - ) -> None: ... - def __del__(self) -> None: ... - def __enter__(self) -> ORCWriter: ... 
+ file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... def write(self, table: Table) -> None: ... def close(self) -> None: ... def read_table( - source: str | NativeFile | IOBase, - columns: list[str] | None = ..., - filesystem: str | FileSystem | None = ..., + source: str | NativeFile | IO, + columns: list[str] | None = None, + filesystem: FileSystem | None = None, ) -> Table: ... def write_table( table: Table, - where: str | NativeFile | IOBase, + where: str | NativeFile | IO, *, - file_version: str = ..., - batch_size: int = ..., - stripe_size: int = ..., - compression: str = ..., - compression_block_size: int = ..., - compression_strategy: str = ..., - row_index_stride: int = ..., - padding_tolerance: float = ..., - dictionary_key_size_threshold: float = ..., - bloom_filter_columns: list[str] | None = ..., - bloom_filter_fpp: float = ..., + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, ) -> None: ... diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi index 76655afd361..453f48138f9 100644 --- a/pyarrow-stubs/pandas_compat.pyi +++ b/pyarrow-stubs/pandas_compat.pyi @@ -1,31 +1,28 @@ -from typing import Any -from typing import Callable +from typing import Any, TypedDict, TypeVar -import numpy as np import pandas as pd -from pandas.core.internals import BlockManager -from pyarrow.lib import Array -from pyarrow.lib import DataType -from pyarrow.lib import Schema -from pyarrow.lib import Table -from pyarrow.lib import _ArrowType -from pyarrow.lib import frombytes as frombytes -from typing_extensions import TypedDict +from pandas import DatetimeTZDtype -class _SerializedDict(TypedDict): - blocks: list[Any] - axes: list[Any] +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None -def get_logical_type_map() -> dict[_ArrowType, str]: ... -def get_logical_type(arrow_type: _ArrowType) -> str: ... -def get_logical_type_from_numpy(pandas_collection: pd.Series | pd.Index) -> str: ... -def get_extension_dtype_info( - column: pd.Series | pd.Index, -) -> tuple[str, dict[str, Any] | None]: ... def get_column_metadata( column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str -) -> dict[str, Any]: ... +) -> _ColumnMetadata: ... 
def construct_metadata( columns_to_convert: list[pd.Series], df: pd.DataFrame, @@ -36,27 +33,19 @@ def construct_metadata( types: list[DataType], ) -> dict[bytes, bytes]: ... def dataframe_to_types( - df: pd.DataFrame, preserve_index: bool, columns: list[str] | None = ... + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None ) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... def dataframe_to_arrays( df: pd.DataFrame, schema: Schema, - preserve_index: bool, - nthreads: int = ..., - columns: list[str] | None = ..., - safe: bool = ..., -) -> tuple[Array, Schema, int | None]: ... -def get_datetimetz_type( - values: pd.Series | pd.Index, dtype: np.dtype, type_: DataType | None -) -> tuple[pd.Series | pd.Index, DataType]: ... -def dataframe_to_serialized_dict(frame: pd.DataFrame) -> _SerializedDict: ... -def serialized_dict_to_dataframe(data: _SerializedDict) -> pd.DataFrame: ... -def make_datetimetz(tz: str) -> pd.DatetimeTZDtype: ... -def table_to_blockmanager( - options: dict, - table: Table, - categories: list[str] | None = ..., - ignore_metadata: bool = ..., - types_mapper: Callable[[DataType], np.generic] | None = ..., -) -> BlockManager: ... + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... diff --git a/pyarrow-stubs/parquet/__init__.pyi b/pyarrow-stubs/parquet/__init__.pyi index 151ee188f84..e69de29bb2d 100644 --- a/pyarrow-stubs/parquet/__init__.pyi +++ b/pyarrow-stubs/parquet/__init__.pyi @@ -1 +0,0 @@ -from .core import * # noqa diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 30494d61ea8..1611c822a18 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -1,402 +1,292 @@ -import pathlib +from pathlib import Path +from typing import IO, Callable, Iterator, Literal, Self, Sequence -from io import IOBase -from os import PathLike -from typing import Callable -from typing import Generator -from typing import Generic -from typing import TypeVar +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import Compression, FilterTuple +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated -import pyarrow +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) -from _typeshed import 
Incomplete -from pyarrow import Array -from pyarrow import NativeFile -from pyarrow import RecordBatch -from pyarrow import Schema -from pyarrow import Table -from pyarrow._parquet import ColumnChunkMetaData as ColumnChunkMetaData -from pyarrow._parquet import ColumnSchema as ColumnSchema -from pyarrow._parquet import FileDecryptionProperties as FileDecryptionProperties -from pyarrow._parquet import FileEncryptionProperties as FileEncryptionProperties -from pyarrow._parquet import FileMetaData as FileMetaData -from pyarrow._parquet import ParquetLogicalType as ParquetLogicalType -from pyarrow._parquet import ParquetReader as ParquetReader -from pyarrow._parquet import ParquetSchema as ParquetSchema -from pyarrow._parquet import RowGroupMetaData as RowGroupMetaData -from pyarrow._parquet import Statistics as Statistics -from pyarrow.compute import Expression -from pyarrow.dataset import Partitioning -from pyarrow.fs import FileSystem -from typing_extensions import Literal -from typing_extensions import TypeAlias - -def filters_to_expression( - filters: list[tuple[str, str, str] | list[tuple[str, str, str]]], -) -> Expression: ... +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... +@deprecated("use filters_to_expression") +def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... class ParquetFile: reader: ParquetReader - common_metadata: FileMetaData | None + common_metadata: FileMetaData + def __init__( self, - source: str | PathLike | pyarrow.NativeFile | IOBase, + source: str | Path | NativeFile | IO, *, - metadata: FileMetaData | None = ..., - common_metadata: FileMetaData | None = ..., - read_dictionary: list[str] | None = ..., - memory_map: bool = ..., - buffer_size: int = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., - decryption_properties: FileDecryptionProperties | None = ..., - thrift_string_size_limit: int | None = ..., - thrift_container_size_limit: int | None = ..., - ) -> None: ... - def __enter__(self) -> ParquetFile: ... + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: FileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... @property - def metadata(self) -> FileMetaData | None: ... + def metadata(self) -> FileMetaData: ... @property def schema(self) -> ParquetSchema: ... @property - def schema_arrow(self) -> pyarrow.Schema: ... + def schema_arrow(self) -> Schema: ... @property def num_row_groups(self) -> int: ... - def close(self, force: bool = ...) -> None: ... + def close(self, force: bool = False) -> None: ... @property def closed(self) -> bool: ... def read_row_group( self, i: int, - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> pyarrow.Table: ... + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... 
def read_row_groups( self, - row_groups: list[str], - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> pyarrow.Table: ... + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... def iter_batches( self, - batch_size: int = ..., - row_groups: list[str] | None = ..., - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> Generator[pyarrow.RecordBatch, None, None]: ... + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... def read( self, - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> pyarrow.Table: ... - def scan_contents(self, columns: list[int] | None = ..., batch_size: int = ...) -> int: ... - -_COMPRESSION: TypeAlias = Literal["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"] + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: ... class ParquetWriter: - flavor: Literal["spark"] | None + flavor: str schema_changed: bool - schema: pyarrow.Schema - where: str | PathLike | IOBase - file_handle: Incomplete - writer: Incomplete + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter is_open: bool + def __init__( self, - where: str | PathLike | IOBase, - schema: pyarrow.Schema, - filesystem: FileSystem | None = ..., - flavor: Literal["spark"] | None = ..., - version: str = ..., - use_dictionary: bool | list[str] = ..., - compression: _COMPRESSION | dict[str, _COMPRESSION] = ..., - write_statistics: bool | list[bool] = ..., - use_deprecated_int96_timestamps: bool | None = ..., - compression_level: int | dict[str, int] | None = ..., - use_byte_stream_split: bool | list[str] = ..., - column_encoding: str | dict[str, str] | None = ..., - writer_engine_version: str | None = ..., + where: str | Path | IO, + schema: Schema, + filesystem: FileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: Compression = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, data_page_version: Literal["1.0", "2.0"] = ..., - use_compliant_nested_type: bool = ..., - encryption_properties: FileEncryptionProperties | None = ..., - write_batch_size: int | None = ..., - dictionary_pagesize_limit: int | None = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, **options, ) -> None: ... - def __del__(self) -> None: ... - def __enter__(self) -> ParquetWriter: ... - def __exit__(self, *args, **kwargs): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... 
def write( - self, - table_or_batch: Table | RecordBatch, - row_group_size: int | None = ..., + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None ) -> None: ... - def write_batch(self, batch: RecordBatch, row_group_size: int | None = ...) -> None: ... - def write_table(self, table: Table, row_group_size: int | None = ...) -> None: ... + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... def close(self) -> None: ... - -class ParquetDatasetPiece: - def __init__( - self, - path: str | pathlib.Path, - open_file_func: Callable = ..., - file_options: dict | None = ..., - row_group: int | None = ..., - partition_keys: list[tuple[str, str]] | None = ..., - ) -> None: ... - def __eq__(self, other) -> bool: ... - def get_metadata(self) -> FileMetaData: ... - def open(self) -> ParquetFile: ... - def read( - self, - columns: list[str] | None = ..., - use_threads: bool = ..., - partitions: ParquetPartitions | None = ..., - file: IOBase | None = ..., - use_pandas_metadata: bool = ..., - ) -> Table: ... - -_K = TypeVar("_K") - -class PartitionSet(Generic[_K]): - name: str - keys: list[_K] - key_indices: dict[_K, int] - def __init__(self, name: str, keys: list[_K] | None = ...) -> None: ... - def get_index(self, key: _K) -> int: ... - @property - def dictionary(self) -> Array: ... - @property - def is_sorted(self) -> bool: ... - -_PPK = TypeVar("_PPK", str, int) - -class ParquetPartitions(Generic[_PPK]): - levels: list[PartitionSet[_PPK]] - partition_names: set[str] - def __init__(self) -> None: ... - def __len__(self) -> int: ... - def __getitem__(self, i): ... - def equals(self, other: ParquetPartitions) -> bool: ... - def __eq__(self, other) -> bool: ... - def get_index(self, level: int, name: str, key: _PPK) -> int: ... - def filter_accepts_partition(self, part_key, filter, level: int) -> bool: ... - -class ParquetManifest: - filesystem: Incomplete - open_file_func: Incomplete - pathsep: Incomplete - dirpath: Incomplete - partition_scheme: Incomplete - partitions: Incomplete - pieces: Incomplete - common_metadata_path: Incomplete - metadata_path: Incomplete - def __init__( - self, - dirpath, - open_file_func: Incomplete | None = ..., - filesystem: Incomplete | None = ..., - pathsep: str = ..., - partition_scheme: str = ..., - metadata_nthreads: int = ..., - ) -> None: ... - -class _ParquetDatasetMetadata: ... + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... class ParquetDataset: - paths: list[str] - split_row_groups: bool - - def __new__( - cls, - path_or_paths: str | list[str] | None = ..., - filesystem: FileSystem | None = ..., - schema: Schema | None = ..., - metadata: FileMetaData | None = ..., - split_row_groups: bool = ..., - validate_schema: bool = ..., - filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., - metadata_nthreads: int | None = ..., - read_dictionary: list[str] | None = ..., - memory_map: bool = ..., - buffer_size: int = ..., - partitioning: str = ..., - use_legacy_dataset: bool | None = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., - thrift_string_size_limit: int | None = ..., - thrift_container_size_limit: int | None = ..., - ): ... - def equals(self, other) -> bool: ... - def __eq__(self, other) -> bool: ... - def validate_schemas(self) -> None: ... 
- def read( - self, - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., - ) -> Table: ... - def read_pandas(self, **kwargs) -> Table: ... - @property - def pieces(self): ... - @property - def partitions(self): ... - @property - def schema(self): ... - @property - def memory_map(self): ... - @property - def read_dictionary(self): ... - @property - def buffer_size(self): ... - @property - def fs(self): ... - @property - def metadata(self): ... - @property - def metadata_path(self): ... - @property - def common_metadata_path(self): ... - @property - def common_metadata(self): ... - @property - def fragments(self) -> None: ... - @property - def files(self) -> None: ... - @property - def filesystem(self) -> None: ... - @property - def partitioning(self) -> None: ... - -class _ParquetDatasetV2: def __init__( self, path_or_paths: str | list[str], - filesystem: FileSystem | None = ..., + filesystem: FileSystem | None = None, + schema: Schema | None = None, *, - filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., - partitioning: str = ..., - read_dictionary: list[str] | None = ..., - buffer_size: int | None = ..., - memory_map: bool = ..., - ignore_prefixes: list[str] | None = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., - schema: Schema | None = ..., - decryption_properties: FileDecryptionProperties | None = ..., - thrift_string_size_limit: Incomplete | None = ..., - thrift_container_size_limit: Incomplete | None = ..., - **kwargs, - ) -> None: ... + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + use_legacy_dataset: bool | None = None, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... @property def schema(self) -> Schema: ... def read( self, - columns: list[str] | None = ..., - use_threads: bool = ..., - use_pandas_metadata: bool = ..., + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, ) -> Table: ... def read_pandas(self, **kwargs) -> Table: ... @property - def pieces(self): ... - @property - def fragments(self): ... + def fragments(self) -> list[ParquetFileFragment]: ... @property - def files(self): ... + def files(self) -> list[str]: ... @property - def filesystem(self): ... + def filesystem(self) -> FileSystem: ... @property - def partitioning(self): ... + def partitioning(self) -> Partitioning: ... 
def read_table( - source: str | NativeFile | IOBase, + source: str | Path | NativeFile | IO, *, - columns: list[str] | None = ..., - use_threads: bool = ..., - metadata: FileMetaData | None = ..., - schema: Schema | None = ..., - use_pandas_metadata: bool = ..., - memory_map: bool = ..., - read_dictionary: list[str] | None = ..., - filesystem: FileSystem | None = ..., - filters: list[tuple[str, str, str] | list[tuple[str, str, str]]] | None = ..., - buffer_size: int = ..., - partitioning: str = ..., - use_legacy_dataset: bool = ..., - ignore_prefixes: list[str] | None = ..., - pre_buffer: bool = ..., - coerce_int96_timestamp_unit: Literal["ms", "ns"] | None = ..., - decryption_properties: FileDecryptionProperties | None = ..., - thrift_string_size_limit: int | None = ..., - thrift_container_size_limit: int | None = ..., + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning = "hive", + filesystem: FileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + use_legacy_dataset: bool | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, ) -> Table: ... def read_pandas( - source: str | NativeFile | IOBase, columns: list[str] | None = ..., **kwargs + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs ) -> Table: ... 
def write_table( table: Table, - where: str | NativeFile, - row_group_size: int | None = ..., - version: str = ..., - use_dictionary: bool | list[str] = ..., - compression: str = ..., - write_statistics: bool = ..., - use_deprecated_int96_timestamps: bool | None = ..., - coerce_timestamps: str | None = ..., - allow_truncated_timestamps: bool = ..., - data_page_size: int | None = ..., - flavor: Literal["spark"] | None = ..., - filesystem: FileSystem | None = ..., - compression_level: int | dict[str, int] | None = ..., - use_byte_stream_split: bool = ..., - column_encoding: str | dict[str, str] | None = ..., - data_page_version: str = ..., - use_compliant_nested_type: bool = ..., - encryption_properties: FileEncryptionProperties | None = ..., - write_batch_size: int | None = ..., - dictionary_pagesize_limit: int | None = ..., + where: str | Path | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: Compression = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: FileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, **kwargs, ) -> None: ... def write_to_dataset( table: Table, - root_path: str | pathlib.Path, - partition_cols: list[str] | None = ..., - partition_filename_cb: Callable | None = ..., - filesystem: FileSystem | None = ..., - use_legacy_dataset: bool | None = ..., - schema: Schema | None = ..., - partitioning: list[str] | Partitioning | None = ..., - basename_template: str | None = ..., - use_threads: bool | None = ..., - file_visitor: Callable | None = ..., + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: FileSystem | None = None, + use_legacy_dataset: bool | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] - | None = ..., + | None = None, **kwargs, ) -> None: ... def write_metadata( schema: Schema, where: str | NativeFile, - metadata_collector: list | None = ..., + metadata_collector: list[FileMetaData] | None = None, + filesystem: FileSystem | None = None, **kwargs, ) -> None: ... def read_metadata( - where, - memory_map: bool = ..., - decryption_properties: FileDecryptionProperties | None = ..., - filesystem: Incomplete | None = ..., + where: str | Path | IO, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: FileSystem | None = None, ) -> FileMetaData: ... 
def read_schema( - where: str | IOBase, - memory_map: bool = ..., - decryption_properties: FileDecryptionProperties | None = ..., - filesystem: FileSystem | None = ..., -) -> Schema: ... - -# Names in __all__ with no definition: -# _filters_to_expression + where: str | Path | IO, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: FileSystem | None = None, +) -> FileMetaData: ... diff --git a/pyarrow-stubs/parquet/encryption.pyi b/pyarrow-stubs/parquet/encryption.pyi index 713edb3aa6a..5a77dae7ef7 100644 --- a/pyarrow-stubs/parquet/encryption.pyi +++ b/pyarrow-stubs/parquet/encryption.pyi @@ -1,5 +1,15 @@ -from pyarrow._parquet_encryption import CryptoFactory as CryptoFactory -from pyarrow._parquet_encryption import DecryptionConfiguration as DecryptionConfiguration -from pyarrow._parquet_encryption import EncryptionConfiguration as EncryptionConfiguration -from pyarrow._parquet_encryption import KmsClient as KmsClient -from pyarrow._parquet_encryption import KmsConnectionConfig as KmsConnectionConfig +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/pyarrow-stubs/plasma.pyi b/pyarrow-stubs/plasma.pyi deleted file mode 100644 index 76f41c9d7a8..00000000000 --- a/pyarrow-stubs/plasma.pyi +++ /dev/null @@ -1,26 +0,0 @@ -from collections.abc import Generator -from subprocess import Popen -from types import ModuleType - -from pyarrow._plasma import ObjectID as ObjectID -from pyarrow._plasma import ObjectNotAvailable as ObjectNotAvailable -from pyarrow._plasma import PlasmaBuffer as PlasmaBuffer -from pyarrow._plasma import PlasmaClient as PlasmaClient -from pyarrow._plasma import PlasmaObjectExists as PlasmaObjectExists -from pyarrow._plasma import PlasmaObjectNotFound as PlasmaObjectNotFound -from pyarrow._plasma import PlasmaStoreFull as PlasmaStoreFull -from pyarrow._plasma import connect as connect - -TF_PLASMA_OP_PATH: str -tf_plasma_op: ModuleType | None - -def load_plasma_tensorflow_op() -> None: ... -def build_plasma_tensorflow_op() -> None: ... -def start_plasma_store( - plasma_store_memory: int, - use_valgrind: bool = ..., - use_profiler: bool = ..., - plasma_directory: str | None = ..., - use_hugepages: bool = ..., - external_store: str | None = ..., -) -> Generator[tuple[str, Popen[str]], None, None]: ... diff --git a/pyarrow-stubs/serialization.pyi b/pyarrow-stubs/serialization.pyi deleted file mode 100644 index f40af9d8c4c..00000000000 --- a/pyarrow-stubs/serialization.pyi +++ /dev/null @@ -1,16 +0,0 @@ -from pyarrow.lib import SerializationContext as SerializationContext -from pyarrow.lib import builtin_pickle as builtin_pickle -from pyarrow.lib import py_buffer as py_buffer - -try: - import cloudpickle # type: ignore -except ImportError: - cloudpickle = builtin_pickle - -def register_torch_serialization_handlers( - serialization_context: SerializationContext, -): ... -def register_default_serialization_handlers( - serialization_context: SerializationContext, -) -> None: ... -def default_serialization_context() -> SerializationContext: ... 
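Usage sketch (illustrative, not part of the patch): the reworked pyarrow.parquet signatures above — str | Path | IO sources, tuple- or Expression-based filters, codec names typed through the Compression alias — are aimed at ordinary read/write code like the following. File and column names here are made up, and the snippet assumes a pyarrow >= 17 runtime, matching the pinned dependency.

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # write_table: codec names ("snappy", "zstd", ...) are covered by the Compression alias
    pq.write_table(table, "example.parquet", compression="zstd")

    # read_table: filters accept (column, op, value) tuples or a dataset Expression
    subset = pq.read_table("example.parquet", columns=["id"], filters=[("id", ">", 1)])

    # ParquetFile.iter_batches is annotated to yield RecordBatch
    with pq.ParquetFile("example.parquet") as f:
        for batch in f.iter_batches(batch_size=1024):
            print(batch.num_rows)
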
diff --git a/pyarrow-stubs/substrait.pyi b/pyarrow-stubs/substrait.pyi index da9956b89d7..860fe70b827 100644 --- a/pyarrow-stubs/substrait.pyi +++ b/pyarrow-stubs/substrait.pyi @@ -1,2 +1,15 @@ -from pyarrow._substrait import get_supported_functions as get_supported_functions -from pyarrow._substrait import run_query as run_query +from pyarrow._substrait import ( + BoundExpressions, + deserialize_expressions, + get_supported_functions, + run_query, + serialize_expressions, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", +] diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi index 8c981fde8ef..23f0a8984fe 100644 --- a/pyarrow-stubs/types.pyi +++ b/pyarrow-stubs/types.pyi @@ -1,7 +1,4 @@ from pyarrow.lib import DataType -from pyarrow.lib import is_boolean_value as is_boolean_value -from pyarrow.lib import is_float_value as is_float_value -from pyarrow.lib import is_integer_value as is_integer_value def is_null(t: DataType) -> bool: ... def is_boolean(t: DataType) -> bool: ... @@ -23,9 +20,12 @@ def is_float64(t: DataType) -> bool: ... def is_list(t: DataType) -> bool: ... def is_large_list(t: DataType) -> bool: ... def is_fixed_size_list(t: DataType) -> bool: ... +def is_list_view(t: DataType) -> bool: ... +def is_large_list_view(t: DataType) -> bool: ... def is_struct(t: DataType) -> bool: ... def is_union(t: DataType) -> bool: ... def is_nested(t: DataType) -> bool: ... +def is_run_end_encoded(t: DataType) -> bool: ... def is_temporal(t: DataType) -> bool: ... def is_timestamp(t: DataType) -> bool: ... def is_duration(t: DataType) -> bool: ... @@ -39,6 +39,8 @@ def is_string(t: DataType) -> bool: ... def is_large_unicode(t: DataType) -> bool: ... def is_large_string(t: DataType) -> bool: ... def is_fixed_size_binary(t: DataType) -> bool: ... +def is_binary_view(t: DataType) -> bool: ... +def is_string_view(t: DataType) -> bool: ... def is_date(t: DataType) -> bool: ... def is_date32(t: DataType) -> bool: ... def is_date64(t: DataType) -> bool: ... diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi index cd8f7f2b5fc..00d27837c04 100644 --- a/pyarrow-stubs/util.pyi +++ b/pyarrow-stubs/util.pyi @@ -1,13 +1,25 @@ -from collections.abc import Sequence -from typing import Callable -from typing import TypeVar +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar -_T = TypeVar("_T") +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") -def implements(f: Callable): ... -def product(seq: Sequence[_T]) -> _T: ... +class _DocStringComponents(Protocol): + _docstring_components: list[str] + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... def get_contiguous_span( shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int ) -> tuple[int, int]: ... def find_free_port() -> int: ... def guid() -> str: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... 
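The substrait, types, and util stubs above add explicit __all__ re-exports, predicates for the newer Arrow types (list views, run-end encoding, binary/string views), and a typed doc() decorator. A small sanity-check sketch of the new predicates, again illustrative only and assuming a pyarrow >= 17 runtime:

    import pyarrow as pa

    # the added predicates mirror the newer type factories in recent pyarrow releases
    print(pa.types.is_string_view(pa.string_view()))                               # True
    print(pa.types.is_binary_view(pa.binary_view()))                               # True
    print(pa.types.is_run_end_encoded(pa.run_end_encoded(pa.int32(), pa.int64()))) # True
    print(pa.types.is_list_view(pa.list_view(pa.int32())))                         # True
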
diff --git a/pyproject.toml b/pyproject.toml index 7fe9a1fabce..e70801962f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] requires-python = ">=3.8,<4" +dependencies = ["pyarrow >=17"] [project.urls] homepage = "https://github.com/zen-xu/pyarrow-stubs" @@ -37,9 +38,14 @@ platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] [tool.pixi.pypi-dependencies] pyarrow-stubs = { path = ".", editable = true } +ipython = "*" +scipy = "*" pre-commit = "*" mypy = ">=1.11" ruff = ">=0.5" +types-cffi = "*" +pandas-stubs = "*" +hatchling = "*" [tool.ruff] fix = true @@ -47,12 +53,18 @@ line-length = 99 target-version = "py38" [tool.ruff.lint] -select = [ - "I", # isort +extend-select = [ + "I", # isort + "N", # pep8-naming + "PYI", # flake8-pyi +] +ignore = [ + "PYI015", # assignment-default-in-stub + "PYI011", # typed-argument-default-in-stub + "N818", # error-suffix-on-exception-name ] [tool.ruff.lint.isort] -force-single-line = true lines-after-imports = 2 lines-between-types = 1 From 0f10f86add21331157e40862a7b6a2cc615f7b21 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 12:51:37 +0800 Subject: [PATCH 050/231] remove check-mypy.sh (#49) --- check-mypy.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 check-mypy.sh diff --git a/check-mypy.sh b/check-mypy.sh deleted file mode 100755 index 25fae94bd73..00000000000 --- a/check-mypy.sh +++ /dev/null @@ -1,2 +0,0 @@ -#! /bin/bash -mypy pyarrow-stubs From 619f3991acf53b4300ecc0f67df486d503880230 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 13:12:36 +0800 Subject: [PATCH 051/231] release 20240828 (#50) --- .github/workflows/release.yaml | 15 +++++---------- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index de0d87b1833..eb617672588 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,19 +5,14 @@ on: tags: - "*.*.*" +env: + HATCH_INDEX_USER: __token__ + HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }} + jobs: release: name: "release ${{github.ref_name}}" runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: "3.7" - - uses: abatilo/actions-poetry@v2 - with: - poetry-version: "1.4.2" - - name: publish - run: | - poetry build -f wheel - poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }} + - uses: quality-specialist/hatch-action@v1 diff --git a/pyproject.toml b/pyproject.toml index e70801962f5..ab1a7fc11b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "10.0.1.9" +version = "20240828" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 0bd8b1cd1b12979a478e0384ae00c451c50ac453 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 13:14:17 +0800 Subject: [PATCH 052/231] fix release tag (#51) --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index eb617672588..81a7c0a8ad3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -3,7 +3,7 @@ name: Release on: push: tags: - - "*.*.*" + - "*" env: HATCH_INDEX_USER: __token__ From 681c84bc58986f66e6c382b5dcdd44c42c95695a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 
2024 13:23:37 +0800 Subject: [PATCH 053/231] ci: install hatch by pip (#52) --- .github/workflows/release.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 81a7c0a8ad3..c2224c42022 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -15,4 +15,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: quality-specialist/hatch-action@v1 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install hatch + run: | + python -m pip install hatch + - name: Build dist + run: hatch build + - name: Publish on PyPI + run: hatch publish From 64c39f84d0df31adbf922fb6236b4ef57e47cdf1 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 13:32:37 +0800 Subject: [PATCH 054/231] ci: fix hatch keyring (#53) --- .github/workflows/release.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c2224c42022..27f5431fbdc 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,10 +5,6 @@ on: tags: - "*" -env: - HATCH_INDEX_USER: __token__ - HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }} - jobs: release: name: "release ${{github.ref_name}}" @@ -25,4 +21,4 @@ jobs: - name: Build dist run: hatch build - name: Publish on PyPI - run: hatch publish + run: hatch publish -u __token__ -a ${{ secrets.PYPI_TOKEN }} From 14c5854b73d75490ade00404512cbeef889bb5f2 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 28 Aug 2024 13:41:10 +0800 Subject: [PATCH 055/231] ci: use Release environment (#54) --- .github/workflows/release.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 27f5431fbdc..ec40186f30d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -9,6 +9,8 @@ jobs: release: name: "release ${{github.ref_name}}" runs-on: ubuntu-latest + environment: + name: Release steps: - uses: actions/checkout@v3 - name: Set up Python From 9c157a63d8960db2489db369a76a14db116ed277 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 30 Aug 2024 10:48:43 +0800 Subject: [PATCH 056/231] remove Scalar generic type var _IsValid (#56) * remove Scalar generic type var _IsValid --- pixi.lock | 4 +- pyarrow-stubs/__lib_pxi/array.pyi | 10 +- pyarrow-stubs/__lib_pxi/scalar.pyi | 179 +++++++++++++---------------- pyarrow-stubs/__lib_pxi/table.pyi | 4 +- pyproject.toml | 1 + 5 files changed, 93 insertions(+), 105 deletions(-) diff --git a/pixi.lock b/pixi.lock index d7d491171fc..e2cbc0d415e 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: 10.0.1.9 + version: '20240828' path: . 
- sha256: 5c30ac8c8008518b3a446a57a76cfde327f6fc5b7d4ab9db5deea86294d4b3b2 + sha256: 94683bcd78fcecd7a11e79fd433e5bc498768bce83e286b25a9b50f6f943b83a requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 16aed96cd66..5a45018302c 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1,4 +1,4 @@ -# mypy: disable-error-code="overload-overlap" +# mypy: disable-error-code="overload-overlap,misc" import datetime as dt @@ -1189,8 +1189,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def is_nan(self) -> BooleanArray: ... def is_valid(self) -> BooleanArray: ... def fill_null( - self: Array[Scalar[_BasicDataType[_AsPyType], Any]], fill_value: _AsPyType - ) -> Array[Scalar[_BasicDataType[_AsPyType], Any]]: ... + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... @overload def __getitem__(self, key: int) -> _ScalarT: ... @overload @@ -1215,7 +1215,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): ) -> scalar.Int64Scalar: ... @overload def index( - self: Array[Scalar[_BasicDataType[_AsPyType], Any]], + self: Array[Scalar[_BasicDataType[_AsPyType]]], value: _AsPyType, start: int | None = None, end: int | None = None, @@ -1226,7 +1226,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: ... def to_pylist( - self: Array[Scalar[_BasicDataType[_AsPyType], Any]], + self: Array[Scalar[_BasicDataType[_AsPyType]]], ) -> list[_AsPyType | None]: ... tolist = to_pylist def validate(self, *, full: bool = False) -> None: ... diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 12fafd623bf..2b8babcc29b 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -3,7 +3,7 @@ import collections.abc import datetime as dt from decimal import Decimal -from typing import Any, Generic, Iterator, Literal, Mapping, Self, TypeAlias, overload +from typing import Any, Generic, Iterator, Mapping, Self, TypeAlias, overload import numpy as np @@ -12,17 +12,24 @@ from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakre from typing_extensions import TypeVar from . import types -from .types import _AsPyType, _DataTypeT, _NewDataTypeT, _Time32Unit, _Time64Unit, _Tz, _Unit +from .types import ( + _AsPyType, + _DataTypeT, + _NewDataTypeT, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) -_IsValid = TypeVar("_IsValid", default=Literal[True]) _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") -class Scalar(_Weakrefable, Generic[_DataTypeT, _IsValid]): +class Scalar(_Weakrefable, Generic[_DataTypeT]): @property def type(self) -> _DataTypeT: ... @property - def is_valid(self) -> _IsValid: ... + def is_valid(self) -> bool: ... @overload def cast( self, @@ -38,15 +45,15 @@ class Scalar(_Weakrefable, Generic[_DataTypeT, _IsValid]): safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Scalar[_NewDataTypeT, _IsValid]: ... + ) -> Scalar[_NewDataTypeT]: ... def validate(self, *, full: bool = False) -> None: ... def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... 
@overload - def as_py(self: Scalar[types._BasicDataType[_AsPyType], Literal[True]]) -> _AsPyType: ... + def as_py(self: Scalar[types._BasicDataType[_AsPyType]]) -> _AsPyType: ... @overload def as_py( - self: Scalar[types.ListType[types._BasicDataType[_AsPyType]], Literal[True]], + self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], ) -> list[_AsPyType]: ... @overload def as_py( @@ -55,216 +62,193 @@ class Scalar(_Weakrefable, Generic[_DataTypeT, _IsValid]): types.DictionaryType[ types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV], Any ] - ], - Literal[True], + ] ], ) -> list[dict[_AsPyTypeK, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[ types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], - Literal[True], ], ) -> list[dict[Any, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[ types.ListType[types.DictionaryType[types._BasicDataType[_AsPyTypeK], Any, Any]], - Literal[True], ], ) -> list[dict[_AsPyTypeK, Any]]: ... @overload def as_py( - self: Scalar[types.StructType, Literal[True]], + self: Scalar[types.StructType], ) -> list[dict[str, Any]]: ... @overload def as_py( self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]], - Literal[True], + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] ], ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... @overload def as_py( - self: Scalar[ - types.MapType[Any, types._BasicDataType[_AsPyTypeV]], - Literal[True], - ], + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], ) -> list[tuple[Any, _AsPyTypeV]]: ... @overload def as_py( - self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], Any], - Literal[True], - ], + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], ) -> list[tuple[_AsPyTypeK, Any]]: ... @overload - def as_py(self: Scalar[Any, Literal[True]]) -> Any: ... - @overload - def as_py(self: Scalar[Any, Literal[False]]) -> None: ... + def as_py(self: Scalar[Any]) -> Any: ... _NULL: TypeAlias = None NA = _NULL -class NullScalar(Scalar[types.NullType, _IsValid]): ... -class BooleanScalar(Scalar[types.BoolType, _IsValid]): ... -class UInt8Scalar(Scalar[types.Uint8Type, _IsValid]): ... -class Int8Scalar(Scalar[types.Int8Type, _IsValid]): ... -class UInt16Scalar(Scalar[types.Uint16Type, _IsValid]): ... -class Int16Scalar(Scalar[types.Int16Type, _IsValid]): ... -class UInt32Scalar(Scalar[types.Uint32Type, _IsValid]): ... -class Int32Scalar(Scalar[types.Int32Type, _IsValid]): ... -class UInt64Scalar(Scalar[types.Uint64Type, _IsValid]): ... -class Int64Scalar(Scalar[types.Int64Type, _IsValid]): ... -class HalfFloatScalar(Scalar[types.Float16Type, _IsValid]): ... -class FloatScalar(Scalar[types.Float32Type, _IsValid]): ... -class DoubleScalar(Scalar[types.Float64Type, _IsValid]): ... -class Decimal128Scalar(Scalar[types.Decimal128Type, _IsValid]): ... -class Decimal256Scalar(Scalar[types.Decimal256Type, _IsValid]): ... -class Date32Scalar(Scalar[types.Date32Type, _IsValid]): ... - -class Date64Scalar(Scalar[types.Date64Type, _IsValid]): +class NullScalar(Scalar[types.NullType]): ... +class BooleanScalar(Scalar[types.BoolType]): ... +class UInt8Scalar(Scalar[types.Uint8Type]): ... +class Int8Scalar(Scalar[types.Int8Type]): ... +class UInt16Scalar(Scalar[types.Uint16Type]): ... +class Int16Scalar(Scalar[types.Int16Type]): ... +class UInt32Scalar(Scalar[types.Uint32Type]): ... +class Int32Scalar(Scalar[types.Int32Type]): ... +class UInt64Scalar(Scalar[types.Uint64Type]): ... 
+class Int64Scalar(Scalar[types.Int64Type]): ... +class HalfFloatScalar(Scalar[types.Float16Type]): ... +class FloatScalar(Scalar[types.Float32Type]): ... +class DoubleScalar(Scalar[types.Float64Type]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type]): ... +class Date32Scalar(Scalar[types.Date32Type]): ... + +class Date64Scalar(Scalar[types.Date64Type]): @property def value(self) -> dt.date | None: ... -class Time32Scalar(Scalar[types.Time32Type[_Time32Unit], _IsValid]): +class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): @property def value(self) -> dt.time | None: ... -class Time64Scalar(Scalar[types.Time64Type[_Time64Unit], _IsValid]): +class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): @property def value(self) -> dt.time | None: ... -class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz], _IsValid]): +class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): @property def value(self) -> int | None: ... -class DurationScalar(Scalar[types.DurationType[_Unit], _IsValid]): +class DurationScalar(Scalar[types.DurationType[_Unit]]): @property def value(self) -> dt.timedelta | None: ... -class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType, _IsValid]): +class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): @property def value(self) -> MonthDayNano | None: ... -class BinaryScalar(Scalar[types.BinaryType, _IsValid]): +class BinaryScalar(Scalar[types.BinaryType]): def as_buffer(self) -> Buffer: ... -class LargeBinaryScalar(Scalar[types.LargeBinaryType, _IsValid]): +class LargeBinaryScalar(Scalar[types.LargeBinaryType]): def as_buffer(self) -> Buffer: ... -class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType, _IsValid]): +class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): def as_buffer(self) -> Buffer: ... -class StringScalar(Scalar[types.StringType, _IsValid]): +class StringScalar(Scalar[types.StringType]): def as_buffer(self) -> Buffer: ... -class LargeStringScalar(Scalar[types.LargeStringType, _IsValid]): +class LargeStringScalar(Scalar[types.LargeStringType]): def as_buffer(self) -> Buffer: ... -class BinaryViewScalar(Scalar[types.BinaryViewType, _IsValid]): +class BinaryViewScalar(Scalar[types.BinaryViewType]): def as_buffer(self) -> Buffer: ... -class StringViewScalar(Scalar[types.StringViewType, _IsValid]): +class StringViewScalar(Scalar[types.StringViewType]): def as_buffer(self) -> Buffer: ... -class ListScalar(Scalar[types.ListType[_DataTypeT], _IsValid]): +class ListScalar(Scalar[types.ListType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size], _IsValid]): +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListScalar(Scalar[types.LargeListType[_DataTypeT], _IsValid]): +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... 
- def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class ListViewScalar(Scalar[types.ListViewType[_DataTypeT], _IsValid]): +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT], _IsValid]): +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT, _IsValid]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class StructScalar(Scalar[types.StructType, _IsValid], collections.abc.Mapping[str, Scalar]): +class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): def __len__(self) -> int: ... def __iter__(self) -> Iterator[str]: ... - def __getitem__(self, __key: str) -> Scalar[Any, _IsValid]: ... # type: ignore[override] + def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] def _as_py_tuple(self) -> list[tuple[str, Any]]: ... -class MapScalar(Scalar[types.MapType[types._K, types._ValueT], _IsValid]): +class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__( - self, i: int - ) -> tuple[Scalar[types._K, _IsValid], types._ValueT, Any, _IsValid]: ... + def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... @overload def __iter__( self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]], - _IsValid, + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] ], ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... @overload def __iter__( - self: Scalar[ - types.MapType[Any, types._BasicDataType[_AsPyTypeV]], - _IsValid, - ], + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... @overload def __iter__( - self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], Any], - _IsValid, - ], + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... -class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._ValueT], _IsValid]): +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._ValueT]]): @property - def index(self) -> Scalar[types._IndexT, _IsValid]: ... + def index(self) -> Scalar[types._IndexT]: ... @property - def value(self) -> Scalar[types._ValueT, _IsValid]: ... + def value(self) -> Scalar[types._ValueT]: ... @property def dictionary(self) -> Array: ... -class RunEndEncodedScalar( - Scalar[types.RunEndEncodedType[types._RunEndType, types._ValueT], _IsValid] -): +class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._ValueT]]): @property def value(self) -> tuple[int, int] | None: ... -class UnionScalar(Scalar[types.UnionType, _IsValid]): +class UnionScalar(Scalar[types.UnionType]): @property def value(self) -> Any | None: ... @property def type_code(self) -> str: ... 
-class ExtensionScalar(Scalar[types.ExtensionType, _IsValid]): +class ExtensionScalar(Scalar[types.ExtensionType]): @property def value(self) -> Any | None: ... @staticmethod def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... -class FixedShapeTensorScalar(ExtensionScalar[_IsValid]): +class FixedShapeTensorScalar(ExtensionScalar): def to_numpy(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... @@ -309,8 +293,11 @@ def scalar( value: dt.timedelta, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None ) -> DurationScalar: ... @overload -def scalar( # type: ignore[overload-overlap] - value: MonthDayNano, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +def scalar( + value: MonthDayNano, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> MonthDayNanoIntervalScalar: ... @overload def scalar( @@ -391,19 +378,19 @@ def scalar( ) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... @overload def scalar( - value: CollectionValue[_V], + value: CollectionValue, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[Any]: ... @overload def scalar( - value: _V, + value: Any, type: _DataTypeT, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> Scalar[_DataTypeT, _V]: ... +) -> Scalar[_DataTypeT]: ... __all__ = [ "Scalar", diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 10cc70f105f..fb8a95dd2a9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -100,7 +100,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop"): ... @overload def index( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType], Any]], + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], value: Scalar[_DataTypeT] | _AsPyType, start: int | None = None, end: int | None = None, @@ -128,7 +128,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def iterchunks(self) -> Generator[Array[_ScalarT], None, None]: ... def __iter__(self) -> Iterator[Array[_ScalarT]]: ... def to_pylist( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType], Any]], + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], ) -> list[_AsPyType | None]: ... def __arrow_c_stream__(self, requested_schema=None) -> Any: ... 
@classmethod diff --git a/pyproject.toml b/pyproject.toml index ab1a7fc11b6..f530f58fb3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,3 +76,4 @@ explicit_package_bases = true files = "pyarrow-stubs" namespace_packages = true show_error_codes = true +disable_error_code = ["overload-overlap", "import-not-found"] From 9e85dd0a25792acbcaec714a5f8f364e04ab7554 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 30 Aug 2024 11:23:08 +0800 Subject: [PATCH 057/231] make Array, Scalar, Types generic type var as covariant type (#57) --- pyarrow-stubs/__lib_pxi/array.pyi | 32 +++++++++++++------------ pyarrow-stubs/__lib_pxi/scalar.pyi | 34 +++++++++++++------------- pyarrow-stubs/__lib_pxi/table.pyi | 20 +++++++++------- pyarrow-stubs/__lib_pxi/types.pyi | 38 +++++++++++++----------------- 4 files changed, 62 insertions(+), 62 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 5a45018302c..5b4fd38f78c 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -40,6 +40,7 @@ from .types import ( MapType, _AsPyType, _BasicDataType, + _DataType_CoT, _DataTypeT, _IndexT, _RunEndType, @@ -1124,8 +1125,9 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): _CastAs = TypeVar("_CastAs", bound=DataType) _ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) -class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): def diff(self, other: Self) -> str: ... def cast( self, @@ -1135,7 +1137,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): memory_pool: MemoryPool | None = None, ) -> Array[Scalar[_CastAs]]: ... def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... - def sum(self, **kwargs) -> _ScalarT: ... + def sum(self, **kwargs) -> _Scalar_CoT: ... def unique(self) -> Self: ... def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... @overload @@ -1172,7 +1174,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def nbytes(self) -> int: ... def get_total_buffer_size(self) -> int: ... def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_ScalarT]: ... + def __iter__(self) -> Iterator[_Scalar_CoT]: ... def to_string( self, *, @@ -1192,7 +1194,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... @overload - def __getitem__(self, key: int) -> _ScalarT: ... + def __getitem__(self, key: int) -> _Scalar_CoT: ... @overload def __getitem__(self, key: slice) -> Self: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... @@ -1206,7 +1208,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): ) -> Self: ... @overload def index( - self, + self: Array[_ScalarT], value: _ScalarT, start: int | None = None, end: int | None = None, @@ -1260,9 +1262,9 @@ class BooleanArray(Array[scalar.BooleanScalar]): @property def true_count(self) -> int: ... -class NumericArray(Array[_ScalarT]): ... -class IntegerArray(NumericArray[_ScalarT]): ... -class FloatingPointArray(NumericArray[_ScalarT]): ... +class NumericArray(Array[_Scalar_CoT]): ... +class IntegerArray(NumericArray[_Scalar_CoT]): ... +class FloatingPointArray(NumericArray[_Scalar_CoT]): ... class Int8Array(IntegerArray[scalar.Int8Scalar]): ... 
class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... class Int16Array(IntegerArray[scalar.Int16Scalar]): ... @@ -1285,12 +1287,12 @@ class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... class Decimal128Array(FixedSizeBinaryArray): ... class Decimal256Array(FixedSizeBinaryArray): ... -class BaseListArray(Array[_ScalarT]): +class BaseListArray(Array[_Scalar_CoT]): def flatten(self, recursive: bool = False) -> Array: ... def value_parent_indices(self) -> Int64Array: ... def value_lengths(self) -> Int32Array: ... -class ListArray(BaseListArray[_ScalarT]): +class ListArray(BaseListArray[_Scalar_CoT]): @overload @classmethod def from_arrays( @@ -1318,7 +1320,7 @@ class ListArray(BaseListArray[_ScalarT]): @property def offsets(self) -> Int32Array: ... -class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): @overload @classmethod def from_arrays( @@ -1346,7 +1348,7 @@ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @property def offsets(self) -> Int64Array: ... -class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataType_CoT]]): @overload @classmethod def from_arrays( @@ -1376,7 +1378,7 @@ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): @property def sizes(self) -> Int32Array: ... -class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): @overload @classmethod def from_arrays( @@ -1406,7 +1408,7 @@ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @property def sizes(self) -> Int64Array: ... -class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataType_CoT, _Size]]): @overload @classmethod def from_arrays( @@ -1427,7 +1429,7 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S mask: Mask | None = None, ) -> FixedSizeListArray[_DataTypeT, _Size]: ... @property - def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: ... + def values(self) -> BaseListArray[scalar.ListScalar[_DataType_CoT]]: ... _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 2b8babcc29b..11887c06aa4 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -1,4 +1,4 @@ -# mypy: disable-error-code="overload-overlap" +# mypy: disable-error-code="overload-overlap,misc" import collections.abc import datetime as dt @@ -14,8 +14,8 @@ from typing_extensions import TypeVar from . import types from .types import ( _AsPyType, + _DataType_CoT, _DataTypeT, - _NewDataTypeT, _Time32Unit, _Time64Unit, _Tz, @@ -25,9 +25,9 @@ from .types import ( _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") -class Scalar(_Weakrefable, Generic[_DataTypeT]): +class Scalar(_Weakrefable, Generic[_DataType_CoT]): @property - def type(self) -> _DataTypeT: ... + def type(self) -> _DataType_CoT: ... @property def is_valid(self) -> bool: ... 
@overload @@ -41,11 +41,11 @@ class Scalar(_Weakrefable, Generic[_DataTypeT]): @overload def cast( self, - target_type: _NewDataTypeT, + target_type: _DataTypeT, safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Scalar[_NewDataTypeT]: ... + ) -> Scalar[_DataTypeT]: ... def validate(self, *, full: bool = False) -> None: ... def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... @@ -163,39 +163,39 @@ class BinaryViewScalar(Scalar[types.BinaryViewType]): class StringViewScalar(Scalar[types.StringViewType]): def as_buffer(self) -> Buffer: ... -class ListScalar(Scalar[types.ListType[_DataTypeT]]): +class ListScalar(Scalar[types.ListType[_DataType_CoT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... def __iter__(self) -> Iterator[Array]: ... -class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataType_CoT, types._Size]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): +class LargeListScalar(Scalar[types.LargeListType[_DataType_CoT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... def __iter__(self) -> Iterator[Array]: ... -class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): +class ListViewScalar(Scalar[types.ListViewType[_DataType_CoT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataType_CoT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... def __iter__(self) -> Iterator[Array]: ... class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): @@ -321,7 +321,7 @@ def scalar( memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.BinaryType]]: ... @overload -def scalar( # type: ignore[overload-overlap] +def scalar( value: CollectionValue[bool], *, from_pandas: bool | None = None, diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index fb8a95dd2a9..a9ef5ae4eb5 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -43,9 +43,9 @@ from .scalar import Int64Scalar, Scalar from .tensor import Tensor from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT -_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) -class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @property def data(self) -> Self: ... 
@property @@ -71,7 +71,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): @overload def __getitem__(self, key: slice) -> Self: ... @overload - def __getitem__(self, key: int) -> _ScalarT: ... + def __getitem__(self, key: int) -> _Scalar_CoT: ... def getitem(self, i: int) -> Scalar: ... def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: ... def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: ... @@ -93,8 +93,10 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): ) -> ChunkedArray[Scalar[_CastAs]]: ... def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> ChunkedArray[_ScalarT]: ... - def unique(self) -> ChunkedArray[_ScalarT]: ... + def combine_chunks( + self, memory_pool: MemoryPool | None = None + ) -> ChunkedArray[_Scalar_CoT]: ... + def unique(self) -> ChunkedArray[_Scalar_CoT]: ... def value_counts(self) -> StructArray: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop"): ... @@ -122,11 +124,11 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... @property def num_chunks(self) -> int: ... - def chunk(self, i: int) -> ChunkedArray[_ScalarT]: ... + def chunk(self, i: int) -> ChunkedArray[_Scalar_CoT]: ... @property - def chunks(self) -> list[Array[_ScalarT]]: ... - def iterchunks(self) -> Generator[Array[_ScalarT], None, None]: ... - def __iter__(self) -> Iterator[Array[_ScalarT]]: ... + def chunks(self) -> list[Array[_Scalar_CoT]]: ... + def iterchunks(self) -> Generator[Array[_Scalar_CoT], None, None]: ... + def __iter__(self) -> Iterator[Array[_Scalar_CoT]]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], ) -> list[_AsPyType | None]: ... diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index d24b148af86..9ca7cc7871d 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -2,7 +2,7 @@ import datetime as dt from collections.abc import Mapping from decimal import Decimal -from typing import Any, Generic, Iterable, Iterator, Literal, Self, TypeAlias, overload +from typing import Any, Generic, Iterable, Iterator, Literal, Self, overload import numpy as np import pandas as pd @@ -20,7 +20,9 @@ from typing_extensions import TypeVar from .scalar import ExtensionScalar -CSchema: TypeAlias = Any +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) +_DataType_CoT = TypeVar("_DataType_CoT", bound=DataType, covariant=True) class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -47,8 +49,6 @@ class DataType(_Weakrefable): @classmethod def _import_from_c_capsule(cls, schema) -> Self: ... -_AsPyType = TypeVar("_AsPyType") - class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... @@ -115,19 +115,17 @@ class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): @property def scale(self) -> _Scale: ... 
-_DataTypeT = TypeVar("_DataTypeT", bound=DataType) - -class ListType(DataType, Generic[_DataTypeT]): +class ListType(DataType, Generic[_DataType_CoT]): @property - def value_field(self) -> Field[_DataTypeT]: ... + def value_field(self) -> Field[_DataType_CoT]: ... @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataType_CoT: ... -class LargeListType(ListType[_DataTypeT]): ... -class ListViewType(ListType[_DataTypeT]): ... -class LargeListViewType(ListType[_DataTypeT]): ... +class LargeListType(ListType[_DataType_CoT]): ... +class ListViewType(ListType[_DataType_CoT]): ... +class LargeListViewType(ListType[_DataType_CoT]): ... -class FixedSizeListType(ListType[_DataTypeT], Generic[_DataTypeT, _Size]): +class FixedSizeListType(ListType[_DataType_CoT], Generic[_DataType_CoT, _Size]): @property def list_size(self) -> _Size: ... @@ -195,6 +193,8 @@ class RunEndEncodedType(DataType, Generic[_RunEndType, _ValueT]): @property def value_type(self) -> _ValueT: ... +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + class BaseExtensionType(DataType): def __arrow_ext_class__(self) -> type[ExtensionArray]: ... def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... @@ -202,10 +202,7 @@ class BaseExtensionType(DataType): def extension_name(self) -> str: ... @property def storage_type(self) -> DataType: ... - @overload - def wrap_array(self, storage: Array) -> Array: ... - @overload - def wrap_array(self, storage: ChunkedArray) -> ChunkedArray: ... + def wrap_array(self, storage: _StorageT) -> _StorageT: ... class ExtensionType(BaseExtensionType): def __init__(self, storage_type: DataType, extension_name: str) -> None: ... @@ -248,10 +245,9 @@ def ensure_metadata( meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False ) -> KeyValueMetadata | None: ... -_NewDataTypeT = TypeVar("_NewDataTypeT", bound=DataType) _Nullable = TypeVar("_Nullable", bound=Literal[True, False], default=Literal[True]) -class Field(_Weakrefable, Generic[_DataTypeT, _Nullable]): +class Field(_Weakrefable, Generic[_DataType_CoT, _Nullable]): def equals(self, other: Field, check_metadata: bool = False) -> bool: ... def __hash__(self) -> int: ... @property @@ -262,9 +258,9 @@ class Field(_Weakrefable, Generic[_DataTypeT, _Nullable]): def metadata(self) -> dict[bytes, bytes] | None: ... def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ... def remove_metadata(self) -> None: ... - def with_type(self, new_type: _NewDataTypeT) -> Field[_NewDataTypeT]: ... + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... def with_name(self, name: str) -> Self: ... - def with_nullable(self, nullable: _Nullable) -> Field[_DataTypeT, _Nullable]: ... + def with_nullable(self, nullable: _Nullable) -> Field[_DataType_CoT, _Nullable]: ... def flatten(self) -> list[Field]: ... def _export_to_c(self, out_ptr: int) -> None: ... 
@classmethod From 5e1793698463012860311bb11c2591f0bc12e5f9 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 30 Aug 2024 15:25:49 +0800 Subject: [PATCH 058/231] remove Field generic type var _Nullable (#58) * remove Field generic type var _Nullable --- pyarrow-stubs/__lib_pxi/types.pyi | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 9ca7cc7871d..c973cac5169 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -147,7 +147,7 @@ _K = TypeVar("_K", bound=_BasicDataType) class MapType(DataType, Generic[_K, _ValueT, _Ordered]): @property - def key_field(self) -> Field[_K, Literal[False]]: ... + def key_field(self) -> Field[_K]: ... @property def key_type(self) -> _K: ... @property @@ -245,13 +245,11 @@ def ensure_metadata( meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False ) -> KeyValueMetadata | None: ... -_Nullable = TypeVar("_Nullable", bound=Literal[True, False], default=Literal[True]) - -class Field(_Weakrefable, Generic[_DataType_CoT, _Nullable]): +class Field(_Weakrefable, Generic[_DataType_CoT]): def equals(self, other: Field, check_metadata: bool = False) -> bool: ... def __hash__(self) -> int: ... @property - def nullable(self) -> _Nullable: ... + def nullable(self) -> bool: ... @property def name(self) -> str: ... @property @@ -260,7 +258,7 @@ class Field(_Weakrefable, Generic[_DataType_CoT, _Nullable]): def remove_metadata(self) -> None: ... def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... def with_name(self, name: str) -> Self: ... - def with_nullable(self, nullable: _Nullable) -> Field[_DataType_CoT, _Nullable]: ... + def with_nullable(self, nullable: bool) -> Field[_DataType_CoT]: ... def flatten(self) -> list[Field]: ... def _export_to_c(self, out_ptr: int) -> None: ... @classmethod @@ -320,13 +318,8 @@ def unify_schemas( def field(name: SupportArrowSchema) -> Field: ... @overload def field( - name: str, - type: _DataTypeT, -) -> Field[_DataTypeT, Literal[True]]: ... -@overload -def field( - name: str, type: _DataTypeT, nullable: _Nullable, metadata: dict | None = None -) -> Field[_DataTypeT, _Nullable]: ... + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict | None = None +) -> Field[_DataTypeT]: ... def null() -> NullType: ... def bool_() -> BoolType: ... def uint8() -> Uint8Type: ... From e301fde3c13747a5e7d1231f2a3e805cf90ad3ef Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 30 Aug 2024 15:33:35 +0800 Subject: [PATCH 059/231] fix: pa.dictionary and pa.schema annotation (#59) * fix pa.dictionary annotation * fix: schema annotation --- pyarrow-stubs/__lib_pxi/types.pyi | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index c973cac5169..eb6042d42f7 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -396,6 +396,11 @@ def map_(key_type: _K, item_type: _IndexT) -> MapType[_K, _IndexT, Literal[False def map_( key_type: _K, item_type: _IndexT, key_sorted: _Ordered ) -> MapType[_K, _IndexT, _Ordered]: ... +@overload +def dictionary( + index_type: _IndexT, value_type: _ValueT +) -> DictionaryType[_IndexT, _ValueT, Literal[False]]: ... +@overload def dictionary( index_type: _IndexT, value_type: _ValueT, ordered: _Ordered ) -> DictionaryType[_IndexT, _ValueT, _Ordered]: ... 
@@ -570,7 +575,7 @@ def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... @overload def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... def schema( - fields: Iterable[Field | tuple[str, Field]] | Mapping[str, Field], + fields: Iterable[Field] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], metadata: dict[bytes, bytes] | None = None, ) -> Schema: ... def from_numpy_dtype(dtype: np.dtype) -> DataType: ... From 534459aed469c08d0a529ec797650918c3fcb471 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 30 Aug 2024 15:36:33 +0800 Subject: [PATCH 060/231] release new version (#60) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index e2cbc0d415e..5df0ed40a94 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '20240828' + version: '20240830' path: . - sha256: 94683bcd78fcecd7a11e79fd433e5bc498768bce83e286b25a9b50f6f943b83a + sha256: 24c2556903074a52805ac3b09c5b9a860d4c347797b0c1cee4902fb40949c616 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index f530f58fb3a..54133e5020f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20240828" +version = "20240830" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From f7d5ca1858a4496964feef1bce93ec7f17eea968 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Sep 2024 07:12:41 +0800 Subject: [PATCH 061/231] [pre-commit.ci] pre-commit autoupdate (#62) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51a931e980a..bc6a3c31ae7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.2 + rev: v0.6.3 hooks: - id: ruff args: [--fix] From 43b8bc0e4040cf5b5ffb532d1236b28c2e799514 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 3 Sep 2024 15:52:11 +0800 Subject: [PATCH 062/231] release: 2024.9.3 (#63) use new date release format %Y.%m.%d --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 5df0ed40a94..ac44af0b7c6 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '20240830' + version: 2024.9.3 path: . 
- sha256: 24c2556903074a52805ac3b09c5b9a860d4c347797b0c1cee4902fb40949c616 + sha256: 47fcdf5b2b7ce4a312b108a2f2fb17749cdd695ff76dad2830db5d885175e0f8 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 54133e5020f..ff2593f40d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20240830" +version = "2024.9.3" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 343f221a15795e18ca6f8eabff443bd9d4416f3e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 4 Sep 2024 15:24:37 +0800 Subject: [PATCH 063/231] support pyarrow compute funcs (#61) * update compute.pyi * impl Aggregation funcs * impl arithmetic * imit bit-wise functions * imit rounding functions * optimize annotation * impl logarithmic functions * update * impl comparisons funcs * impl logical funcs * impl string predicates and transforms * impl string padding * impl string trimming * impl string splitting and component extraction * impl string joining and slicing * impl Containment tests * impl Categorizations * impl Structural transforms * impl Conversions * impl Temporal component extraction * impl random, Timezone handling * impl Array-wise functions * fix timestamp scalar --- pyarrow-stubs/__lib_pxi/scalar.pyi | 13 +- pyarrow-stubs/__lib_pxi/table.pyi | 7 +- pyarrow-stubs/compute.pyi | 1868 +++++++++++++++++++++++++++- 3 files changed, 1849 insertions(+), 39 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 11887c06aa4..9a56134133f 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -1,4 +1,4 @@ -# mypy: disable-error-code="overload-overlap,misc" +# mypy: disable-error-code="overload-overlap,misc,type-arg" import collections.abc import datetime as dt @@ -281,6 +281,10 @@ def scalar( value: Decimal, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None ) -> Decimal128Scalar: ... @overload +def scalar( + value: dt.datetime, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +) -> TimestampScalar: ... +@overload def scalar( value: dt.date, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None ) -> Date32Scalar: ... @@ -349,6 +353,13 @@ def scalar( memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Decimal128Type]]: ... @overload +def scalar( + value: CollectionValue[dt.datetime], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.TimestampType]]: ... +@overload def scalar( value: CollectionValue[dt.date], *, diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index a9ef5ae4eb5..a84f4a9b9e3 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -1,4 +1,4 @@ -# mypy: disable-error-code="overload-overlap" +# mypy: disable-error-code="overload-overlap,type-arg,misc" import datetime as dt @@ -162,6 +162,11 @@ def chunked_array( type: None = None, ) -> ChunkedArray[scalar.StructScalar]: ... @overload +def chunked_array( + values: NullableIterable[dt.datetime], + type: None = None, +) -> ChunkedArray[scalar.TimestampScalar]: ... 
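# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). The new
# datetime overloads are expected to route plain datetimes to timestamp-typed
# results; the "expected" notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import datetime as dt
import pyarrow as pa

ts = pa.scalar(dt.datetime(2024, 9, 4, 15, 30))
# expected: TimestampScalar (a datetime no longer matches only the date overload)
d = pa.scalar(dt.date(2024, 9, 4))
# expected: Date32Scalar (unchanged behaviour)
# ----------------------------------------------------------------------------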
+@overload def chunked_array( values: NullableIterable[dt.date], type: None = None, diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 593d9f614cc..9679e729408 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1,4 +1,7 @@ -from typing import Literal, Sequence, TypeVar, overload +# mypy: disable-error-code="misc,type-var,var-annotated" +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec +from collections.abc import Callable # Option classes from pyarrow._compute import ArraySortOptions as ArraySortOptions @@ -79,66 +82,1671 @@ from pyarrow._compute import register_aggregate_function as register_aggregate_f from pyarrow._compute import register_scalar_function as register_scalar_function from pyarrow._compute import register_tabular_function as register_tabular_function from pyarrow._compute import register_vector_function as register_vector_function -from pyarrow._stubs_typing import Indices from . import lib +import typing_extensions -def cast( - arr: lib.Array, - target_type: str | lib.DataType, - safe: bool = True, - options: CastOptions | None = None, +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... +def scalar(value: bool | float | str) -> Expression: ... +def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... + +# ============= compute functions ============= +_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) +NumericScalar: TypeAlias = ( + lib.Scalar[lib.Int8Type] + | lib.Scalar[lib.Int16Type] + | lib.Scalar[lib.Int32Type] + | lib.Scalar[lib.Int64Type] + | lib.Scalar[lib.Uint8Type] + | lib.Scalar[lib.Uint16Type] + | lib.Scalar[lib.Uint32Type] + | lib.Scalar[lib.Uint64Type] + | lib.Scalar[lib.Float16Type] + | lib.Scalar[lib.Float32Type] + | lib.Scalar[lib.Float64Type] + | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) +BinaryScalar: TypeAlias = ( + lib.Scalar[lib.BinaryType] + | lib.Scalar[lib.LargeBinaryType] + | lib.Scalar[lib.FixedSizeBinaryType] +) +StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +ListScalar: TypeAlias = ( + lib.ListScalar[_DataTypeT] + | lib.LargeListScalar[_DataTypeT] + | lib.ListViewScalar[_DataTypeT] + | lib.LargeListViewScalar[_DataTypeT] + | lib.FixedSizeListScalar[_DataTypeT, Any] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar + | lib.Time64Scalar + | lib.TimestampScalar + | lib.DurationScalar + | lib.MonthDayNanoIntervalScalar +) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar +_NumericOrTemporalT = TypeVar("_NumericOrTemporalT", bound=NumericOrTemporalScalar) +NumericArray: TypeAlias = lib.NumericArray +_NumericArrayT = TypeVar("_NumericArrayT", bound=lib.NumericArray) +NumericOrDurationArray: TypeAlias = lib.NumericArray | lib.Array[lib.DurationScalar] +_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = lib.NumericArray | lib.Array[TemporalScalar] +_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +FloatScalar: typing_extensions.TypeAlias = ( + lib.Scalar[lib.Float32Type] + | lib.Scalar[lib.Float64Type] 
+ | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: typing_extensions.TypeAlias = ( + lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] + | lib.NumericArray[lib.Decimal128Scalar] + | lib.NumericArray[lib.Decimal256Scalar] +) +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = lib.StringArray | lib.LargeStringArray +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = lib.BinaryArray | lib.LargeBinaryArray +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ( + lib.Date32Array + | lib.Date64Array + | lib.Time32Array + | lib.Time64Array + | lib.TimestampArray + | lib.DurationArray + | lib.MonthDayNanoIntervalArray +) +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) +_ArrayT = TypeVar("_ArrayT", bound=lib.Array) +_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar) +# =============================== 1. Aggregation =============================== + +# ========================= 1.1 functions ========================= + +def all( + array: lib.BooleanScalar | lib.BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... + +any = _clone_signature(all) + +def approximate_median( + array: NumericScalar | lib.NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... +def count( + array: lib.Array, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: ... +) -> lib.Int64Scalar: ... +def count_distinct( + array: lib.Array, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +def first( + array: lib.Array[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... +def first_last( + array: lib.Array, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... def index( data: lib.Array, - value: lib.Scalar, + value, start: int | None = None, end: int | None = None, *, memory_pool: lib.MemoryPool | None = None, -) -> int: ... +) -> lib.Int64Scalar: ... 
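# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). The
# aggregation stubs pin down concrete result scalars; the "expected" notes are
# assumptions based on these stubs, not verified checker output.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 2, None])
n_valid = pc.count(arr)              # expected: lib.Int64Scalar
n_unique = pc.count_distinct(arr)    # expected: lib.Int64Scalar
median = pc.approximate_median(arr)  # expected: lib.DoubleScalar
bounds = pc.first_last(arr)          # expected: lib.StructScalar of first/last
# ----------------------------------------------------------------------------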
-_DataT = TypeVar("_DataT", bound=lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table) +last = _clone_signature(first) +max = _clone_signature(first) +min = _clone_signature(first) +min_max = _clone_signature(first_last) -def take( - data: _DataT, - indices: Indices, +def mean( + array: NumericScalar | lib.NumericArray, + /, *, - boundscheck: bool = True, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar | lib.Decimal128Scalar: ... +def mode( + array: NumericScalar | lib.NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls=True, + min_count=1, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... +def quantile( + array: NumericScalar | lib.NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +def stddev( + array: NumericScalar | lib.NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... +def sum( + array: _NumericScalarT | lib.NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +def tdigest( + array: NumericScalar | lib.NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +def variance( + array: NumericScalar | lib.NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _DataT: ... -def fill_null(values: _DataT, fill_value: lib.Array | lib.ChunkedArray | lib.Scalar) -> _DataT: ... +) -> lib.DoubleScalar: ... + +# ========================= 2. Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= @overload -def top_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch, - k: int, +def abs( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def abs( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... + +abs_checked = _clone_signature(abs) + +@overload +def add( + x: _NumericOrTemporalT, y: _NumericOrTemporalT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrTemporalT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: _NumericOrTemporalArrayT, + /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: ... +) -> _NumericOrTemporalArrayT: ... 
@overload -def top_k_unstable( - values: lib.Table, - k: int, - sort_keys: Sequence[str], +def add( + x: NumericScalar, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> NumericScalar: ... +@overload +def add( + x: TemporalScalar, y: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> TemporalScalar: ... +@overload +def add( + x: NumericOrTemporalArray | NumericOrTemporalScalar, + y: NumericOrTemporalArray | NumericOrTemporalScalar, + /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: ... +) -> NumericOrTemporalArray: ... + +add_checked = _clone_signature(add) + @overload -def bottom_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch, - k: int, +def divide( + dividend: NumericScalar, + divisor: NumericScalar, + /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: ... +) -> NumericScalar: ... @overload -def bottom_k_unstable( - values: lib.Table, - k: int, - sort_keys: Sequence[str], +def divide( + dividend: TemporalScalar, + divisor: TemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> TemporalScalar: ... +@overload +def divide( + dividend: NumericOrTemporalArray | NumericOrTemporalScalar, + divisor: NumericOrTemporalArray | NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> NumericArray: ... + +divide_checked = _clone_signature(divide) + +@overload +def exp( + exponent: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatArray | lib.DoubleArray: ... +@overload +def exp( + exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... + +multiply = _clone_signature(add) +multiply_checked = _clone_signature(multiply) + +@overload +def negate( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def negate( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... + +negate_checked = _clone_signature(negate) + +@overload +def power( + base: _NumericScalarT, + exponent: _NumericScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def power( + base: NumericScalar, exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> NumericScalar: ... +@overload +def power( + base: _NumericArrayT, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar | NumericArray, + exponent: NumericScalar | NumericArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> NumericArray: ... + +power_checked = _clone_signature(power) + +@overload +def sign( + x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] +): ... +@overload +def sign( + x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... +@overload +def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... +@overload +def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... 
+ +sqrt_checked = _clone_signature(sqrt) + +subtract = _clone_signature(add) +subtract_checked = _clone_signature(subtract) + +# ========================= 2.1 Bit-wise functions ========================= +@overload +def bit_wise_and( + x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, + y: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: NumericScalar, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> NumericScalar: ... +@overload +def bit_wise_and( + x: NumericArray | NumericScalar, + y: NumericArray | NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> NumericArray: ... +@overload +def bit_wise_not( + x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_not( + x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... + +bit_wise_or = _clone_signature(bit_wise_and) +bit_wise_xor = _clone_signature(bit_wise_and) +shift_left = _clone_signature(bit_wise_and) +shift_left_checked = _clone_signature(bit_wise_and) +shift_right = _clone_signature(bit_wise_and) +shift_right_checked = _clone_signature(bit_wise_and) + +# ========================= 2.2 Rounding functions ========================= +@overload +def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... +@overload +def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... + +floor = _clone_signature(ceil) + +@overload +def round( + x: _NumericScalarT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round( + x: _NumericArrayT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_to_multiple( + x: _NumericScalarT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_to_multiple( + x: _NumericArrayT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... 
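# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). The
# arithmetic and rounding overloads are written so that same-typed array
# inputs keep their concrete type; the "expected" notes are assumptions based
# on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

ints = pa.array([1, 2, 3])
doubles = pa.array([1.5, 2.25, 3.125])
total = pc.add(ints, ints)              # expected: the input array type is preserved
roots = pc.sqrt(ints)                   # expected: a float-typed array (FloatArray)
rounded = pc.round(doubles, ndigits=1)  # expected: the DoubleArray type is preserved
# ----------------------------------------------------------------------------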
+@overload +def round_binary( + x: _NumericScalarT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_binary( + x: _NumericScalarT, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[_NumericScalarT]: ... +@overload +def round_binary( + x: _NumericArrayT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... + +trunc = _clone_signature(ceil) + +# ========================= 2.3 Logarithmic functions ========================= +@overload +def ln( + x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def ln( + x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... + +ln_checked = _clone_signature(ln) +log10 = _clone_signature(ln) +log10_checked = _clone_signature(ln) +log1p = _clone_signature(ln) +log1p_checked = _clone_signature(ln) +log2 = _clone_signature(ln) +log2_checked = _clone_signature(ln) + +@overload +def logb( + x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def logb( + x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatScalar | FloatArray, + b: FloatScalar | FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +def logb( + x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... + +logb_checked = _clone_signature(logb) + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +acos_checked = _clone_signature(ln) +asin = _clone_signature(ln) +asin_checked = _clone_signature(ln) +atan = _clone_signature(ln) +cos = _clone_signature(ln) +cos_checked = _clone_signature(ln) +sin = _clone_signature(ln) +sin_checked = _clone_signature(ln) +tan = _clone_signature(ln) +tan_checked = _clone_signature(ln) + +@overload +def atan2( + y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... 
+@overload +def atan2( + y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatScalar | FloatArray, + x: FloatScalar | FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +def atan2( + y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... + +# ========================= 2.5 Comparisons functions ========================= +@overload +def equal( + x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def equal( + x: lib.Scalar | lib.Array, + y: lib.Scalar | lib.Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +greater = _clone_signature(equal) +greater_equal = _clone_signature(equal) +less = _clone_signature(equal) +less_equal = _clone_signature(equal) +not_equal = _clone_signature(equal) + +@overload +def max_element_wise( + *args: _ScalarT, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... +@overload +def max_element_wise( + *args: _ArrayT, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... + +min_element_wise = _clone_signature(equal) + +# ========================= 2.6 Logical functions ========================= +@overload +def and_( + x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def and_( + x: lib.BooleanScalar | lib.BooleanArray, + y: lib.BooleanScalar | lib.BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +and_kleene = _clone_signature(and_) +and_not = _clone_signature(and_) +and_not_kleene = _clone_signature(and_) +or_ = _clone_signature(and_) +or_kleene = _clone_signature(and_) +xor = _clone_signature(and_) + +@overload +def invert( + x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def invert( + x: lib.BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +# ========================= 2.10 String predicates ========================= +@overload +def ascii_is_alnum( + strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def ascii_is_alnum( + strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... 
+ +ascii_is_alpha = _clone_signature(ascii_is_alnum) +ascii_is_decimal = _clone_signature(ascii_is_alnum) +ascii_is_lower = _clone_signature(ascii_is_alnum) +ascii_is_printable = _clone_signature(ascii_is_alnum) +ascii_is_space = _clone_signature(ascii_is_alnum) +ascii_is_upper = _clone_signature(ascii_is_alnum) +utf8_is_alnum = _clone_signature(ascii_is_alnum) +utf8_is_alpha = _clone_signature(ascii_is_alnum) +utf8_is_decimal = _clone_signature(ascii_is_alnum) +utf8_is_digit = _clone_signature(ascii_is_alnum) +utf8_is_lower = _clone_signature(ascii_is_alnum) +utf8_is_numeric = _clone_signature(ascii_is_alnum) +utf8_is_printable = _clone_signature(ascii_is_alnum) +utf8_is_space = _clone_signature(ascii_is_alnum) +utf8_is_upper = _clone_signature(ascii_is_alnum) +ascii_is_title = _clone_signature(ascii_is_alnum) +utf8_is_title = _clone_signature(ascii_is_alnum) +string_is_ascii = _clone_signature(ascii_is_alnum) + +# ========================= 2.11 String transforms ========================= +@overload +def ascii_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def ascii_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... + +ascii_lower = _clone_signature(ascii_capitalize) +ascii_reverse = _clone_signature(ascii_capitalize) +ascii_swapcase = _clone_signature(ascii_capitalize) +ascii_title = _clone_signature(ascii_capitalize) +ascii_upper = _clone_signature(ascii_capitalize) + +@overload +def binary_length( + strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar: ... +@overload +def binary_length( + strings: lib.LargeBinaryScalar | lib.LargeStringScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def binary_length( + strings: lib.BinaryArray | lib.StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Array: ... +@overload +def binary_length( + strings: lib.LargeBinaryArray | lib.LargeStringArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: int, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[_StringOrBinaryScalarT]: ... +@overload +def binary_repeat( + strings: _StringOrBinaryArrayT, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_replace_slice( + strings: _StringOrBinaryScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_replace_slice( + strings: _StringOrBinaryArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_reverse( + strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT: ... +@overload +def binary_reverse( + strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryArrayT: ... 
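# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). String
# predicates map to boolean results while transforms keep the input string
# type; the "expected" notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

names = pa.array(["arrow", "STUBS"])
upper_mask = pc.utf8_is_upper(names)  # expected: lib.BooleanArray
shouted = pc.ascii_upper(names)       # expected: the StringArray type is preserved
n_bytes = pc.binary_length(names)     # expected: lib.Int32Array for string input
# ----------------------------------------------------------------------------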
+@overload +def replace_substring( + strings: _StringScalarT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def replace_substring( + strings: _StringArrayT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... + +replace_substring_regex = _clone_signature(replace_substring) + +@overload +def utf8_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def utf8_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def utf8_length( + strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar: ... +@overload +def utf8_length( + strings: lib.LargeStringScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def utf8_length( + strings: lib.StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Array: ... +@overload +def utf8_length( + strings: lib.LargeStringArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... + +utf8_lower = _clone_signature(utf8_capitalize) + +@overload +def utf8_replace_slice( + strings: _StringScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_replace_slice( + strings: _StringArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... + +utf8_reverse = _clone_signature(utf8_capitalize) +utf8_swapcase = _clone_signature(utf8_capitalize) +utf8_title = _clone_signature(utf8_capitalize) +utf8_upper = _clone_signature(utf8_capitalize) + +# ========================= 2.12 String padding ========================= +@overload +def ascii_center( + strings: _StringScalarT, + /, + width: int, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_center( + strings: _StringArrayT, + /, + width: int, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... + +ascii_lpad = _clone_signature(ascii_center) +ascii_rpad = _clone_signature(ascii_center) +utf8_center = _clone_signature(ascii_center) +utf8_lpad = _clone_signature(ascii_center) +utf8_rpad = _clone_signature(ascii_center) + +# ========================= 2.13 String trimming ========================= +@overload +def ascii_ltrim( + strings: _StringScalarT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim( + strings: _StringArrayT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... 
+ +ascii_rtrim = _clone_signature(ascii_ltrim) +ascii_trim = _clone_signature(ascii_ltrim) +utf8_ltrim = _clone_signature(ascii_ltrim) +utf8_rtrim = _clone_signature(ascii_ltrim) +utf8_trim = _clone_signature(ascii_ltrim) + +@overload +def ascii_ltrim_whitespace( + strings: _StringScalarT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim_whitespace( + strings: _StringArrayT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) + +# ========================= 2.14 String splitting ========================= +@overload +def ascii_split_whitespace( + strings: _StringScalarT, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringScalarT]: ... +@overload +def ascii_split_whitespace( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def split_pattern( + strings: _StringOrBinaryScalarT, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringOrBinaryScalarT]: ... +@overload +def split_pattern( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... + +split_pattern_regex = _clone_signature(split_pattern) +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) + +# ========================= 2.15 String component extraction ========================= +@overload +def extract_regex( + strings: StringOrBinaryScalar, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def extract_regex( + strings: StringOrBinaryArray, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: ... +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... 
+@overload +def binary_join_element_wise( + *strings: _StringOrBinaryArrayT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... + +# ========================= 2.17 String Slicing ========================= +@overload +def binary_slice( + strings: _BinaryScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT: ... +@overload +def binary_slice( + strings: _BinaryArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryArrayT: ... +@overload +def utf8_slice_codeunits( + strings: _StringScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_slice_codeunits( + strings: _StringArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... + +# ========================= 2.18 Containment tests ========================= +@overload +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def count_substring( + strings: lib.LargeStringScalar | lib.LargeBinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def count_substring( + strings: lib.StringArray | lib.BinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def count_substring( + strings: lib.LargeStringArray | lib.LargeBinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... + +count_substring_regex = _clone_signature(count_substring) + +@overload +def ends_with( + strings: StringScalar | BinaryScalar, + /, + pattern: str, *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: ... +) -> lib.BooleanScalar: ... +@overload +def ends_with( + strings: StringArray | BinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +find_substring = _clone_signature(count_substring) +find_substring_regex = _clone_signature(count_substring) + +@overload +def index_in( + values: lib.Scalar, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... 
+@overload +def index_in( + values: lib.Array, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def is_in( + values: lib.Scalar, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_in( + values: lib.Array, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +match_like = _clone_signature(ends_with) +match_substring = _clone_signature(ends_with) +match_substring_regex = _clone_signature(ends_with) +starts_with = _clone_signature(ends_with) + +# ========================= 2.19 Categorizations ========================= +@overload +def is_finite( + values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_finite( + values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... + +is_inf = _clone_signature(is_finite) +is_nan = _clone_signature(is_finite) + +@overload +def is_null( + values: lib.Scalar, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_null( + values: lib.Array, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_valid( + values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_valid( + values: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... + +true_unless_null = _clone_signature(is_valid) + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): ... +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: ... +def if_else(cond, left, right, /, *, memory_pool: lib.MemoryPool | None = None): ... + +# ========================= 2.21 Structural transforms ========================= + +@overload +def list_value_length( + lists: lib.ListArray | lib.ListViewArray | lib.FixedSizeListArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def list_value_length( + lists: lib.LargeListArray | lib.LargeListViewArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def make_struct( + *args: lib.Scalar, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def make_struct( + *args: lib.Array, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... 
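# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). Containment
# tests yield booleans and make_struct wraps its column arguments; the
# "expected" notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

vals = pa.array([1, 2, 5])
mask = pc.is_in(vals, value_set=pa.array([1, 5]))  # expected: lib.BooleanArray
record = pc.make_struct(vals, pa.array(["a", "b", "c"]), field_names=["num", "tag"])
# expected: lib.StructArray (array inputs select the StructArray overload)
# ----------------------------------------------------------------------------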
+ +# ========================= 2.22 Conversions ========================= +@overload +def ceil_temporal( + timestamps: _TemporalScalarT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT: ... +@overload +def ceil_temporal( + timestamps: _TemporalArrayT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalArrayT: ... + +floor_temporal = _clone_signature(ceil_temporal) +round_temporal = _clone_signature(ceil_temporal) + +@overload +def cast( + arr: lib.Scalar, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT]: ... +@overload +def cast( + arr: lib.Array, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def strftime( + timestamps: TemporalScalar, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar: ... +@overload +def strftime( + timestamps: TemporalArray, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringArray: ... +@overload +def strptime( + strings: StringScalar, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def strptime( + strings: StringArray, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... + +# ========================= 2.23 Temporal component extraction ========================= +@overload +def day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +@overload +def day_of_week( + values: TemporalScalar, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def day_of_week( + values: TemporalArray, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... 
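# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). The cast()
# overloads thread the target DataType through to the result, and the
# strptime()/strftime() pair round-trips between strings and timestamps; the
# "expected" notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

texts = pa.array(["2024-09-04", "2024-09-05"])
stamps = pc.strptime(texts, format="%Y-%m-%d", unit="s")  # expected: lib.TimestampArray
back = pc.strftime(stamps, format="%d.%m.%Y")             # expected: lib.StringArray
as_int = pc.cast(pa.array([1.0, 2.0]), pa.int64())
# expected: lib.Array[lib.Scalar[Int64Type]] -- the target type is carried through
# ----------------------------------------------------------------------------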
+ +day_of_year = _clone_signature(day) + +@overload +def hour( + values: lib.TimestampScalar | lib.Time32Scalar | lib.Time64Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def hour( + values: lib.TimestampArray | lib.Time32Array | lib.Time64Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def is_dst( + values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_dst( + values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def iso_week( + values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def iso_week( + values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... + +iso_year = _clone_signature(iso_week) + +@overload +def is_leap_year( + values: lib.TimestampScalar | lib.Date32Scalar | lib.Date64Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_leap_year( + values: lib.TimestampArray | lib.Date32Array | lib.Date64Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... + +microsecond = _clone_signature(iso_week) +millisecond = _clone_signature(iso_week) +minute = _clone_signature(iso_week) +month = _clone_signature(day_of_week) +nanosecond = _clone_signature(hour) +quarter = _clone_signature(day_of_week) +second = _clone_signature(hour) +subsecond = _clone_signature(hour) +us_week = _clone_signature(iso_week) +us_year = _clone_signature(iso_week) +year = _clone_signature(iso_week) + +@overload +def week( + values: lib.TimestampScalar, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def week( + values: lib.TimestampArray, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def year_month_day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar: ... +@overload +def year_month_day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... +def days_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array: ... + +hours_between = _clone_signature(days_between) +microseconds_between = _clone_signature(days_between) +milliseconds_between = _clone_signature(days_between) +minutes_between = _clone_signature(days_between) + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ... +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... 
+ +nanoseconds_between = _clone_signature(days_between) +quarters_between = _clone_signature(days_between) +seconds_between = _clone_signature(days_between) + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: ... + +years_between = _clone_signature(days_between) + +# ========================= 2.25 Timezone handling ========================= +@overload +def assume_timezone( + timestamps: lib.TimestampScalar, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def assume_timezone( + timestamps: lib.TimestampArray, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def local_timestamp( + timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar: ... +@overload +def local_timestamp( + timestamps: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampArray: ... + +# ========================= 2.26 Random number generation ========================= def random( n: int, *, @@ -146,5 +1754,191 @@ def random( options: RandomOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleArray: ... -def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... -def scalar(value: bool | float | str) -> Expression: ... + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= + +def cumulative_sum( + values: _NumericArrayT, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... + +cumulative_sum_checked = _clone_signature(cumulative_sum) +cumulative_prod = _clone_signature(cumulative_sum) +cumulative_prod_checked = _clone_signature(cumulative_sum) +cumulative_max = _clone_signature(cumulative_sum) +cumulative_min = _clone_signature(cumulative_sum) +cumulative_mean = _clone_signature(cumulative_sum) + +# ========================= 3.2 Associative transforms ========================= + +def dictionary_encode( + array: _ScalarOrArrayT, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT: ... +def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +def value_counts( + array: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... + +# ========================= 3.3 Selections ========================= + +def array_filter( + array: _ArrayT, + selection_filter: list[bool] | list[bool | None] | lib.BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... 
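# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). The vector
# functions above keep the concrete array type of their input; the "expected"
# notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

nums = pa.array([3, 1, 2, None])
running = pc.cumulative_sum(nums)  # expected: the Int64Array type is preserved
distinct = pc.unique(nums)         # expected: same array type as the input
counts = pc.value_counts(nums)     # expected: lib.StructArray of values/counts
kept = pc.array_filter(nums, pa.array([True, False, True, None]))
# expected: same array type as `nums`
# ----------------------------------------------------------------------------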
+def array_take( + array: _ArrayT, + indices: list[int] | list[int | None] | lib.Int16Array | lib.Int32Array | lib.Int64Array, + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... + +filter = array_filter +take = array_take + +# ========================= 3.4 Containment tests ========================= + +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... + +# ========================= 3.5 Sorts and partitions ========================= +def array_sort_indices( + array: lib.Array, + /, + order: Literal["ascending", "descending"] = "ascending", + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +def partition_nth_indices( + array: lib.Array, + /, + pivot: int, + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +def rank( + input: lib.Array, + /, + sort_keys: Literal["ascending", "descending"] = "ascending", + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +def select_k_unstable( + input: lib.Array, + /, + k: int, + sort_keys: list[tuple[str, Literal["ascending", "descending"]]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +def sort_indices( + array: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + /, + order: Literal["ascending", "descending"] = "ascending", + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... + +# ========================= 3.6 Structural transforms ========================= +def list_element( + lists, index, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.ListArray: ... +def list_flatten( + lists, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray: ... +def list_parent_indices( + lists, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +def list_slice( + lists, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray: ... +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... 
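# ----------------------------------------------------------------------------
# Illustrative sketch (supplementary; not part of the diff above). Sort
# helpers produce index arrays while take()/array_take() preserve the input
# array type; the "expected" notes are assumptions based on these stubs.
# ----------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.compute as pc

nums = pa.array([3, 1, 2])
order = pc.sort_indices(nums)                # expected: lib.UInt64Array
first_two = pc.take(nums, pa.array([1, 2]))  # expected: same array type as `nums`
# ----------------------------------------------------------------------------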
+def replace_with_mask( + values, + mask: list[bool] | list[bool | None] | lib.BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +): ... + +# ========================= 3.7 Pairwise functions ========================= +def pairwise_diff( + input: _NumericOrTemporalArrayT, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... + +pairwise_diff_checked = _clone_signature(pairwise_diff) From 055ea6d2bc83611c0c31a948aeda65a3a2403b09 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 4 Sep 2024 15:35:38 +0800 Subject: [PATCH 064/231] support build array with list of scalar (#64) --- pyarrow-stubs/__lib_pxi/array.pyi | 12 +++++++++++- pyarrow-stubs/__lib_pxi/table.pyi | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 5b4fd38f78c..5888e837512 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1,4 +1,4 @@ -# mypy: disable-error-code="overload-overlap,misc" +# mypy: disable-error-code="overload-overlap,misc,type-arg" import datetime as dt @@ -173,6 +173,16 @@ def array( memory_pool: MemoryPool | None = None, ) -> ListArray: ... @overload +def array( + values: NullableIterable[_Scalar_CoT], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[_Scalar_CoT]: ... +@overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: _DataTypeT, diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index a84f4a9b9e3..870552d44f9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -202,6 +202,11 @@ def chunked_array( type: None = None, ) -> ChunkedArray[scalar.ListScalar]: ... @overload +def chunked_array( + values: NullableIterable[_Scalar_CoT], + type: None = None, +) -> ChunkedArray[_Scalar_CoT]: ... +@overload def chunked_array( values: Iterable | SupportArrowStream | SupportArrowArray, type: _DataTypeT, From 1f1b2dffcb6ee75c0350adab15b517ae01b24c0f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 4 Sep 2024 15:37:21 +0800 Subject: [PATCH 065/231] release 2024.9.4 (#65) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index ac44af0b7c6..49e0873565d 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: 2024.9.3 + version: 2024.9.4 path: . 
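The overload added in this patch is meant to let a plain list of typed scalars infer the matching array type; a minimal sketch, assuming a recent pyarrow:

import pyarrow as pa

scalars = [pa.scalar(1), pa.scalar(2), None]
arr = pa.array(scalars)             # checkers should resolve Array[Int64Scalar]
chunked = pa.chunked_array([arr])   # and ChunkedArray[Int64Scalar]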
- sha256: 47fcdf5b2b7ce4a312b108a2f2fb17749cdd695ff76dad2830db5d885175e0f8 + sha256: 91210263f15bd6586505998148944929b2ebb5a1fd3b9754ffcacf35ebbd61e3 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index ff2593f40d9..2b055700290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "2024.9.3" +version = "2024.9.4" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 79b39a5870c7d9046234691584ebf25004034922 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 4 Sep 2024 17:12:54 +0800 Subject: [PATCH 066/231] Version follows the version of pyarrow (#66) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 49e0873565d..a4b6f794ad3 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: 2024.9.4 + version: '17.0' path: . - sha256: 91210263f15bd6586505998148944929b2ebb5a1fd3b9754ffcacf35ebbd61e3 + sha256: 3eb6559480f006ce275008e5cb7c66dbbe02d1f06ee3d43db2b848f8d729f9a7 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 2b055700290..6918ea72343 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "2024.9.4" +version = "17.0" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 69ee2a1684448eaef338d239b2ccf8b431713ea2 Mon Sep 17 00:00:00 2001 From: fvankrieken Date: Wed, 4 Sep 2024 20:06:20 -0400 Subject: [PATCH 067/231] import parquet.core into parquet __init__.py (#67) Update __init__.pyi --- pyarrow-stubs/parquet/__init__.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/parquet/__init__.pyi b/pyarrow-stubs/parquet/__init__.pyi index e69de29bb2d..4ef88705809 100644 --- a/pyarrow-stubs/parquet/__init__.pyi +++ b/pyarrow-stubs/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * # noqa From 2aad9048662e6e13eddc65b9084db66eb03e0436 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 5 Sep 2024 08:10:07 +0800 Subject: [PATCH 068/231] release 17.1 (#69) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index a4b6f794ad3..ad9e11e7b83 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.0' + version: '17.1' path: . 
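With the re-export in place, the usual parquet entry points resolve through the package stub; a short sketch (file names are placeholders):

import pyarrow.parquet as pq

table = pq.read_table("data.parquet")
pq.write_table(table, "copy.parquet")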
- sha256: 3eb6559480f006ce275008e5cb7c66dbbe02d1f06ee3d43db2b848f8d729f9a7 + sha256: c582e2fa0461b080db71e3b934fab51048a58192a424f5d0e7c3db0b32541141 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 6918ea72343..c930a9faffe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.0" +version = "17.1" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 1b4b5f23a29deef9bec92a6bc9cc41106ab99ed6 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 6 Sep 2024 09:07:04 +0800 Subject: [PATCH 069/231] fix: add missing submodule benchmark, csv and cuda (#71) --- pyarrow-stubs/benchmark.pyi | 3 +++ pyarrow-stubs/csv.pyi | 27 +++++++++++++++++++++++++++ pyarrow-stubs/cuda.pyi | 25 +++++++++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 pyarrow-stubs/benchmark.pyi create mode 100644 pyarrow-stubs/csv.pyi create mode 100644 pyarrow-stubs/cuda.pyi diff --git a/pyarrow-stubs/benchmark.pyi b/pyarrow-stubs/benchmark.pyi new file mode 100644 index 00000000000..048973301dc --- /dev/null +++ b/pyarrow-stubs/benchmark.pyi @@ -0,0 +1,3 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull + +__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/pyarrow-stubs/csv.pyi b/pyarrow-stubs/csv.pyi new file mode 100644 index 00000000000..510229d7e72 --- /dev/null +++ b/pyarrow-stubs/csv.pyi @@ -0,0 +1,27 @@ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/pyarrow-stubs/cuda.pyi b/pyarrow-stubs/cuda.pyi new file mode 100644 index 00000000000..e11baf7d4e7 --- /dev/null +++ b/pyarrow-stubs/cuda.pyi @@ -0,0 +1,25 @@ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] From be1a8a5949a63ba7059748fc908581c5900c0cbb Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 6 Sep 2024 09:12:08 +0800 Subject: [PATCH 070/231] release 17.2 (#72) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index ad9e11e7b83..44c77413767 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.1' + version: '17.2' path: . 
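A small usage sketch for the csv stub added here (file names are placeholders):

import pyarrow.csv as pacsv

opts = pacsv.ReadOptions(skip_rows=1, column_names=["a", "b"])
table = pacsv.read_csv("data.csv", read_options=opts)
pacsv.write_csv(table, "out.csv")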
- sha256: c582e2fa0461b080db71e3b934fab51048a58192a424f5d0e7c3db0b32541141 + sha256: 503cfd405c481b34184803ae46292fe5d0bf899941890f3fd074a44f96fcec40 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index c930a9faffe..04399dacf8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.1" +version = "17.2" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 0156025b8f592b3daaec42cff57e2cab7f3b8aca Mon Sep 17 00:00:00 2001 From: Ilia Ablamonov Date: Mon, 9 Sep 2024 14:34:06 +0200 Subject: [PATCH 071/231] fix: from_pylist covariance (#73) --- pyarrow-stubs/__lib_pxi/table.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 870552d44f9..35b1f48b75e 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -12,6 +12,7 @@ from typing import ( Literal, Mapping, Self, + Sequence, TypeAlias, TypeVar, overload, @@ -411,7 +412,7 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): @classmethod def from_pylist( cls, - mapping: list[Mapping[str, Any]], + mapping: Sequence[Mapping[str, Any]], schema: Schema | None = None, metadata: Mapping | None = None, ) -> Self: ... From 834881c365dbff0ee22f64c2fcddb24b2d3386e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 06:40:04 +0800 Subject: [PATCH 072/231] [pre-commit.ci] pre-commit autoupdate (#74) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc6a3c31ae7..90bda4c75ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.3 + rev: v0.6.4 hooks: - id: ruff args: [--fix] From 6f24e8f7156a6fee0f5033eec1a4c03ac0ee9d4a Mon Sep 17 00:00:00 2001 From: Eugene Toder Date: Tue, 10 Sep 2024 21:37:02 -0400 Subject: [PATCH 073/231] Fix return type for middleware factory's start_call (#75) It can return None if middleware is not needed for a given call. --- pyarrow-stubs/_flight.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index ed5b38b619d..262715c50f4 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -344,7 +344,7 @@ class CallInfo(NamedTuple): method: FlightMethod class ClientMiddlewareFactory(_Weakrefable): - def start_call(self, info: CallInfo) -> ClientMiddleware: ... + def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... class ClientMiddleware(_Weakrefable): def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... @@ -354,7 +354,7 @@ class ClientMiddleware(_Weakrefable): class ServerMiddlewareFactory(_Weakrefable): def start_call( self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> ServerMiddleware: ... + ) -> ServerMiddleware | None: ... class TracingServerMiddlewareFactory(ServerMiddlewareFactory): ... 
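Two short sketches of what the fixes above allow, using illustrative data; the first relies on the covariance fix for from_pylist, the second on the optional middleware return:

import pyarrow as pa
from pyarrow import flight

# A list[dict[str, int]] now satisfies Sequence[Mapping[str, Any]] without
# tripping list invariance.
rows: list[dict[str, int]] = [{"x": 1}, {"x": 2}]
table = pa.Table.from_pylist(rows)

class OptionalMiddleware(flight.ClientMiddlewareFactory):
    def start_call(self, info):
        # Returning None is valid when no middleware is needed for this call.
        return None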
From c9a85b8841aeb561a2e2fc32f2bee78ca1c2f6de Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 11 Sep 2024 09:49:54 +0800 Subject: [PATCH 074/231] release 17.3 (#76) --- pixi.lock | 42 +++++++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pixi.lock b/pixi.lock index 44c77413767..4e0003c0ef2 100644 --- a/pixi.lock +++ b/pixi.lock @@ -278,12 +278,12 @@ packages: sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 requires_dist: - six>=1.12.0 - - typing ; python_version < '3.5' - - astroid<2,>=1 ; python_version < '3' and extra == 'astroid' - - astroid<4,>=2 ; python_version >= '3' and extra == 'astroid' + - typing ; python_full_version < '3.5' + - astroid<2,>=1 ; python_full_version < '3' and extra == 'astroid' + - astroid<4,>=2 ; python_full_version >= '3' and extra == 'astroid' - pytest ; extra == 'test' - - astroid<2,>=1 ; python_version < '3' and extra == 'test' - - astroid<4,>=2 ; python_version >= '3' and extra == 'test' + - astroid<2,>=1 ; python_full_version < '3' and extra == 'test' + - astroid<4,>=2 ; python_full_version >= '3' and extra == 'test' - kind: conda name: bzip2 version: 1.0.8 @@ -434,7 +434,7 @@ packages: - coverage ; extra == 'tests' - coverage-enable-subprocess ; extra == 'tests' - littleutils ; extra == 'tests' - - rich ; python_version >= '3.11' and extra == 'tests' + - rich ; python_full_version >= '3.11' and extra == 'tests' requires_python: '>=3.5' - kind: pypi name: filelock @@ -454,7 +454,7 @@ packages: - pytest-timeout>=2.2 ; extra == 'testing' - pytest>=7.4.3 ; extra == 'testing' - virtualenv>=20.26.2 ; extra == 'testing' - - typing-extensions>=4.8 ; python_version < '3.11' and extra == 'typing' + - typing-extensions>=4.8 ; python_full_version < '3.11' and extra == 'typing' requires_python: '>=3.8' - kind: pypi name: hatchling @@ -465,7 +465,7 @@ packages: - packaging>=23.2 - pathspec>=0.10.1 - pluggy>=1.0.0 - - tomli>=1.2.2 ; python_version < '3.11' + - tomli>=1.2.2 ; python_full_version < '3.11' - trove-classifiers requires_python: '>=3.8' - kind: pypi @@ -489,9 +489,9 @@ packages: - pygments>=2.4.0 - stack-data - traitlets>=5.13.0 - - exceptiongroup ; python_version < '3.11' - - typing-extensions>=4.6 ; python_version < '3.12' - - pexpect>4.3 ; sys_platform != 'win32' and sys_platform != 'emscripten' + - exceptiongroup ; python_full_version < '3.11' + - typing-extensions>=4.6 ; python_full_version < '3.12' + - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' - colorama ; sys_platform == 'win32' - ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole] ; extra == 'all' - ipython[test,test-extra] ; extra == 'all' @@ -507,7 +507,7 @@ packages: - sphinx>=1.3 ; extra == 'doc' - sphinxcontrib-jquery ; extra == 'doc' - typing-extensions ; extra == 'doc' - - tomli ; python_version < '3.11' and extra == 'doc' + - tomli ; python_full_version < '3.11' and extra == 'doc' - ipykernel ; extra == 'kernel' - matplotlib ; extra == 'matplotlib' - nbconvert ; extra == 'nbconvert' @@ -938,7 +938,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -952,7 +952,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; 
python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -966,7 +966,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -980,7 +980,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.2' + version: '17.3' path: . - sha256: 503cfd405c481b34184803ae46292fe5d0bf899941890f3fd074a44f96fcec40 + sha256: 0e4733249cd6fd32b3b463efcb84c9bc647f78b612c275150dbe6480ced52fa8 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' @@ -1892,7 +1892,7 @@ packages: requires_dist: - distlib<1,>=0.3.7 - filelock<4,>=3.12.2 - - importlib-metadata>=6.6 ; python_version < '3.8' + - importlib-metadata>=6.6 ; python_full_version < '3.8' - platformdirs<5,>=3.9.1 - furo>=2023.7.26 ; extra == 'docs' - proselint>=0.13 ; extra == 'docs' @@ -1906,7 +1906,7 @@ packages: - flaky>=3.7 ; extra == 'test' - packaging>=23.1 ; extra == 'test' - pytest-env>=0.8.2 ; extra == 'test' - - pytest-freezer>=0.4.8 ; (platform_python_implementation == 'PyPy' or (platform_python_implementation == 'CPython' and sys_platform == 'win32' and python_version >= '3.13')) and extra == 'test' + - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') - pytest-mock>=3.11.1 ; extra == 'test' - pytest-randomly>=3.12 ; extra == 'test' - pytest-timeout>=2.1 ; extra == 'test' @@ -1936,7 +1936,7 @@ packages: url: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 requires_dist: - - backports-functools-lru-cache>=1.2.1 ; python_version < '3.2' + - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' - kind: conda name: xz version: 5.2.6 diff --git a/pyproject.toml b/pyproject.toml index 04399dacf8d..1808106883c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.2" +version = "17.3" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 8996deb7d6d865b23b0658075f5efba905a4f1c0 Mon Sep 17 00:00:00 2001 From: Mathias Beguin Date: Thu, 12 Sep 2024 03:51:26 +0200 Subject: [PATCH 075/231] fix: add missing return type in FlightDescriptor static methods (#80) --- pyarrow-stubs/_flight.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 262715c50f4..d52ecfd7957 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -103,9 +103,9 @@ class FlightMethod(enum.Enum): class FlightDescriptor(_Weakrefable): @staticmethod - def for_path(*path: str | bytes): ... + def for_path(*path: str | bytes) -> FlightDescriptor: ... 
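Both constructors now advertise their return type; a minimal sketch:

from pyarrow import flight

by_path = flight.FlightDescriptor.for_path("datasets", "example")    # FlightDescriptor
by_command = flight.FlightDescriptor.for_command(b"SELECT 1")         # FlightDescriptor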
@staticmethod - def for_command(command: str | bytes): ... + def for_command(command: str | bytes) -> FlightDescriptor: ... @property def descriptor_type(self) -> DescriptorType: ... @property From 0e0ea783630d9275f87d7341a35b7c93f007f9f5 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 12 Sep 2024 10:04:24 +0800 Subject: [PATCH 076/231] Support Tabular filter with Expression (#81) support Tabular filter with Expression --- pyarrow-stubs/__lib_pxi/table.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 35b1f48b75e..684b8f2fabd 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -32,6 +32,7 @@ from pyarrow._stubs_typing import ( SupportArrowDeviceArray, SupportArrowStream, ) +from pyarrow.compute import Expression from pyarrow.interchange.dataframe import _PyArrowDataFrame from pyarrow.lib import Field, MemoryPool, MonthDayNano, Schema @@ -430,7 +431,7 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def sort_by(self, sorting: Order | list[tuple[str, Order]], **kwargs) -> Self: ... def take(self, indices: Indices) -> Self: ... def filter( - self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop" + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" ) -> Self: ... def to_pydict(self) -> dict[str, list]: ... def to_pylist(self) -> list[dict[str, Any]]: ... From 770b0440ed001100a3d80bc030044dc7413a6e69 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 12 Sep 2024 11:04:22 +0800 Subject: [PATCH 077/231] Support compute functions to accept Expression as parameter (#82) --- pyarrow-stubs/compute.pyi | 637 +++++++++++++++++++++++++++++++++++++- 1 file changed, 630 insertions(+), 7 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 9679e729408..7ea12446b51 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -346,6 +346,8 @@ def abs( def abs( x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _NumericOrDurationArrayT: ... +@overload +def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... abs_checked = _clone_signature(abs) @@ -377,6 +379,10 @@ def add( *, memory_pool: lib.MemoryPool | None = None, ) -> NumericOrTemporalArray: ... +@overload +def add( + x: Expression | Any, y: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... add_checked = _clone_signature(add) @@ -404,6 +410,14 @@ def divide( *, memory_pool: lib.MemoryPool | None = None, ) -> NumericArray: ... +@overload +def divide( + dividend: Expression | Any, + divisor: Expression | Any, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... divide_checked = _clone_signature(divide) @@ -415,6 +429,8 @@ def exp( def exp( exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... multiply = _clone_signature(add) multiply_checked = _clone_signature(multiply) @@ -427,6 +443,8 @@ def negate( def negate( x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _NumericOrDurationArrayT: ... +@overload +def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
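How the Expression support above plays out at the call site, with pyarrow 17 and illustrative data:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"a": [1, 2, 3], "b": [10.0, 20.0, 30.0]})

# Table.filter accepts an Expression in addition to a boolean mask.
small = table.filter(pc.field("a") < 3)

# Arithmetic kernels return an Expression when given Expression inputs.
shifted = table.filter(pc.equal(pc.add(pc.field("a"), 1), 3))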
negate_checked = _clone_signature(negate) @@ -458,6 +476,14 @@ def power( *, memory_pool: lib.MemoryPool | None = None, ) -> NumericArray: ... +@overload +def power( + base: Expression | Any, + exponent: Expression | Any, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... power_checked = _clone_signature(power) @@ -474,9 +500,13 @@ def sign( x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... @overload +def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +@overload def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... @overload def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... +@overload +def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... sqrt_checked = _clone_signature(sqrt) @@ -509,6 +539,10 @@ def bit_wise_and( memory_pool: lib.MemoryPool | None = None, ) -> NumericArray: ... @overload +def bit_wise_and( + x: Expression | Any, y: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload def bit_wise_not( x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _NumericScalarT: ... @@ -516,6 +550,8 @@ def bit_wise_not( def bit_wise_not( x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _NumericArrayT: ... +@overload +def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... bit_wise_or = _clone_signature(bit_wise_and) bit_wise_xor = _clone_signature(bit_wise_and) @@ -529,6 +565,8 @@ shift_right_checked = _clone_signature(bit_wise_and) def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... @overload def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... +@overload +def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... floor = _clone_signature(ceil) @@ -575,6 +613,27 @@ def round( memory_pool: lib.MemoryPool | None = None, ) -> _NumericArrayT: ... @overload +def round( + x: Expression, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def round_to_multiple( x: _NumericScalarT, /, @@ -617,6 +676,27 @@ def round_to_multiple( memory_pool: lib.MemoryPool | None = None, ) -> _NumericArrayT: ... @overload +def round_to_multiple( + x: Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def round_binary( x: _NumericScalarT, s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, @@ -679,6 +759,27 @@ def round_binary( options: RoundBinaryOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _NumericArrayT: ... 
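The rounding kernels keep their array type for array inputs and build an Expression for expression inputs; a short sketch:

import pyarrow as pa
import pyarrow.compute as pc

prices = pa.array([1.234, 2.345, 3.456])
cents = pc.round(prices, ndigits=2)
quarters = pc.round_to_multiple(prices, multiple=0.25)

rounded_col = pc.round(pc.field("price"), ndigits=2)   # Expression in, Expression out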
+@overload +def round_binary( + x: Expression, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... trunc = _clone_signature(ceil) @@ -691,6 +792,8 @@ def ln( def ln( x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... ln_checked = _clone_signature(ln) log10 = _clone_signature(ln) @@ -716,9 +819,10 @@ def logb( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload def logb( - x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... + x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression | Any: ... logb_checked = _clone_signature(logb) @@ -751,9 +855,10 @@ def atan2( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload def atan2( - y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... + y: Expression | Any, x: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... # ========================= 2.5 Comparisons functions ========================= @overload @@ -768,6 +873,14 @@ def equal( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def equal( + x: Expression | Any, + y: Expression | Any, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... greater = _clone_signature(equal) greater_equal = _clone_signature(equal) @@ -789,6 +902,13 @@ def max_element_wise( options: ElementWiseAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _ArrayT: ... +@overload +def max_element_wise( + *args: Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... min_element_wise = _clone_signature(equal) @@ -805,6 +925,14 @@ def and_( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def and_( + x: Expression | Any, + y: Expression | Any, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... and_kleene = _clone_signature(and_) and_not = _clone_signature(and_) @@ -824,6 +952,13 @@ def invert( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def invert( + x: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.10 String predicates ========================= @overload @@ -834,6 +969,10 @@ def ascii_is_alnum( def ascii_is_alnum( strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanArray: ... +@overload +def ascii_is_alnum( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... 
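Comparison and logical kernels follow the same pattern; illustrative usage:

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([1, 2, 3])
b = pa.array([3, 2, 1])

eq = pc.equal(a, b)                    # BooleanArray
both = pc.and_(eq, pc.greater(a, 0))
neither = pc.invert(both)

cond = pc.and_(pc.equal(pc.field("a"), 2), pc.is_valid(pc.field("b")))  # Expression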
ascii_is_alpha = _clone_signature(ascii_is_alnum) ascii_is_decimal = _clone_signature(ascii_is_alnum) @@ -863,6 +1002,10 @@ def ascii_capitalize( def ascii_capitalize( strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _StringArrayT: ... +@overload +def ascii_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... ascii_lower = _clone_signature(ascii_capitalize) ascii_reverse = _clone_signature(ascii_capitalize) @@ -893,6 +1036,13 @@ def binary_length( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload +def binary_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def binary_repeat( strings: _StringOrBinaryScalarT, num_repeats: int, @@ -917,6 +1067,14 @@ def binary_repeat( memory_pool: lib.MemoryPool | None = None, ) -> _StringOrBinaryArrayT: ... @overload +def binary_repeat( + strings: Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def binary_replace_slice( strings: _StringOrBinaryScalarT, /, @@ -939,6 +1097,17 @@ def binary_replace_slice( memory_pool: lib.MemoryPool | None = None, ) -> _StringOrBinaryArrayT: ... @overload +def binary_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def binary_reverse( strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _BinaryScalarT: ... @@ -947,6 +1116,10 @@ def binary_reverse( strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _BinaryArrayT: ... @overload +def binary_reverse( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload def replace_substring( strings: _StringScalarT, /, @@ -968,6 +1141,17 @@ def replace_substring( options: ReplaceSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... +@overload +def replace_substring( + strings: Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... replace_substring_regex = _clone_signature(replace_substring) @@ -980,6 +1164,10 @@ def utf8_capitalize( strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None ) -> _StringArrayT: ... @overload +def utf8_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload def utf8_length( strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int32Scalar: ... @@ -1001,6 +1189,13 @@ def utf8_length( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... +@overload +def utf8_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... utf8_lower = _clone_signature(utf8_capitalize) @@ -1026,6 +1221,17 @@ def utf8_replace_slice( options: ReplaceSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... 
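A quick sketch of the string transforms typed above (illustrative data):

import pyarrow as pa
import pyarrow.compute as pc

names = pa.array(["alice", "bob", None])
caps = pc.utf8_capitalize(names)                                    # StringArray in, StringArray out
lengths = pc.utf8_length(names)                                     # Int32Array for plain strings
masked = pc.replace_substring(names, pattern="o", replacement="0")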
+@overload +def utf8_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... utf8_reverse = _clone_signature(utf8_capitalize) utf8_swapcase = _clone_signature(utf8_capitalize) @@ -1055,6 +1261,17 @@ def ascii_center( options: PadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... +@overload +def ascii_center( + strings: Expression, + /, + width: int, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... ascii_lpad = _clone_signature(ascii_center) ascii_rpad = _clone_signature(ascii_center) @@ -1081,6 +1298,15 @@ def ascii_ltrim( options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... +@overload +def ascii_ltrim( + strings: Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... ascii_rtrim = _clone_signature(ascii_ltrim) ascii_trim = _clone_signature(ascii_ltrim) @@ -1104,6 +1330,14 @@ def ascii_ltrim_whitespace( options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... +@overload +def ascii_ltrim_whitespace( + strings: Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) @@ -1133,6 +1367,16 @@ def ascii_split_whitespace( memory_pool: lib.MemoryPool | None = None, ) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... @overload +def ascii_split_whitespace( + strings: Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def split_pattern( strings: _StringOrBinaryScalarT, /, @@ -1154,6 +1398,17 @@ def split_pattern( options: SplitPatternOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def split_pattern( + strings: Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... split_pattern_regex = _clone_signature(split_pattern) utf8_split_whitespace = _clone_signature(ascii_split_whitespace) @@ -1177,6 +1432,15 @@ def extract_regex( options: ExtractRegexOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.StructArray: ... +@overload +def extract_regex( + strings: Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.16 String join ========================= def binary_join( @@ -1198,6 +1462,14 @@ def binary_join_element_wise( options: JoinOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringOrBinaryArrayT: ... 
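Padding, trimming and splitting follow the same typing scheme; illustrative usage:

import pyarrow as pa
import pyarrow.compute as pc

raw = pa.array(["  a,b", "c,d  "])
trimmed = pc.utf8_trim_whitespace(raw)
parts = pc.split_pattern(trimmed, pattern=",")                # ListArray of string pieces
padded = pc.ascii_center(trimmed, width=8, padding="*")
joined = pc.binary_join_element_wise(trimmed, trimmed, "-")   # last argument is the separator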
+@overload +def binary_join_element_wise( + *strings: Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.17 String Slicing ========================= @overload @@ -1223,6 +1495,17 @@ def binary_slice( memory_pool: lib.MemoryPool | None = None, ) -> _BinaryArrayT: ... @overload +def binary_slice( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def utf8_slice_codeunits( strings: _StringScalarT, /, @@ -1244,6 +1527,17 @@ def utf8_slice_codeunits( options: SliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _StringArrayT: ... +@overload +def utf8_slice_codeunits( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.18 Containment tests ========================= @overload @@ -1286,6 +1580,16 @@ def count_substring( options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... +@overload +def count_substring( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... count_substring_regex = _clone_signature(count_substring) @@ -1309,6 +1613,16 @@ def ends_with( options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def ends_with( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... find_substring = _clone_signature(count_substring) find_substring_regex = _clone_signature(count_substring) @@ -1334,6 +1648,16 @@ def index_in( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int32Array: ... @overload +def index_in( + values: Expression, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def is_in( values: lib.Scalar, /, @@ -1353,6 +1677,16 @@ def is_in( options: SetLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def is_in( + values: Expression, + /, + value_set: lib.Array, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... match_like = _clone_signature(ends_with) match_substring = _clone_signature(ends_with) @@ -1368,6 +1702,10 @@ def is_finite( def is_finite( values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanArray: ... +@overload +def is_finite( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... is_inf = _clone_signature(is_finite) is_nan = _clone_signature(is_finite) @@ -1391,6 +1729,15 @@ def is_null( memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... 
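Containment tests in practice (illustrative data):

import pyarrow as pa
import pyarrow.compute as pc

colors = pa.array(["red", "green", "blue", None])
mask = pc.is_in(colors, value_set=pa.array(["red", "blue"]))    # BooleanArray
pos = pc.index_in(colors, value_set=pa.array(["red", "blue"]))  # Int32Array of positions
ends = pc.ends_with(colors, pattern="ed")
hits = pc.count_substring(colors, pattern="e")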
@overload +def is_null( + values: Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def is_valid( values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanScalar: ... @@ -1398,6 +1745,10 @@ def is_valid( def is_valid( values: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanArray: ... +@overload +def is_valid( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... true_unless_null = _clone_signature(is_valid) @@ -1426,6 +1777,13 @@ def list_value_length( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload +def list_value_length( + lists: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def make_struct( *args: lib.Scalar, field_names: list[str] | tuple[str, ...] = (), @@ -1443,6 +1801,15 @@ def make_struct( options: MakeStructOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.StructArray: ... +@overload +def make_struct( + *args: Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.22 Conversions ========================= @overload @@ -1495,6 +1862,31 @@ def ceil_temporal( options: RoundTemporalOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _TemporalArrayT: ... +@overload +def ceil_temporal( + timestamps: Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... floor_temporal = _clone_signature(ceil_temporal) round_temporal = _clone_signature(ceil_temporal) @@ -1536,6 +1928,16 @@ def strftime( memory_pool: lib.MemoryPool | None = None, ) -> lib.StringArray: ... @overload +def strftime( + timestamps: Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def strptime( strings: StringScalar, /, @@ -1557,6 +1959,17 @@ def strptime( options: StrptimeOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.TimestampArray: ... +@overload +def strptime( + strings: Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 2.23 Temporal component extraction ========================= @overload @@ -1568,6 +1981,8 @@ def day( values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... @overload +def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
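Temporal parsing and extraction as typed above; a short sketch with illustrative dates:

import pyarrow as pa
import pyarrow.compute as pc

raw = pa.array(["2024-09-12", "2024-01-01"])
ts = pc.strptime(raw, format="%Y-%m-%d", unit="s")   # TimestampArray
text = pc.strftime(ts, format="%d.%m.%Y")            # StringArray
dow = pc.day_of_week(ts)                             # Int64Array, Monday == 0 by default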
+@overload def day_of_week( values: TemporalScalar, /, @@ -1587,6 +2002,16 @@ def day_of_week( options: DayOfWeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... +@overload +def day_of_week( + values: Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... day_of_year = _clone_signature(day) @@ -1605,6 +2030,13 @@ def hour( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload +def hour( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def is_dst( values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanScalar: ... @@ -1613,6 +2045,8 @@ def is_dst( values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanArray: ... @overload +def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +@overload def iso_week( values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Scalar: ... @@ -1620,6 +2054,10 @@ def iso_week( def iso_week( values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... +@overload +def iso_week( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... iso_year = _clone_signature(iso_week) @@ -1637,6 +2075,13 @@ def is_leap_year( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... +@overload +def is_leap_year( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... microsecond = _clone_signature(iso_week) millisecond = _clone_signature(iso_week) @@ -1673,6 +2118,17 @@ def week( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload +def week( + values: Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def year_month_day( values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.StructScalar: ... @@ -1680,6 +2136,10 @@ def year_month_day( def year_month_day( values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.StructArray: ... +@overload +def year_month_day( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... # ========================= 2.24 Temporal difference ========================= def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ... @@ -1738,6 +2198,17 @@ def assume_timezone( memory_pool: lib.MemoryPool | None = None, ) -> lib.TimestampArray: ... @overload +def assume_timezone( + timestamps: Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def local_timestamp( timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.TimestampScalar: ... @@ -1745,6 +2216,10 @@ def local_timestamp( def local_timestamp( timestamps: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.TimestampArray: ... 
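Timezone handling as typed above; a minimal sketch:

import pyarrow as pa
import pyarrow.compute as pc

naive = pa.array([0, 3600], type=pa.timestamp("s"))
utc = pc.assume_timezone(naive, timezone="UTC")   # attaches a zone to naive timestamps
wall = pc.local_timestamp(utc)                    # back to zone-less wall-clock values
ymd = pc.year_month_day(utc)                      # StructArray of year/month/day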
+@overload +def local_timestamp( + timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... # ========================= 2.26 Random number generation ========================= def random( @@ -1758,7 +2233,7 @@ def random( # ========================= 3. Array-wise (“vector”) functions ========================= # ========================= 3.1 Cumulative Functions ========================= - +@overload def cumulative_sum( values: _NumericArrayT, /, @@ -1768,6 +2243,16 @@ def cumulative_sum( options: CumulativeSumOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _NumericArrayT: ... +@overload +def cumulative_sum( + values: Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... cumulative_sum_checked = _clone_signature(cumulative_sum) cumulative_prod = _clone_signature(cumulative_sum) @@ -1778,6 +2263,7 @@ cumulative_mean = _clone_signature(cumulative_sum) # ========================= 3.2 Associative transforms ========================= +@overload def dictionary_encode( array: _ScalarOrArrayT, /, @@ -1786,13 +2272,30 @@ def dictionary_encode( options=None, memory_pool: lib.MemoryPool | None = None, ) -> _ScalarOrArrayT: ... +@overload +def dictionary_encode( + array: Expression, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +@overload def value_counts( array: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.StructArray: ... +@overload +def value_counts( + array: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... # ========================= 3.3 Selections ========================= - +@overload def array_filter( array: _ArrayT, selection_filter: list[bool] | list[bool | None] | lib.BooleanArray, @@ -1802,6 +2305,17 @@ def array_filter( options: FilterOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _ArrayT: ... +@overload +def array_filter( + array: Expression, + selection_filter: list[bool] | list[bool | None] | lib.BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def array_take( array: _ArrayT, indices: list[int] | list[int | None] | lib.Int16Array | lib.Int32Array | lib.Int64Array, @@ -1811,13 +2325,28 @@ def array_take( options: TakeOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _ArrayT: ... +@overload +def array_take( + array: Expression, + indices: list[int] | list[int | None] | lib.Int16Array | lib.Int32Array | lib.Int64Array, + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def drop_null( + input: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... 
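The associative transforms and selections above in practice (illustrative data):

import pyarrow as pa
import pyarrow.compute as pc

tags = pa.array(["a", "b", "a", None, "a"])
counts = pc.value_counts(tags)        # StructArray of {values, counts}
distinct = pc.unique(tags)            # keeps the input array type
encoded = pc.dictionary_encode(tags)  # dictionary-encoded result
dense = pc.drop_null(tags)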
filter = array_filter take = array_take # ========================= 3.4 Containment tests ========================= - +@overload def indices_nonzero( values: lib.BooleanArray | lib.NullArray @@ -1828,8 +2357,16 @@ def indices_nonzero( *, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload +def indices_nonzero( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 3.5 Sorts and partitions ========================= +@overload def array_sort_indices( array: lib.Array, /, @@ -1839,6 +2376,17 @@ def array_sort_indices( options: ArraySortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload +def array_sort_indices( + array: Expression, + /, + order: Literal["ascending", "descending"] = "ascending", + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def partition_nth_indices( array: lib.Array, /, @@ -1848,6 +2396,16 @@ def partition_nth_indices( options: PartitionNthOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload +def partition_nth_indices( + array: Expression, + /, + pivot: int, + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... def rank( input: lib.Array, /, @@ -1858,6 +2416,7 @@ def rank( options: RankOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload def select_k_unstable( input: lib.Array, /, @@ -1867,6 +2426,17 @@ def select_k_unstable( options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload +def select_k_unstable( + input: Expression, + /, + k: int, + sort_keys: list[tuple[str, Literal["ascending", "descending"]]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def sort_indices( array: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, /, @@ -1876,11 +2446,36 @@ def sort_indices( options: SortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... +@overload +def sort_indices( + array: Expression, + /, + order: Literal["ascending", "descending"] = "ascending", + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... # ========================= 3.6 Structural transforms ========================= +@overload +def list_element( + lists: Expression, index, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload def list_element( lists, index, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.ListArray: ... +@overload +def list_flatten( + lists: Expression, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def list_flatten( lists, /, @@ -1889,9 +2484,27 @@ def list_flatten( options: ListFlattenOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.ListArray: ... +@overload +def list_parent_indices( + lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... 
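Sorting, top-k selection and list introspection as typed above; illustrative usage:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"score": [7, 9, 3, 9]})
top2 = pc.select_k_unstable(table, k=2, sort_keys=[("score", "descending")])  # UInt64Array
best = table.take(top2)

nested = pa.array([[1, 2], [3]])
first = pc.list_element(nested, 0)       # element 0 of every list
owner = pc.list_parent_indices(nested)   # which list each flattened value came from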
+@overload def list_parent_indices( lists, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... +@overload +def list_slice( + lists: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload def list_slice( lists, /, @@ -1932,6 +2545,7 @@ def replace_with_mask( ): ... # ========================= 3.7 Pairwise functions ========================= +@overload def pairwise_diff( input: _NumericOrTemporalArrayT, /, @@ -1940,5 +2554,14 @@ def pairwise_diff( options: PairwiseOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> _NumericOrTemporalArrayT: ... +@overload +def pairwise_diff( + input: Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... pairwise_diff_checked = _clone_signature(pairwise_diff) From 2a751f5f16c4e9d4e33788fc7a49cb69f400d6fc Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 12 Sep 2024 11:12:44 +0800 Subject: [PATCH 078/231] fix: Fix the return value of Expression comparison (#83) --- pyarrow-stubs/_compute.pyi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index 625cd6f05c4..73f6a1af27b 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -408,6 +408,12 @@ class Expression(lib._Weakrefable): def __add__(self, other) -> Expression: ... def __mul__(self, other) -> Expression: ... def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] def __truediv__(self, other) -> Expression: ... def is_valid(self) -> bool: ... def is_null(self, nan_is_null: bool = False) -> Expression: ... From 4e2a987392aae3218eafd85cab238b18529b92ad Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 12 Sep 2024 11:14:21 +0800 Subject: [PATCH 079/231] release 17.4 (#84) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 4e0003c0ef2..0ac2b36e269 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.3' + version: '17.4' path: . 
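With the comparison operators typed to return Expression, combined filters keep their expression type; a small sketch:

import pyarrow.compute as pc

expr = (pc.field("a") >= 2) & (pc.field("b") != "skip")   # Expression, not bool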
- sha256: 0e4733249cd6fd32b3b463efcb84c9bc647f78b612c275150dbe6480ced52fa8 + sha256: 9ca2368554eba242a6d6fb28b09e744f2169f1beaac1ca19409dc30e8bbddc67 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 1808106883c..8ae8f003d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.3" +version = "17.4" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From c0d156e9d7044c75173cda4cbbb7f6934e680fc6 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sat, 14 Sep 2024 10:42:27 +0800 Subject: [PATCH 080/231] fix: fix the array return type (#89) --- pyarrow-stubs/__lib_pxi/array.pyi | 348 +++++++++++------------------- 1 file changed, 128 insertions(+), 220 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 5888e837512..fac68c81c7a 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -185,353 +185,274 @@ def array( @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[Scalar[_DataTypeT]]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["null"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.NullScalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.BooleanScalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i1", "int8"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.Int8Scalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i2", "int16"], + type: Literal["null"] | types.NullType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Int16Scalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i4", "int32"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.Int32Scalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i8", "int64"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.Int64Scalar]: ... 
-@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.UInt8Scalar]: ... -@overload -def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[scalar.UInt16Scalar]: ... +) -> NullArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u4", "uint32"], + type: Literal["bool", "boolean"] | types.BoolType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.UInt32Scalar]: ... +) -> BooleanArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"], + type: Literal["i1", "int8"] | types.Int8Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.UInt64Scalar]: ... +) -> Int8Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"], + type: Literal["i2", "int16"] | types.Int16Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.HalfFloatScalar]: ... +) -> Int16Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"], + type: Literal["i4", "int32"] | types.Int32Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.FloatScalar]: ... +) -> Int32Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"], + type: Literal["i8", "int64"] | types.Int64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.DoubleScalar]: ... +) -> Int64Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"], + type: Literal["u1", "uint8"] | types.Int8Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.StringScalar]: ... +) -> UInt8Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary"], + type: Literal["u2", "uint16"] | types.Uint16Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.BinaryScalar]: ... +) -> UInt16Array: ... 
@overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"], + type: Literal["u4", "uint32"] | types.Uint32Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.LargeStringScalar]: ... +) -> UInt32Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_binary"], + type: Literal["u8", "uint64"] | types.Uint64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.LargeBinaryScalar]: ... +) -> UInt64Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary_view"], + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.BinaryViewScalar]: ... +) -> HalfFloatArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string_view"], + type: Literal["f4", "float", "float32"] | types.Float32Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.StringViewScalar]: ... +) -> FloatArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"], + type: Literal["f8", "double", "float64"] | types.Float64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Date32Scalar]: ... +) -> DoubleArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"], + type: Literal["string", "str", "utf8"] | types.StringType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Date64Scalar]: ... +) -> StringArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]"], + type: Literal["binary"] | types.BinaryType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Time32Scalar[Literal["s"]]]: ... +) -> BinaryArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[ms]"], + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Time32Scalar[Literal["ms"]]]: ... +) -> LargeStringArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]"], + type: Literal["large_binary"] | types.LargeBinaryType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Time64Scalar[Literal["us"]]]: ... +) -> LargeBinaryArray: ... 
@overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[ns]"], + type: Literal["binary_view"] | types.BinaryViewType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.Time64Scalar[Literal["ns"]]]: ... +) -> BinaryViewArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]"], + type: Literal["string_view"] | types.StringViewType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.TimestampScalar[Literal["s"]]]: ... +) -> StringViewArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ms]"], + type: Literal["date32", "date32[day]"] | types.Date32Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.TimestampScalar[Literal["ms"]]]: ... +) -> Date32Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[us]"], + type: Literal["date64", "date64[ms]"] | types.Date64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.TimestampScalar[Literal["us"]]]: ... +) -> Date64Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ns]"], + type: Literal["time32[s]", "time32[ms]"] | types.Time32Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.TimestampScalar[Literal["ns"]]]: ... +) -> Time32Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]"], + type: Literal["time64[us]", "time64[ns]"] | types.Time64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.DurationScalar[Literal["s"]]]: ... +) -> Time64Array: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ms]"], + type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]"] | types.TimestampType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.DurationScalar[Literal["ms"]]]: ... +) -> TimestampArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[us]"], + type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"] + | types.DurationType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.DurationScalar[Literal["us"]]]: ... +) -> DurationArray: ... 
@overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ns]"], + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.DurationScalar[Literal["ns"]]]: ... +) -> MonthDayNanoIntervalArray: ... @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"], + type: _DataTypeT, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[scalar.MonthDayNanoIntervalScalar]: ... +) -> Array[Scalar[_DataTypeT]]: ... @overload def asarray(values: NullableIterable[bool]) -> BooleanArray: ... @overload @@ -555,153 +476,140 @@ def asarray(values: NullableIterable[list]) -> ListArray: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, -) -> Array[Scalar[_DataTypeT]]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["null"] -) -> Array[scalar.NullScalar]: ... + type: Literal["null"] | types.NullType, +) -> NullArray: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"], -) -> Array[scalar.BooleanScalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i1", "int8"] -) -> Array[scalar.Int8Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i2", "int16"] -) -> Array[scalar.Int16Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i4", "int32"] -) -> Array[scalar.Int32Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i8", "int64"] -) -> Array[scalar.Int64Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u1", "uint8"] -) -> Array[scalar.UInt8Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u2", "uint16"] -) -> Array[scalar.UInt16Scalar]: ... -@overload -def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u4", "uint32"] -) -> Array[scalar.UInt32Scalar]: ... + type: Literal["bool", "boolean"] | types.BoolType, +) -> BooleanArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u8", "uint64"] -) -> Array[scalar.UInt64Scalar]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, +) -> Int8Array: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"], -) -> Array[scalar.HalfFloatScalar]: ... + type: Literal["i2", "int16"] | types.Int16Type, +) -> Int16Array: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"], -) -> Array[scalar.FloatScalar]: ... + type: Literal["i4", "int32"] | types.Int32Type, +) -> Int32Array: ... 
@overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"], -) -> Array[scalar.DoubleScalar]: ... + type: Literal["i8", "int64"] | types.Int64Type, +) -> Int64Array: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"], -) -> Array[scalar.StringScalar]: ... + type: Literal["u1", "uint8"] | types.Uint8Type, +) -> UInt8Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary"] -) -> Array[scalar.BinaryScalar]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.Uint16Type, +) -> UInt16Array: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"], -) -> Array[scalar.LargeStringScalar]: ... + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> UInt32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_binary"] -) -> Array[scalar.LargeBinaryScalar]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.Uint64Type, +) -> UInt64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary_view"] -) -> Array[scalar.BinaryViewScalar]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> HalfFloatArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string_view"] -) -> Array[scalar.StringViewScalar]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> FloatArray: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"], -) -> Array[scalar.Date32Scalar]: ... + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> DoubleArray: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"], -) -> Array[scalar.Date64Scalar]: ... + type: Literal["string", "str", "utf8"] | types.StringType, +) -> StringArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time32[s]"] -) -> Array[scalar.Time32Scalar[Literal["s"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, +) -> BinaryArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time32[ms]"] -) -> Array[scalar.Time32Scalar[Literal["ms"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> LargeStringArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time64[us]"] -) -> Array[scalar.Time64Scalar[Literal["us"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, +) -> LargeBinaryArray: ... 
@overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["time64[ns]"] -) -> Array[scalar.Time64Scalar[Literal["ns"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, +) -> BinaryViewArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[s]"] -) -> Array[scalar.TimestampScalar[Literal["s"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, +) -> StringViewArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[ms]"] -) -> Array[scalar.TimestampScalar[Literal["ms"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> Date32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[us]"] -) -> Array[scalar.TimestampScalar[Literal["us"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> Date64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["timestamp[ns]"] -) -> Array[scalar.TimestampScalar[Literal["ns"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]", "time32[ms]"] | types.Time32Type, +) -> Time32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[s]"] -) -> Array[scalar.DurationScalar[Literal["s"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]", "time64[ns]"] | types.Time64Type, +) -> Time64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[ms]"] -) -> Array[scalar.DurationScalar[Literal["ms"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"] + | types.TimestampType, +) -> TimestampArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[us]"] -) -> Array[scalar.DurationScalar[Literal["us"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"] + | types.DurationType, +) -> DurationArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, type: Literal["duration[ns]"] -) -> Array[scalar.DurationScalar[Literal["ns"]]]: ... + values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> MonthDayNanoIntervalArray: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"], -) -> Array[scalar.MonthDayNanoIntervalScalar]: ... + type: _DataTypeT, +) -> Array[Scalar[_DataTypeT]]: ... @overload def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... 
@overload From 510e3bb6d0c324d9dffb8ac690384640b8293284 Mon Sep 17 00:00:00 2001 From: Dylan Scott Date: Sun, 15 Sep 2024 05:39:06 -0700 Subject: [PATCH 081/231] a few type improvements, mostly flight related (#90) * FlightError.extra_info -> bytes * annotate FlightStreamReader.cancel return * BasicAuth serialize/deserialize * RecordBatchFileReader.schema * actually str | bytes --- pyarrow-stubs/__lib_pxi/ipc.pyi | 2 ++ pyarrow-stubs/_flight.pyi | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi index e1e7a7688df..a6509dbb40e 100644 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -154,6 +154,8 @@ class _RecordBatchFileReader(_Weakrefable): def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @property + def schema(self) -> Schema: ... + @property def stats(self) -> ReadStats: ... def get_tensor_size(tensor: Tensor) -> int: ... diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index d52ecfd7957..1ad8c514bc6 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -37,7 +37,7 @@ class CertKeyPair(NamedTuple): key: str class FlightError(Exception): - extra_info: str + extra_info: bytes class FlightInternalError(FlightError, ArrowException): ... class FlightTimedOutError(FlightError, ArrowException): ... @@ -83,6 +83,9 @@ class BasicAuth(_Weakrefable): def username(self) -> bytes: ... @property def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... class DescriptorType(enum.Enum): UNKNOWN = 0 @@ -196,7 +199,7 @@ class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): class MetadataRecordBatchReader(_MetadataRecordBatchReader): ... class FlightStreamReader(MetadataRecordBatchReader): - def cancel(self): ... + def cancel(self) -> None: ... def read_all(self) -> Table: ... class MetadataRecordBatchWriter(_CRecordBatchWriter): From 734f19b4095298ad657bf27c9db6cbc45e49c88d Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 15 Sep 2024 08:55:32 -0400 Subject: [PATCH 082/231] add_type_to_Field (#87) * add_type_to_Field * Field.type should return the covariant DataType --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/__lib_pxi/types.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index eb6042d42f7..259ff296094 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -254,6 +254,8 @@ class Field(_Weakrefable, Generic[_DataType_CoT]): def name(self) -> str: ... @property def metadata(self) -> dict[bytes, bytes] | None: ... + @property + def type(self) -> _DataType_CoT: ... def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ... def remove_metadata(self) -> None: ... def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... 
From ffaa0dc71fb6ed6d01ba7866bd20777646097017 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 15 Sep 2024 08:58:40 -0400 Subject: [PATCH 083/231] Support fsspec.AbstractFileSystem (#88) * supported_filesystem * fixes * remove unused import --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/_dataset.pyi | 14 +++++++------- pyarrow-stubs/_dataset_parquet.pyi | 6 +++--- pyarrow-stubs/_fs.pyi | 6 +++++- pyarrow-stubs/dataset.pyi | 12 ++++++------ pyarrow-stubs/fs.pyi | 9 +++++---- pyarrow-stubs/orc.pyi | 4 ++-- pyarrow-stubs/parquet/core.pyi | 20 ++++++++++---------- 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index d91a48d787b..fef3cbe6edd 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -13,7 +13,7 @@ from typing import ( ) from . import _csv, _json, _parquet, lib -from ._fs import FileSelector, FileSystem +from ._fs import FileSelector, FileSystem, SupportedFileSystem from ._stubs_typing import Indices, JoinType, Order from .acero import ExecNodeOptions from .compute import Expression @@ -129,7 +129,7 @@ class FileSystemDataset(Dataset): fragments: list[Fragment], schema: lib.Schema, format: FileFormat, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, root_partition: Expression | None = None, ) -> None: ... @classmethod @@ -138,7 +138,7 @@ class FileSystemDataset(Dataset): paths: list[str], schema: lib.Schema | None = None, format: FileFormat | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partitions: list[Expression] | None = None, root_partition: Expression | None = None, ) -> FileSystemDataset: ... @@ -157,12 +157,12 @@ class FileWriteOptions(lib._Weakrefable): class FileFormat(lib._Weakrefable): def inspect( - self, file: str | Path | IO, filesystem: FileSystem | None = None + self, file: str | Path | IO, filesystem: SupportedFileSystem | None = None ) -> lib.Schema: ... 
def make_fragment( self, file: str | Path | IO, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, *, file_size: int | None = None, @@ -402,7 +402,7 @@ class FileSystemFactoryOptions(lib._Weakrefable): class FileSystemDatasetFactory(DatasetFactory): def __init__( self, - filesystem: FileSystem, + filesystem: SupportedFileSystem, paths_or_selector: FileSelector, format: FileFormat, options: FileSystemFactoryOptions | None = None, @@ -503,7 +503,7 @@ def _filesystemdataset_write( data: Scanner, base_dir: str | Path, basename_template: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, partitioning: Partitioning, file_options: FileWriteOptions, max_partitions: int, diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index ce1a8403476..f5b2c93c7d7 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -14,7 +14,7 @@ from ._dataset import ( PartitioningFactory, ) from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import FileSystem +from ._fs import SupportedFileSystem from ._parquet import FileDecryptionProperties, FileMetaData from .lib import CacheOptions, Schema, _Weakrefable @@ -36,7 +36,7 @@ class ParquetFileFormat(FileFormat): def make_fragment( self, file: IO | Path | str, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, row_groups: Iterable[int] | None = None, *, @@ -118,7 +118,7 @@ class ParquetDatasetFactory(DatasetFactory): def __init__( self, metadata_path: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, format: FileFormat, options: ParquetFactoryOptions | None = None, ) -> None: ... 
diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 3f87e3fe40a..4725b205839 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -2,10 +2,14 @@ import datetime as dt import enum from abc import ABC, abstractmethod -from typing import Self, overload +from typing import Self, TypeAlias, Union, overload + +from fsspec import AbstractFileSystem from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 1fcc4361c4a..3473fe4dfce 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -50,7 +50,7 @@ from pyarrow._dataset_parquet_encryption import ( from pyarrow.compute import Expression, field, scalar from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table -from ._fs import FileSystem +from ._fs import SupportedFileSystem _orc_available: bool _parquet_available: bool @@ -153,7 +153,7 @@ def partitioning( def parquet_dataset( metadata_path: str | Path, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, format: ParquetFileFormat | None = None, partitioning: Partitioning | PartitioningFactory | None = None, partition_base_dir: str | None = None, @@ -163,7 +163,7 @@ def dataset( source: str | list[str] | Path | list[Path], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -174,7 +174,7 @@ def dataset( source: list[Dataset], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -185,7 +185,7 @@ def dataset( source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -200,7 +200,7 @@ def write_dataset( partitioning: Partitioning | list[str] | None = None, partitioning_flavor: str | None = None, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, file_options: FileWriteOptions | None = None, use_threads: bool = True, max_partitions: int = 1024, diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi index a185cb640c1..6bf75616c13 100644 --- a/pyarrow-stubs/fs.pyi +++ b/pyarrow-stubs/fs.pyi @@ -8,6 +8,7 @@ from pyarrow._fs import ( # noqa _MockFileSystem, FileSystemHandler, PyFileSystem, + SupportedFileSystem, ) from pyarrow._azurefs import AzureFileSystem from pyarrow._hdfs import HadoopFileSystem @@ -30,16 +31,16 @@ FileStats = FileInfo def copy_files( source: str, destination: str, - source_filesystem: FileSystem | None = None, - destination_filesystem: FileSystem | 
None = None, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, *, chunk_size: int = 1024 * 1024, use_threads: bool = True, ) -> None: ... class FSSpecHandler(FileSystemHandler): # type: ignore[misc] - fs: FileSystem - def __init__(self, fs: FileSystem) -> None: ... + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... __all__ = [ # _fs diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index 1b2d277214d..c0104f15aa1 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,7 +1,7 @@ from typing import IO, Literal, Self from . import _orc -from ._fs import FileSystem +from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: @@ -71,7 +71,7 @@ class ORCWriter: def read_table( source: str | NativeFile | IO, columns: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> Table: ... def write_table( table: Table, diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 1611c822a18..2285bc53eab 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -3,7 +3,7 @@ from typing import IO, Callable, Iterator, Literal, Self, Sequence from pyarrow import _parquet from pyarrow._compute import Expression -from pyarrow._fs import FileSystem +from pyarrow._fs import FileSystem, SupportedFileSystem from pyarrow._parquet import ( ColumnChunkMetaData, ColumnSchema, @@ -70,7 +70,7 @@ class ParquetFile: decryption_properties: FileDecryptionProperties | None = None, thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, page_checksum_verification: bool = False, ): ... def __enter__(self) -> Self: ... 
@@ -129,7 +129,7 @@ class ParquetWriter: self, where: str | Path | IO, schema: Schema, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, flavor: str | None = None, version: Literal["1.0", "2.4", "2.6"] = ..., use_dictionary: bool = True, @@ -166,7 +166,7 @@ class ParquetDataset: def __init__( self, path_or_paths: str | list[str], - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, schema: Schema | None = None, *, filters: Expression | FilterTuple | list[FilterTuple] | None = None, @@ -213,7 +213,7 @@ def read_table( memory_map: bool = False, buffer_size: int = 0, partitioning: str | list[str] | Partitioning = "hive", - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, filters: Expression | FilterTuple | list[FilterTuple] | None = None, use_legacy_dataset: bool | None = None, ignore_prefixes: list[str] | None = None, @@ -240,7 +240,7 @@ def write_table( allow_truncated_timestamps: bool = False, data_page_size: int | None = None, flavor: str | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, compression_level: int | dict | None = None, use_byte_stream_split: bool = False, column_encoding: str | dict | None = None, @@ -260,7 +260,7 @@ def write_to_dataset( table: Table, root_path: str | Path, partition_cols: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, use_legacy_dataset: bool | None = None, schema: Schema | None = None, partitioning: Partitioning | list[str] | None = None, @@ -275,18 +275,18 @@ def write_metadata( schema: Schema, where: str | NativeFile, metadata_collector: list[FileMetaData] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, **kwargs, ) -> None: ... def read_metadata( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... def read_schema( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... From 625e9239f96a7e1ade11874364825b286b440a2c Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sun, 15 Sep 2024 21:00:25 +0800 Subject: [PATCH 084/231] release 17.5 (#91) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 0ac2b36e269..d599f6e2f80 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.4' + version: '17.5' path: . 
- sha256: 9ca2368554eba242a6d6fb28b09e744f2169f1beaac1ca19409dc30e8bbddc67 + sha256: e740edc24d185417fdf97662652d791a3770b110d1e9188a91b2b83a946dbaaf requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 8ae8f003d6d..8100dc6e007 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.4" +version = "17.5" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 10d0bc6e2756b9022dffddc455a97cd196a038b3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:00:48 +0800 Subject: [PATCH 085/231] [pre-commit.ci] pre-commit autoupdate (#95) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90bda4c75ec..4d02579b5e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 + rev: v0.6.5 hooks: - id: ruff args: [--fix] From 9b42e32e99cacfd6728bc1605a78f6bd4123f750 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sat, 21 Sep 2024 16:08:48 +0800 Subject: [PATCH 086/231] fix: parquet not accepting NativeFile (#98) --- pyarrow-stubs/parquet/core.pyi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 2285bc53eab..2d41a1394d2 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -127,7 +127,7 @@ class ParquetWriter: def __init__( self, - where: str | Path | IO, + where: str | Path | IO | NativeFile, schema: Schema, filesystem: SupportedFileSystem | None = None, flavor: str | None = None, @@ -229,7 +229,7 @@ def read_pandas( ) -> Table: ... def write_table( table: Table, - where: str | Path | IO, + where: str | Path | NativeFile | IO, row_group_size: int | None = None, version: Literal["1.0", "2.4", "2.6"] = "2.6", use_dictionary: bool = True, @@ -279,13 +279,13 @@ def write_metadata( **kwargs, ) -> None: ... def read_metadata( - where: str | Path | IO, + where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... def read_schema( - where: str | Path | IO, + where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, From 76d8dd212d07efa14250a4e949212f330c9f647e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sat, 21 Sep 2024 16:26:57 +0800 Subject: [PATCH 087/231] feat: support pa.Buffer buffer protocol (#99) --- pyarrow-stubs/__lib_pxi/io.pyi | 1 + pyarrow-stubs/_stubs_typing.pyi | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi index f2d483a41ee..488df003227 100644 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -117,6 +117,7 @@ class Buffer(_Weakrefable): def equals(self, other: Self) -> bool: ... def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... def to_pybytes(self) -> bytes: ... + def __buffer__(self, flags: int, /) -> memoryview: ... 
class ResizableBuffer(Buffer): def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index f2f28a77494..29946e88e42 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -53,7 +53,8 @@ FilterTuple: TypeAlias = ( class Buffer(Protocol): def __buffer__(self, flags: int, /) -> memoryview: ... -SupportPyBuffer: TypeAlias = Any +class SupportPyBuffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... class SupportArrowStream(Protocol): def __arrow_c_stream__(self, requested_schema=None) -> Any: ... From c004517b96d2cd909cf36fc18506f1d23c289401 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sat, 21 Sep 2024 17:08:38 +0800 Subject: [PATCH 088/231] feat: Support `compute` functions to accept ChunkedArray. (#100) --- pyarrow-stubs/compute.pyi | 202 +++++++++++++++++++++++++++----------- 1 file changed, 143 insertions(+), 59 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 7ea12446b51..a3c2e0fbed3 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -137,12 +137,17 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalT = TypeVar("_NumericOrTemporalT", bound=NumericOrTemporalScalar) -NumericArray: TypeAlias = lib.NumericArray +NumericArray: TypeAlias = lib.NumericArray[_ScalarT] | lib.ChunkedArray[_ScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=lib.NumericArray) -NumericOrDurationArray: TypeAlias = lib.NumericArray | lib.Array[lib.DurationScalar] +NumericOrDurationArray: TypeAlias = ( + lib.NumericArray | lib.Array[lib.DurationScalar] | lib.ChunkedArray +) _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) -NumericOrTemporalArray: TypeAlias = lib.NumericArray | lib.Array[TemporalScalar] +NumericOrTemporalArray: TypeAlias = ( + lib.NumericArray | lib.Array[TemporalScalar] | lib.ChunkedArray[TemporalScalar] +) _NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = lib.BooleanArray | lib.ChunkedArray[lib.BooleanScalar] FloatScalar: typing_extensions.TypeAlias = ( lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] @@ -155,13 +160,27 @@ FloatArray: typing_extensions.TypeAlias = ( | lib.NumericArray[lib.DoubleScalar] | lib.NumericArray[lib.Decimal128Scalar] | lib.NumericArray[lib.Decimal256Scalar] + | lib.ChunkedArray[lib.FloatScalar] + | lib.ChunkedArray[lib.DoubleScalar] + | lib.ChunkedArray[lib.Decimal128Scalar] + | lib.ChunkedArray[lib.Decimal256Scalar] ) _FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) _StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) -StringArray: TypeAlias = lib.StringArray | lib.LargeStringArray +StringArray: TypeAlias = ( + lib.StringArray + | lib.LargeStringArray + | lib.ChunkedArray[lib.StringScalar] + | lib.ChunkedArray[lib.LargeStringScalar] +) _StringArrayT = TypeVar("_StringArrayT", bound=StringArray) _BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) -BinaryArray: TypeAlias = lib.BinaryArray | lib.LargeBinaryArray +BinaryArray: TypeAlias = ( + lib.BinaryArray + | lib.LargeBinaryArray + | lib.ChunkedArray[lib.BinaryScalar] + | lib.ChunkedArray[lib.LargeBinaryScalar] +) _BinaryArrayT = TypeVar("_BinaryArrayT", 
bound=BinaryArray) StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar _StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) @@ -176,6 +195,12 @@ TemporalArray: TypeAlias = ( | lib.TimestampArray | lib.DurationArray | lib.MonthDayNanoIntervalArray + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar] + | lib.ChunkedArray[lib.Time32Scalar] + | lib.ChunkedArray[lib.Time64Scalar] + | lib.ChunkedArray[lib.DurationScalar] + | lib.ChunkedArray[lib.MonthDayNanoIntervalScalar] ) _TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) _ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) @@ -186,7 +211,7 @@ _ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar) # ========================= 1.1 functions ========================= def all( - array: lib.BooleanScalar | lib.BooleanArray, + array: lib.BooleanScalar | BooleanArray, /, *, skip_nulls: bool = True, @@ -198,7 +223,7 @@ def all( any = _clone_signature(all) def approximate_median( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, *, skip_nulls: bool = True, @@ -207,7 +232,7 @@ def approximate_median( memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleScalar: ... def count( - array: lib.Array, + array: lib.Array | lib.ChunkedArray, /, mode: Literal["only_valid", "only_null", "all"] = "only_valid", *, @@ -215,7 +240,7 @@ def count( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Scalar: ... def count_distinct( - array: lib.Array, + array: lib.Array | lib.ChunkedArray, /, mode: Literal["only_valid", "only_null", "all"] = "only_valid", *, @@ -223,7 +248,7 @@ def count_distinct( memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Scalar: ... def first( - array: lib.Array[_ScalarT], + array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], /, *, skip_nulls: bool = True, @@ -232,7 +257,7 @@ def first( memory_pool: lib.MemoryPool | None = None, ) -> _ScalarT: ... def first_last( - array: lib.Array, + array: lib.Array | lib.ChunkedArray, /, *, skip_nulls: bool = True, @@ -241,7 +266,7 @@ def first_last( memory_pool: lib.MemoryPool | None = None, ) -> lib.StructScalar: ... def index( - data: lib.Array, + data: lib.Array | lib.ChunkedArray, value, start: int | None = None, end: int | None = None, @@ -255,7 +280,7 @@ min = _clone_signature(first) min_max = _clone_signature(first_last) def mean( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, *, skip_nulls: bool = True, @@ -264,7 +289,7 @@ def mean( memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleScalar | lib.Decimal128Scalar: ... def mode( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, n: int = 1, *, @@ -283,7 +308,7 @@ def product( memory_pool: lib.MemoryPool | None = None, ) -> _ScalarT: ... def quantile( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, q: float = 0.5, *, @@ -294,7 +319,7 @@ def quantile( memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleArray: ... def stddev( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, *, ddof: float = 0, @@ -304,7 +329,7 @@ def stddev( memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleScalar: ... 
def sum( - array: _NumericScalarT | lib.NumericArray[_NumericScalarT], + array: _NumericScalarT | NumericArray[_NumericScalarT], /, *, skip_nulls: bool = True, @@ -313,7 +338,7 @@ def sum( memory_pool: lib.MemoryPool | None = None, ) -> _NumericScalarT: ... def tdigest( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, q: float = 0.5, *, @@ -325,7 +350,7 @@ def tdigest( memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleArray: ... def variance( - array: NumericScalar | lib.NumericArray, + array: NumericScalar | NumericArray, /, *, ddof: int = 0, @@ -867,8 +892,8 @@ def equal( ) -> lib.BooleanScalar: ... @overload def equal( - x: lib.Scalar | lib.Array, - y: lib.Scalar | lib.Array, + x: lib.Scalar | lib.Array | lib.ChunkedArray, + y: lib.Scalar | lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None, @@ -919,8 +944,8 @@ def and_( ) -> lib.BooleanScalar: ... @overload def and_( - x: lib.BooleanScalar | lib.BooleanArray, - y: lib.BooleanScalar | lib.BooleanArray, + x: lib.BooleanScalar | BooleanArray, + y: lib.BooleanScalar | BooleanArray, /, *, memory_pool: lib.MemoryPool | None = None, @@ -1026,11 +1051,20 @@ def binary_length( ) -> lib.Int64Scalar: ... @overload def binary_length( - strings: lib.BinaryArray | lib.StringArray, /, *, memory_pool: lib.MemoryPool | None = None + strings: lib.BinaryArray + | lib.StringArray + | lib.ChunkedArray[lib.BinaryScalar] + | lib.ChunkedArray[lib.StringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.Int32Array: ... @overload def binary_length( - strings: lib.LargeBinaryArray | lib.LargeStringArray, + strings: lib.LargeBinaryArray + | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeBinaryScalar] + | lib.ChunkedArray[lib.LargeStringScalar], /, *, memory_pool: lib.MemoryPool | None = None, @@ -1180,11 +1214,14 @@ def utf8_length( ) -> lib.Int64Scalar: ... @overload def utf8_length( - strings: lib.StringArray, /, *, memory_pool: lib.MemoryPool | None = None + strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.Int32Array: ... @overload def utf8_length( - strings: lib.LargeStringArray, + strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar], /, *, memory_pool: lib.MemoryPool | None = None, @@ -1562,7 +1599,10 @@ def count_substring( ) -> lib.Int64Scalar: ... @overload def count_substring( - strings: lib.StringArray | lib.BinaryArray, + strings: lib.StringArray + | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] + | lib.ChunkedArray[lib.BinaryScalar], /, pattern: str, *, @@ -1572,7 +1612,10 @@ def count_substring( ) -> lib.Int32Array: ... @overload def count_substring( - strings: lib.LargeStringArray | lib.LargeBinaryArray, + strings: lib.LargeStringArray + | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] + | lib.ChunkedArray[lib.LargeBinaryScalar], /, pattern: str, *, @@ -1631,7 +1674,7 @@ find_substring_regex = _clone_signature(count_substring) def index_in( values: lib.Scalar, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1639,9 +1682,9 @@ def index_in( ) -> lib.Int32Scalar: ... 
@overload def index_in( - values: lib.Array, + values: lib.Array | lib.ChunkedArray, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1651,7 +1694,7 @@ def index_in( def index_in( values: Expression, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1661,7 +1704,7 @@ def index_in( def is_in( values: lib.Scalar, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1669,9 +1712,9 @@ def is_in( ) -> lib.BooleanScalar: ... @overload def is_in( - values: lib.Array, + values: lib.Array | lib.ChunkedArray, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1681,7 +1724,7 @@ def is_in( def is_in( values: Expression, /, - value_set: lib.Array, + value_set: lib.Array | lib.ChunkedArray, *, skip_nulls: bool = False, options: SetLookupOptions | None = None, @@ -1721,7 +1764,7 @@ def is_null( ) -> lib.BooleanScalar: ... @overload def is_null( - values: lib.Array, + values: lib.Array | lib.ChunkedArray, /, *, nan_is_null: bool = False, @@ -1743,7 +1786,7 @@ def is_valid( ) -> lib.BooleanScalar: ... @overload def is_valid( - values: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None + values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanArray: ... @overload def is_valid( @@ -1764,14 +1807,14 @@ def if_else(cond, left, right, /, *, memory_pool: lib.MemoryPool | None = None): @overload def list_value_length( - lists: lib.ListArray | lib.ListViewArray | lib.FixedSizeListArray, + lists: lib.ListArray | lib.ListViewArray | lib.FixedSizeListArray | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int32Array: ... @overload def list_value_length( - lists: lib.LargeListArray | lib.LargeListViewArray, + lists: lib.LargeListArray | lib.LargeListViewArray | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None, @@ -1794,7 +1837,7 @@ def make_struct( ) -> lib.StructScalar: ... @overload def make_struct( - *args: lib.Array, + *args: lib.Array | lib.ChunkedArray, field_names: list[str] | tuple[str, ...] = (), field_nullability: bool | None = None, field_metadata: list[lib.KeyValueMetadata] | None = None, @@ -1908,6 +1951,14 @@ def cast( memory_pool: lib.MemoryPool | None = None, ) -> lib.Array[lib.Scalar[_DataTypeT]]: ... @overload +def cast( + arr: lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +@overload def strftime( timestamps: TemporalScalar, /, @@ -2024,7 +2075,12 @@ def hour( ) -> lib.Int64Scalar: ... @overload def hour( - values: lib.TimestampArray | lib.Time32Array | lib.Time64Array, + values: lib.TimestampArray + | lib.Time32Array + | lib.Time64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Time32Scalar] + | lib.ChunkedArray[lib.Time64Scalar], /, *, memory_pool: lib.MemoryPool | None = None, @@ -2042,7 +2098,10 @@ def is_dst( ) -> lib.BooleanScalar: ... 
@overload def is_dst( - values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... @overload def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... @@ -2052,7 +2111,10 @@ def iso_week( ) -> lib.Int64Scalar: ... @overload def iso_week( - values: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload def iso_week( @@ -2070,7 +2132,12 @@ def is_leap_year( ) -> lib.BooleanScalar: ... @overload def is_leap_year( - values: lib.TimestampArray | lib.Date32Array | lib.Date64Array, + values: lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar], /, *, memory_pool: lib.MemoryPool | None = None, @@ -2108,7 +2175,7 @@ def week( ) -> lib.Int64Scalar: ... @overload def week( - values: lib.TimestampArray, + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], /, *, week_starts_monday: bool = True, @@ -2188,7 +2255,7 @@ def assume_timezone( ) -> lib.TimestampScalar: ... @overload def assume_timezone( - timestamps: lib.TimestampArray, + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], /, timezone: str, *, @@ -2214,7 +2281,10 @@ def local_timestamp( ) -> lib.TimestampScalar: ... @overload def local_timestamp( - timestamps: lib.TimestampArray, /, *, memory_pool: lib.MemoryPool | None = None + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.TimestampArray: ... @overload def local_timestamp( @@ -2287,7 +2357,7 @@ def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... @overload def value_counts( - array: lib.Array, /, *, memory_pool: lib.MemoryPool | None = None + array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.StructArray: ... 
@overload def value_counts( @@ -2298,7 +2368,7 @@ def value_counts( @overload def array_filter( array: _ArrayT, - selection_filter: list[bool] | list[bool | None] | lib.BooleanArray, + selection_filter: list[bool] | list[bool | None] | BooleanArray, /, null_selection_behavior: Literal["drop", "emit_null"] = "drop", *, @@ -2308,7 +2378,7 @@ def array_filter( @overload def array_filter( array: Expression, - selection_filter: list[bool] | list[bool | None] | lib.BooleanArray, + selection_filter: list[bool] | list[bool | None] | BooleanArray, /, null_selection_behavior: Literal["drop", "emit_null"] = "drop", *, @@ -2318,7 +2388,14 @@ def array_filter( @overload def array_take( array: _ArrayT, - indices: list[int] | list[int | None] | lib.Int16Array | lib.Int32Array | lib.Int64Array, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], /, *, boundscheck: bool = True, @@ -2328,7 +2405,14 @@ def array_take( @overload def array_take( array: Expression, - indices: list[int] | list[int | None] | lib.Int16Array | lib.Int32Array | lib.Int64Array, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], /, *, boundscheck: bool = True, @@ -2368,7 +2452,7 @@ def indices_nonzero( # ========================= 3.5 Sorts and partitions ========================= @overload def array_sort_indices( - array: lib.Array, + array: lib.Array | lib.ChunkedArray, /, order: Literal["ascending", "descending"] = "ascending", *, @@ -2388,7 +2472,7 @@ def array_sort_indices( ) -> Expression: ... @overload def partition_nth_indices( - array: lib.Array, + array: lib.Array | lib.ChunkedArray, /, pivot: int, *, @@ -2407,7 +2491,7 @@ def partition_nth_indices( memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... def rank( - input: lib.Array, + input: lib.Array | lib.ChunkedArray, /, sort_keys: Literal["ascending", "descending"] = "ascending", *, @@ -2418,7 +2502,7 @@ def rank( ) -> lib.UInt64Array: ... @overload def select_k_unstable( - input: lib.Array, + input: lib.Array | lib.ChunkedArray, /, k: int, sort_keys: list[tuple[str, Literal["ascending", "descending"]]], @@ -2537,7 +2621,7 @@ def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... def replace_with_mask( values, - mask: list[bool] | list[bool | None] | lib.BooleanArray, + mask: list[bool] | list[bool | None] | BooleanArray, replacements, /, *, From 54798a614403b833c2ceecdbcdeb60621a769d4e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sat, 21 Sep 2024 17:15:00 +0800 Subject: [PATCH 089/231] release 17.6 (#101) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index d599f6e2f80..9f38b3d8fdc 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.5' + version: '17.6' path: . 
- sha256: e740edc24d185417fdf97662652d791a3770b110d1e9188a91b2b83a946dbaaf + sha256: f46f5c83b12a987f97faae95f9d81f2b62da0d1faf8407c87c0af1c62de012cf requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 8100dc6e007..d45b7061c73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.5" +version = "17.6" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 05a033a24437d662353ec228ea7b3746b04b52e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:47:51 +0800 Subject: [PATCH 090/231] [pre-commit.ci] pre-commit autoupdate (#102) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d02579b5e1..93b92d3979d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.6.7 hooks: - id: ruff args: [--fix] From cba5a5d21fa1b89265e3c3bdd821e4a7060a826d Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 8 Oct 2024 04:03:53 -0400 Subject: [PATCH 091/231] working towards making return signatures only have one type (mean and exp) (#105) * group_by_returns_TableGroupBy * return_single_type_for_mean_exp * revert table.pyi * compute.mean does not support BinaryScalar or BinaryArray --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/compute.pyi | 57 ++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index a3c2e0fbed3..af4c92d3ff6 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -84,7 +84,6 @@ from pyarrow._compute import register_tabular_function as register_tabular_funct from pyarrow._compute import register_vector_function as register_vector_function from . 
import lib -import typing_extensions _P = ParamSpec("_P") _R = TypeVar("_R") @@ -148,23 +147,16 @@ NumericOrTemporalArray: TypeAlias = ( ) _NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) BooleanArray: TypeAlias = lib.BooleanArray | lib.ChunkedArray[lib.BooleanScalar] -FloatScalar: typing_extensions.TypeAlias = ( - lib.Scalar[lib.Float32Type] - | lib.Scalar[lib.Float64Type] - | lib.Scalar[lib.Decimal128Type] - | lib.Scalar[lib.Decimal256Type] -) +FloatScalar: TypeAlias = lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] +DecimalScalar: TypeAlias = lib.Scalar[lib.Decimal128Type] | lib.Scalar[lib.Decimal256Type] _FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) -FloatArray: typing_extensions.TypeAlias = ( +FloatArray: TypeAlias = ( lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar] - | lib.NumericArray[lib.Decimal128Scalar] - | lib.NumericArray[lib.Decimal256Scalar] | lib.ChunkedArray[lib.FloatScalar] | lib.ChunkedArray[lib.DoubleScalar] - | lib.ChunkedArray[lib.Decimal128Scalar] - | lib.ChunkedArray[lib.Decimal256Scalar] ) + _FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) _StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) StringArray: TypeAlias = ( @@ -279,15 +271,40 @@ max = _clone_signature(first) min = _clone_signature(first) min_max = _clone_signature(first_last) +@overload def mean( - array: NumericScalar | NumericArray, + array: FloatScalar | FloatArray, /, *, skip_nulls: bool = True, min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar | lib.Decimal128Scalar: ... +) -> lib.DoubleScalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal128Scalar] + | lib.ChunkedArray[lib.Decimal128Scalar] + | lib.Decimal128Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal128Scalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal256Scalar] + | lib.ChunkedArray[lib.Decimal256Scalar] + | lib.Decimal256Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal256Scalar: ... def mode( array: NumericScalar | NumericArray, /, @@ -446,14 +463,22 @@ def divide( divide_checked = _clone_signature(divide) +@overload +def exp( + exponent: lib.FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatArray: ... @overload def exp( exponent: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatArray | lib.DoubleArray: ... +) -> lib.DoubleArray: ... +@overload +def exp( + exponent: lib.FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar: ... @overload def exp( exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... +) -> lib.DoubleScalar: ... @overload def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
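Usage sketch (illustrative only, not part of the patch series; assumes pyarrow>=17 with these stubs installed): the narrowed `mean`/`exp` overloads above should let a type checker infer a single concrete result type per input type, along the following lines. All variable names below are made up for illustration.

    import pyarrow as pa
    import pyarrow.compute as pc
    from decimal import Decimal

    doubles = pa.array([1.0, 2.0, 4.0], type=pa.float64())
    mean_d = pc.mean(doubles)   # float/double input matches the first overload -> lib.DoubleScalar
    exp_d = pc.exp(doubles)     # double input falls through to the NumericArray overload -> lib.DoubleArray

    decimals = pa.array([Decimal("1.00"), Decimal("3.00")], type=pa.decimal128(5, 2))
    mean_dec = pc.mean(decimals)  # matches the Decimal128 overload -> lib.Decimal128Scalar
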
From ab97176d18d8052ec5b57dfebb7c7e8f1094c6fc Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 8 Oct 2024 04:09:10 -0400 Subject: [PATCH 092/231] a table group_by was returing Self but should return TableGroupBy (#104) group_by_returns_TableGroupBy --- pyarrow-stubs/__lib_pxi/table.pyi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 684b8f2fabd..38395690606 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -545,7 +545,7 @@ class Table(_Tabular[ChunkedArray]): def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... def rename_columns(self, names: dict[str, str]) -> Self: ... def drop(self, columns: str | list[str]) -> Self: ... - def group_by(self, keys: str | list[str], use_threads: bool = True) -> Self: ... + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ... def join( self, right_table: Self, @@ -603,6 +603,9 @@ class TableGroupBy: def aggregate( self, aggregations: list[tuple[str, str]] | list[tuple[str, str, FunctionOptions]] ) -> Table: ... + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... __all__ = [ "ChunkedArray", From 516215bfef7c4ba3abdc665d4111c9cefc3b639d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 16:09:24 +0800 Subject: [PATCH 093/231] [pre-commit.ci] pre-commit autoupdate (#106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) - [github.com/astral-sh/ruff-pre-commit: v0.6.7 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.7...v0.6.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 93b92d3979d..cc257d96674 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ default_language_version: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.7 + rev: v0.6.9 hooks: - id: ruff args: [--fix] From f56e455a2b98404b47bb5229203d250ee896d5da Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 8 Oct 2024 16:22:02 +0800 Subject: [PATCH 094/231] fix: RecordBatch missing `from_arrays` and `from_pandas` (#108) --- pyarrow-stubs/__lib_pxi/table.pyi | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 38395690606..1d2a4c4e4c5 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -456,6 +456,23 @@ class RecordBatch(_Tabular[Array]): self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None ) -> Self: ... @classmethod + def from_arrays( + cls, + arrays: list[Array] | list[ChunkedArray], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: ... 
+ @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: ... + @classmethod def from_struct_array( cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] ) -> Self: ... From 73e59ff2b4604341a01bc8404aa4215623cad3fc Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 8 Oct 2024 16:23:50 +0800 Subject: [PATCH 095/231] release 17.7 (#109) --- pixi.lock | 42 +++++++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pixi.lock b/pixi.lock index 9f38b3d8fdc..d3c0ac4af97 100644 --- a/pixi.lock +++ b/pixi.lock @@ -278,12 +278,12 @@ packages: sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 requires_dist: - six>=1.12.0 - - typing ; python_full_version < '3.5' - - astroid<2,>=1 ; python_full_version < '3' and extra == 'astroid' - - astroid<4,>=2 ; python_full_version >= '3' and extra == 'astroid' + - typing ; python_version < '3.5' + - astroid<2,>=1 ; python_version < '3' and extra == 'astroid' + - astroid<4,>=2 ; python_version >= '3' and extra == 'astroid' - pytest ; extra == 'test' - - astroid<2,>=1 ; python_full_version < '3' and extra == 'test' - - astroid<4,>=2 ; python_full_version >= '3' and extra == 'test' + - astroid<2,>=1 ; python_version < '3' and extra == 'test' + - astroid<4,>=2 ; python_version >= '3' and extra == 'test' - kind: conda name: bzip2 version: 1.0.8 @@ -434,7 +434,7 @@ packages: - coverage ; extra == 'tests' - coverage-enable-subprocess ; extra == 'tests' - littleutils ; extra == 'tests' - - rich ; python_full_version >= '3.11' and extra == 'tests' + - rich ; python_version >= '3.11' and extra == 'tests' requires_python: '>=3.5' - kind: pypi name: filelock @@ -454,7 +454,7 @@ packages: - pytest-timeout>=2.2 ; extra == 'testing' - pytest>=7.4.3 ; extra == 'testing' - virtualenv>=20.26.2 ; extra == 'testing' - - typing-extensions>=4.8 ; python_full_version < '3.11' and extra == 'typing' + - typing-extensions>=4.8 ; python_version < '3.11' and extra == 'typing' requires_python: '>=3.8' - kind: pypi name: hatchling @@ -465,7 +465,7 @@ packages: - packaging>=23.2 - pathspec>=0.10.1 - pluggy>=1.0.0 - - tomli>=1.2.2 ; python_full_version < '3.11' + - tomli>=1.2.2 ; python_version < '3.11' - trove-classifiers requires_python: '>=3.8' - kind: pypi @@ -489,9 +489,9 @@ packages: - pygments>=2.4.0 - stack-data - traitlets>=5.13.0 - - exceptiongroup ; python_full_version < '3.11' - - typing-extensions>=4.6 ; python_full_version < '3.12' - - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' + - exceptiongroup ; python_version < '3.11' + - typing-extensions>=4.6 ; python_version < '3.12' + - pexpect>4.3 ; sys_platform != 'win32' and sys_platform != 'emscripten' - colorama ; sys_platform == 'win32' - ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole] ; extra == 'all' - ipython[test,test-extra] ; extra == 'all' @@ -507,7 +507,7 @@ packages: - sphinx>=1.3 ; extra == 'doc' - sphinxcontrib-jquery ; extra == 'doc' - typing-extensions ; extra == 'doc' - - tomli ; python_full_version < '3.11' and extra == 'doc' + - tomli ; python_version < '3.11' and extra == 'doc' - ipykernel ; extra == 'kernel' - matplotlib ; extra == 'matplotlib' - nbconvert ; extra == 'nbconvert' @@ -938,7 +938,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; 
python_full_version < '3.11' + - tomli>=1.1.0 ; python_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -952,7 +952,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' + - tomli>=1.1.0 ; python_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -966,7 +966,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' + - tomli>=1.1.0 ; python_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -980,7 +980,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' + - tomli>=1.1.0 ; python_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.6' + version: '17.7' path: . - sha256: f46f5c83b12a987f97faae95f9d81f2b62da0d1faf8407c87c0af1c62de012cf + sha256: 6c4b7a004dcae812278da62c7967967353eb706e1afa422af9842d71cfb4416f requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' @@ -1892,7 +1892,7 @@ packages: requires_dist: - distlib<1,>=0.3.7 - filelock<4,>=3.12.2 - - importlib-metadata>=6.6 ; python_full_version < '3.8' + - importlib-metadata>=6.6 ; python_version < '3.8' - platformdirs<5,>=3.9.1 - furo>=2023.7.26 ; extra == 'docs' - proselint>=0.13 ; extra == 'docs' @@ -1906,7 +1906,7 @@ packages: - flaky>=3.7 ; extra == 'test' - packaging>=23.1 ; extra == 'test' - pytest-env>=0.8.2 ; extra == 'test' - - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') + - pytest-freezer>=0.4.8 ; (platform_python_implementation == 'PyPy' or (platform_python_implementation == 'CPython' and sys_platform == 'win32' and python_version >= '3.13')) and extra == 'test' - pytest-mock>=3.11.1 ; extra == 'test' - pytest-randomly>=3.12 ; extra == 'test' - pytest-timeout>=2.1 ; extra == 'test' @@ -1936,7 +1936,7 @@ packages: url: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 requires_dist: - - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' + - backports-functools-lru-cache>=1.2.1 ; python_version < '3.2' - kind: conda name: xz version: 5.2.6 diff --git a/pyproject.toml b/pyproject.toml index d45b7061c73..eeb5bcb7c66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.6" +version = "17.7" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 4a2db703f636c61c7dc987fbb1e45d322598e53b Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:53:39 -0400 Subject: [PATCH 096/231] fix_combine_chunks (#110) --- pyarrow-stubs/__lib_pxi/table.pyi | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 
1d2a4c4e4c5..80ace93756f 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -95,9 +95,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): ) -> ChunkedArray[Scalar[_CastAs]]: ... def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... - def combine_chunks( - self, memory_pool: MemoryPool | None = None - ) -> ChunkedArray[_Scalar_CoT]: ... + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_CoT]: ... def unique(self) -> ChunkedArray[_Scalar_CoT]: ... def value_counts(self) -> StructArray: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... From 316d09c54b70dfbecca1047fd0622f054c3102d5 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:54:50 -0400 Subject: [PATCH 097/231] make Self backward compatible (#115) --- pyarrow-stubs/__lib_pxi/array.pyi | 12 ++++++++++-- pyarrow-stubs/__lib_pxi/error.pyi | 7 ++++++- pyarrow-stubs/__lib_pxi/io.pyi | 13 ++++++++++++- pyarrow-stubs/__lib_pxi/ipc.pyi | 8 +++++++- pyarrow-stubs/__lib_pxi/scalar.pyi | 12 +++++++++++- pyarrow-stubs/__lib_pxi/table.pyi | 12 ++++++++++-- pyarrow-stubs/__lib_pxi/tensor.pyi | 7 ++++++- pyarrow-stubs/__lib_pxi/types.pyi | 8 +++++++- pyarrow-stubs/_dataset.pyi | 8 +++++++- pyarrow-stubs/_flight.pyi | 7 ++++++- pyarrow-stubs/_fs.pyi | 12 +++++++++++- pyarrow-stubs/acero.pyi | 12 +++++++++++- pyarrow-stubs/interchange/dataframe.pyi | 8 +++++++- pyarrow-stubs/orc.pyi | 8 +++++++- pyarrow-stubs/parquet/core.pyi | 9 ++++++++- 15 files changed, 126 insertions(+), 17 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index fac68c81c7a..9245a3dfcc5 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1,17 +1,25 @@ # mypy: disable-error-code="overload-overlap,misc,type-arg" import datetime as dt +import sys from collections.abc import Callable from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias from typing import ( Any, Generic, Iterable, Iterator, Literal, - Self, - TypeAlias, TypeVar, overload, ) diff --git a/pyarrow-stubs/__lib_pxi/error.pyi b/pyarrow-stubs/__lib_pxi/error.pyi index 92ec0e3f0bb..981ed51e680 100644 --- a/pyarrow-stubs/__lib_pxi/error.pyi +++ b/pyarrow-stubs/__lib_pxi/error.pyi @@ -1,4 +1,9 @@ -from typing import Self +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self class ArrowException(Exception): ... class ArrowInvalid(ValueError, ArrowException): ... 
diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi index 488df003227..2cc7fe8b4ac 100644 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -1,7 +1,18 @@ +import sys + from collections.abc import Callable from io import IOBase from os import PathLike -from typing import Any, Literal, Self, SupportsIndex, TypeAlias, overload + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Literal, SupportsIndex, overload from pyarrow._stubs_typing import Compression, SupportPyBuffer from pyarrow.lib import MemoryPool, _Weakrefable diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi index a6509dbb40e..f768bb185cb 100644 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -1,7 +1,13 @@ import enum +import sys from io import IOBase -from typing import Iterable, Iterator, Literal, Mapping, NamedTuple, Self + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple import pandas as pd diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 9a56134133f..1a80e4c8747 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -1,9 +1,19 @@ # mypy: disable-error-code="overload-overlap,misc,type-arg" import collections.abc import datetime as dt +import sys from decimal import Decimal -from typing import Any, Generic, Iterator, Mapping, Self, TypeAlias, overload + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Generic, Iterator, Mapping, overload import numpy as np diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 80ace93756f..80c875ff169 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -1,8 +1,18 @@ # mypy: disable-error-code="overload-overlap,type-arg,misc" import datetime as dt +import sys from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias from typing import ( Any, Generator, @@ -11,9 +21,7 @@ from typing import ( Iterator, Literal, Mapping, - Self, Sequence, - TypeAlias, TypeVar, overload, ) diff --git a/pyarrow-stubs/__lib_pxi/tensor.pyi b/pyarrow-stubs/__lib_pxi/tensor.pyi index a23414ef9fd..01a1ea94e98 100644 --- a/pyarrow-stubs/__lib_pxi/tensor.pyi +++ b/pyarrow-stubs/__lib_pxi/tensor.pyi @@ -1,6 +1,11 @@ # mypy: disable-error-code="import-untyped" -from typing import Self +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self import numpy as np diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 259ff296094..8605eb087bd 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -1,8 +1,14 @@ import datetime as dt +import sys from collections.abc import Mapping from decimal import Decimal -from typing import Any, Generic, Iterable, Iterator, 
Literal, Self, overload + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Generic, Iterable, Iterator, Literal, overload import numpy as np import pandas as pd diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index fef3cbe6edd..3d1681ec941 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -1,4 +1,11 @@ +import sys + from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self from typing import ( IO, Any, @@ -7,7 +14,6 @@ from typing import ( Iterator, Literal, NamedTuple, - Self, TypeVar, overload, ) diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 1ad8c514bc6..89a8952cab7 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -1,7 +1,12 @@ import asyncio import enum +import sys -from typing import Generator, Generic, Iterable, Iterator, NamedTuple, Self, TypeVar +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar from typing_extensions import deprecated diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 4725b205839..581b5096e9d 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -1,8 +1,18 @@ import datetime as dt import enum +import sys from abc import ABC, abstractmethod -from typing import Self, TypeAlias, Union, overload + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Union, overload from fsspec import AbstractFileSystem diff --git a/pyarrow-stubs/acero.pyi b/pyarrow-stubs/acero.pyi index 8b26f40b04f..8a520bdc24a 100644 --- a/pyarrow-stubs/acero.pyi +++ b/pyarrow-stubs/acero.pyi @@ -1,4 +1,14 @@ -from typing import Literal, Self, TypeAlias +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal from . import lib from .compute import Expression, FunctionOptions diff --git a/pyarrow-stubs/interchange/dataframe.pyi b/pyarrow-stubs/interchange/dataframe.pyi index 880b8b6e80a..91cb6e70d7a 100644 --- a/pyarrow-stubs/interchange/dataframe.pyi +++ b/pyarrow-stubs/interchange/dataframe.pyi @@ -1,4 +1,10 @@ -from typing import Any, Iterable, Self, Sequence +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence from pyarrow.interchange.column import _PyArrowColumn from pyarrow.lib import RecordBatch, Table diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index c0104f15aa1..697100ecaf3 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,4 +1,10 @@ -from typing import IO, Literal, Self +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal from . 
import _orc from ._fs import SupportedFileSystem diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 2d41a1394d2..85f2d8e6702 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -1,5 +1,12 @@ +import sys + from pathlib import Path -from typing import IO, Callable, Iterator, Literal, Self, Sequence + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence from pyarrow import _parquet from pyarrow._compute import Expression From 2e09a902b5c012b1d3fa98405b60215436c07375 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:58:15 -0400 Subject: [PATCH 098/231] fix: update ConvertOptions (#114) --- pyarrow-stubs/_csv.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 67fe4ba6567..cd3e192db28 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -31,7 +31,7 @@ class ParseOptions(lib._Weakrefable): @dataclass(kw_only=True) class ConvertOptions(lib._Weakrefable): check_utf8: bool = field(default=True, kw_only=False) - check_types: lib.Schema | dict | None = None + column_types: lib.Schema | dict | None = None null_values: list[str] | None = None true_values: list[str] | None = None false_values: list[str] | None = None From a2a3b74fb53b7d5e238068a30c12e4276f36855b Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:04:21 -0400 Subject: [PATCH 099/231] add type property to Array (#112) * add type property to Array * Array.type should return covariant --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/__lib_pxi/array.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 9245a3dfcc5..8a3f079f821 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1064,6 +1064,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): ) -> Array[Scalar[_CastAs]]: ... def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... def sum(self, **kwargs) -> _Scalar_CoT: ... + @property + def type(self: Array[Scalar[_DataType_CoT]]) -> _DataType_CoT: ... def unique(self) -> Self: ... def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... 
@overload From caa15a807a457e1a916642ba2c7719ae09109479 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 16 Oct 2024 10:06:31 +0800 Subject: [PATCH 100/231] release 17.8 (#117) --- pixi.lock | 42 +++++++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pixi.lock b/pixi.lock index d3c0ac4af97..c17f25af2db 100644 --- a/pixi.lock +++ b/pixi.lock @@ -278,12 +278,12 @@ packages: sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 requires_dist: - six>=1.12.0 - - typing ; python_version < '3.5' - - astroid<2,>=1 ; python_version < '3' and extra == 'astroid' - - astroid<4,>=2 ; python_version >= '3' and extra == 'astroid' + - typing ; python_full_version < '3.5' + - astroid<2,>=1 ; python_full_version < '3' and extra == 'astroid' + - astroid<4,>=2 ; python_full_version >= '3' and extra == 'astroid' - pytest ; extra == 'test' - - astroid<2,>=1 ; python_version < '3' and extra == 'test' - - astroid<4,>=2 ; python_version >= '3' and extra == 'test' + - astroid<2,>=1 ; python_full_version < '3' and extra == 'test' + - astroid<4,>=2 ; python_full_version >= '3' and extra == 'test' - kind: conda name: bzip2 version: 1.0.8 @@ -434,7 +434,7 @@ packages: - coverage ; extra == 'tests' - coverage-enable-subprocess ; extra == 'tests' - littleutils ; extra == 'tests' - - rich ; python_version >= '3.11' and extra == 'tests' + - rich ; python_full_version >= '3.11' and extra == 'tests' requires_python: '>=3.5' - kind: pypi name: filelock @@ -454,7 +454,7 @@ packages: - pytest-timeout>=2.2 ; extra == 'testing' - pytest>=7.4.3 ; extra == 'testing' - virtualenv>=20.26.2 ; extra == 'testing' - - typing-extensions>=4.8 ; python_version < '3.11' and extra == 'typing' + - typing-extensions>=4.8 ; python_full_version < '3.11' and extra == 'typing' requires_python: '>=3.8' - kind: pypi name: hatchling @@ -465,7 +465,7 @@ packages: - packaging>=23.2 - pathspec>=0.10.1 - pluggy>=1.0.0 - - tomli>=1.2.2 ; python_version < '3.11' + - tomli>=1.2.2 ; python_full_version < '3.11' - trove-classifiers requires_python: '>=3.8' - kind: pypi @@ -489,9 +489,9 @@ packages: - pygments>=2.4.0 - stack-data - traitlets>=5.13.0 - - exceptiongroup ; python_version < '3.11' - - typing-extensions>=4.6 ; python_version < '3.12' - - pexpect>4.3 ; sys_platform != 'win32' and sys_platform != 'emscripten' + - exceptiongroup ; python_full_version < '3.11' + - typing-extensions>=4.6 ; python_full_version < '3.12' + - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' - colorama ; sys_platform == 'win32' - ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole] ; extra == 'all' - ipython[test,test-extra] ; extra == 'all' @@ -507,7 +507,7 @@ packages: - sphinx>=1.3 ; extra == 'doc' - sphinxcontrib-jquery ; extra == 'doc' - typing-extensions ; extra == 'doc' - - tomli ; python_version < '3.11' and extra == 'doc' + - tomli ; python_full_version < '3.11' and extra == 'doc' - ipykernel ; extra == 'kernel' - matplotlib ; extra == 'matplotlib' - nbconvert ; extra == 'nbconvert' @@ -938,7 +938,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -952,7 +952,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - 
tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -966,7 +966,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -980,7 +980,7 @@ packages: requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_version < '3.11' + - tomli>=1.1.0 ; python_full_version < '3.11' - psutil>=4.0 ; extra == 'dmypy' - pip ; extra == 'install-types' - setuptools>=50 ; extra == 'mypyc' @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.7' + version: '17.8' path: . - sha256: 6c4b7a004dcae812278da62c7967967353eb706e1afa422af9842d71cfb4416f + sha256: be626e7bd93fed822c2e9f3741c4aeaea41cfca1ed3b79769bdf1a1e70d04487 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' @@ -1892,7 +1892,7 @@ packages: requires_dist: - distlib<1,>=0.3.7 - filelock<4,>=3.12.2 - - importlib-metadata>=6.6 ; python_version < '3.8' + - importlib-metadata>=6.6 ; python_full_version < '3.8' - platformdirs<5,>=3.9.1 - furo>=2023.7.26 ; extra == 'docs' - proselint>=0.13 ; extra == 'docs' @@ -1906,7 +1906,7 @@ packages: - flaky>=3.7 ; extra == 'test' - packaging>=23.1 ; extra == 'test' - pytest-env>=0.8.2 ; extra == 'test' - - pytest-freezer>=0.4.8 ; (platform_python_implementation == 'PyPy' or (platform_python_implementation == 'CPython' and sys_platform == 'win32' and python_version >= '3.13')) and extra == 'test' + - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') - pytest-mock>=3.11.1 ; extra == 'test' - pytest-randomly>=3.12 ; extra == 'test' - pytest-timeout>=2.1 ; extra == 'test' @@ -1936,7 +1936,7 @@ packages: url: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 requires_dist: - - backports-functools-lru-cache>=1.2.1 ; python_version < '3.2' + - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' - kind: conda name: xz version: 5.2.6 diff --git a/pyproject.toml b/pyproject.toml index eeb5bcb7c66..3ac95e8c17b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.7" +version = "17.8" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 34a956fe77d71dae04239baa1ef2e3b3d0ce15fe Mon Sep 17 00:00:00 2001 From: Jan Moravec Date: Sat, 19 Oct 2024 06:56:57 +0200 Subject: [PATCH 101/231] Add include_columns parameter in ConvertOptions (#118) --- pyarrow-stubs/_csv.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index cd3e192db28..9c3bd94e364 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -38,6 +38,7 @@ class ConvertOptions(lib._Weakrefable): decimal_point: str = "." 
strings_can_be_null: bool = False quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None include_missing_columns: bool = False auto_dict_encode: bool = False auto_dict_max_cardinality: int | None = None From b7775678a3b58446c8de8106863658864cb52d51 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Sat, 19 Oct 2024 00:59:25 -0400 Subject: [PATCH 102/231] add list[str] overload to rename_columns (#119) --- pyarrow-stubs/__lib_pxi/table.pyi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 80c875ff169..9c1fb169e3c 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -453,6 +453,9 @@ class RecordBatch(_Tabular[Array]): def get_total_buffer_size(self) -> int: ... def __sizeof__(self) -> int: ... def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload def rename_columns(self, names: dict[str, str]) -> Self: ... def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... @@ -566,6 +569,9 @@ class Table(_Tabular[ChunkedArray]): def get_total_buffer_size(self) -> int: ... def __sizeof__(self) -> int: ... def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload def rename_columns(self, names: dict[str, str]) -> Self: ... def drop(self, columns: str | list[str]) -> Self: ... def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ... From 577b74bf633ee6e197009df8e0a8e09354ab3099 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 21 Oct 2024 10:15:05 +0800 Subject: [PATCH 103/231] release 17.9 (#120) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index c17f25af2db..d4d281da2dd 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1293,9 +1293,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.8' + version: '17.9' path: . 
- sha256: be626e7bd93fed822c2e9f3741c4aeaea41cfca1ed3b79769bdf1a1e70d04487 + sha256: 78ad9aa13f92b8ac322f4c09edeb064b3e3c412b70d661e192ef6aabe143f0e5 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 3ac95e8c17b..1b53c166102 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.8" +version = "17.9" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 242642cffd7a44d14014379a7cc830744e3c6472 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 10:25:41 +0800 Subject: [PATCH 104/231] [pre-commit.ci] pre-commit autoupdate (#124) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.9 → v0.7.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.9...v0.7.0) - [github.com/pre-commit/mirrors-mypy: v1.11.2 → v1.12.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.2...v1.12.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc257d96674..8817432dc65 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,13 +19,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.7.0 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.2 + rev: v1.12.1 hooks: - id: mypy From 428d96b0c3d9c056ff6295081ef79ac07e00dfc6 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 21 Oct 2024 22:26:17 -0400 Subject: [PATCH 105/231] improve type annotations for parquet writer (#125) Add support for per-field compression specification Add missing none compression value. --- pyarrow-stubs/parquet/core.pyi | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 85f2d8e6702..d675fb4f916 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -8,6 +8,11 @@ else: from typing_extensions import Self from typing import IO, Callable, Iterator, Literal, Sequence +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + from pyarrow import _parquet from pyarrow._compute import Expression from pyarrow._fs import FileSystem, SupportedFileSystem @@ -24,7 +29,7 @@ from pyarrow._parquet import ( SortingColumn, Statistics, ) -from pyarrow._stubs_typing import Compression, FilterTuple +from pyarrow._stubs_typing import FilterTuple from pyarrow.dataset import ParquetFileFragment, Partitioning from pyarrow.lib import NativeFile, RecordBatch, Schema, Table from typing_extensions import deprecated @@ -59,6 +64,8 @@ def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Exp @deprecated("use filters_to_expression") def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... 
+_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] + class ParquetFile: reader: ParquetReader common_metadata: FileMetaData @@ -140,7 +147,7 @@ class ParquetWriter: flavor: str | None = None, version: Literal["1.0", "2.4", "2.6"] = ..., use_dictionary: bool = True, - compression: Compression = "snappy", + compression: _Compression | dict[str, _Compression] = "snappy", write_statistics: bool | list = True, use_deprecated_int96_timestamps: bool | None = None, compression_level: int | dict | None = None, @@ -240,7 +247,7 @@ def write_table( row_group_size: int | None = None, version: Literal["1.0", "2.4", "2.6"] = "2.6", use_dictionary: bool = True, - compression: Compression = "snappy", + compression: _Compression | dict[str, _Compression] = "snappy", write_statistics: bool | list = True, use_deprecated_int96_timestamps: bool | None = None, coerce_timestamps: str | None = None, From fcfe788325796ee6dc7f369aa666aaa21a350afe Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 21 Oct 2024 22:26:45 -0400 Subject: [PATCH 106/231] Add missing return type for Schema.serialize (#123) --- pyarrow-stubs/__lib_pxi/types.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 8605eb087bd..10eb3949982 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -24,6 +24,7 @@ from pyarrow.lib import ( ) from typing_extensions import TypeVar +from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") @@ -304,7 +305,7 @@ class Schema(_Weakrefable): def set(self, i: int, field: Field) -> Schema: ... def add_metadata(self, metadata: dict) -> Schema: ... def with_metadata(self, metadata: dict) -> Schema: ... - def serialize(self, memory_pool: MemoryPool | None = None): ... + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... def remove_metadata(self) -> Schema: ... def to_string( self, From eeee4cbbe207034115e7eb4f0d072f9559a0ca01 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 21 Oct 2024 22:27:15 -0400 Subject: [PATCH 107/231] Add `Schema.field(int)` (#122) --- pyarrow-stubs/__lib_pxi/types.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 10eb3949982..0d3e6101795 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -8,6 +8,7 @@ if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self + from typing import Any, Generic, Iterable, Iterator, Literal, overload import numpy as np @@ -295,7 +296,7 @@ class Schema(_Weakrefable): def equals(self, other: Schema, check_metadata: bool = False) -> bool: ... @classmethod def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: ... - def field(self, i: str | bytes) -> Field: ... + def field(self, i: int | str | bytes) -> Field: ... def field_by_name(self, name: str) -> Field: ... def get_field_index(self, name: str) -> int: ... def get_all_field_indices(self, name: str) -> list[int]: ... 
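Usage sketch (illustrative only, not part of the patch series; assumes pyarrow>=17) covering the three preceding patches: per-column parquet compression in the writer stubs, `Schema.serialize` returning a `Buffer`, and `Schema.field` accepting a column index. The schema, column names, and file name are made up for illustration.

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    assert schema.field(0).name == "id"              # field() now also accepts a column index
    assert schema.field("name").type == pa.string()  # lookup by name still works
    buf = schema.serialize()                         # now typed as returning a Buffer
    print(buf.size)

    # compression is typed to accept a per-column mapping as well as a single codec
    table = pa.table({"id": [1, 2], "name": ["a", "b"]})
    with pq.ParquetWriter("example.parquet", schema,
                          compression={"id": "zstd", "name": "snappy"}) as writer:
        writer.write_table(table)
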
From a0adbc954ad9eb77e2983e24d4baa6d540dde308 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 21 Oct 2024 22:32:24 -0400 Subject: [PATCH 108/231] Change various io related functions to support `StrPath` as a path input (#121) * Change various io related functions to support StrPath as a path input * fmt * Added StrPath | IO for feather types --- pixi.lock | 1406 +++++++++++++++------------- pyarrow-stubs/__lib_pxi/io.pyi | 20 +- pyarrow-stubs/_csv.pyi | 11 +- pyarrow-stubs/_dataset.pyi | 10 +- pyarrow-stubs/_dataset_parquet.pyi | 5 +- pyarrow-stubs/_feather.pyi | 7 +- pyarrow-stubs/_fs.pyi | 1 + pyarrow-stubs/_hdfs.pyi | 4 +- pyarrow-stubs/_json.pyi | 5 +- pyarrow-stubs/_parquet.pyi | 7 +- pyarrow-stubs/dataset.pyi | 8 +- pyarrow-stubs/feather.pyi | 7 +- pyarrow-stubs/orc.pyi | 8 +- pyproject.toml | 13 + 14 files changed, 847 insertions(+), 665 deletions(-) diff --git a/pixi.lock b/pixi.lock index d4d281da2dd..65bfb0e61dc 100644 --- a/pixi.lock +++ b/pixi.lock @@ -10,233 +10,253 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-hf3520f5_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.2-h59595ed_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.1.0-h77fa898_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.0-hde9e2c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4bc722e_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.5-h2ad013b_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1c/21/a6b46c91b4c9d1918ee59c305f46850cde7cbea748635a352e7c3c8ed204/mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/2c/f3/61eeef119beb37decb58e7cb29940f19a1464b8608f2cab8a8616aba75fd/numpy-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8e/ee/8a26858ca517e9c64f84b4c7734b89bda8e63bec85c3d2f432d225bb1886/scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . osx-64: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.7.4-h8857fd0_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.2-h73e2aa4_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.0-h1b8f9f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-h87427d6_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-h5846eda_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.1-h87427d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.12.5-h37a9e06_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3a/34/69638cee2e87303f19a0c35e80d42757e14d9aba328f272fdcdc0bf3c9b8/mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/64/1c/401489a7e92c30db413362756c313b9353fb47565015986c55582593e2ae/numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/a4/10/1be32aeaab8728f78f673e7a47dd813222364479b2d6573dbcf0085e83ea/ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c0/04/2bdacc8ac6387b15db6faa40295f8bd25eccf33f1f13e68a72dc3c60a99e/scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
osx-arm64: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.7.4-hf0a4a13_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.6.2-hebf3989_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.0-hfb93653_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-hfb2fe0b_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-hb89a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.1-hfb2fe0b_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.5-h30c5eda_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c4/3c/3e0611348fc53a4a7c80485959478b4f6eae706baf3b7c03cafa22639216/mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/08/61/460fb524bb2d1a8bd4bbcb33d9b0971f9837fdedcfda8478d4c8f5cfd7ee/numpy-2.0.1-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl 
- pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/3d/1d/c218ce83beb4394ba04d05e9aa2ae6ce9fba8405688fe878b0fdb40ce855/ruff-0.5.7-py3-none-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/c8/53/35b4d41f5fd42f5781dbd0dd6c05d35ba8aa75c84ecddc7d44756cd8da2e/scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . win-64: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.7.4-h56e8100_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.6.2-h63175ca_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.0-h2466b09_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.1-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.12.5-h889d299_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h8a93ad2_20.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1e/b7/3a50f318979c8c541428c2f1ee973cda813bcc89614de982dafdd0df2b3e/mypy-1.11.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl - pypi: 
https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/59/f6ad30785a6578ad85ed9c2785f271b39c3e5b6412c66e810d2c60934c9f/numpy-2.0.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/67/1c/4520c98bfc06b9c73cd1457686d4d3935d40046b1ddea08403e5a6deff51/ruff-0.5.7-py3-none-win_amd64.whl - - pypi: 
https://files.pythonhosted.org/packages/aa/7d/43ab67228ef98c6b5dd42ab386eae2d7877036970a0d7e3dd3eb47a0d530/scipy-1.14.1-cp312-cp312-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . 
packages: @@ -353,52 +373,52 @@ packages: timestamp: 1720974491916 - kind: conda name: ca-certificates - version: 2024.7.4 + version: 2024.8.30 build: h56e8100_0 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.7.4-h56e8100_0.conda - sha256: 7f37bb33c7954de1b4d19ad622859feb4f6c58f751c38b895524cad4e44af72e - md5: 9caa97c9504072cd060cf0a3142cc0ed + url: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda + sha256: 0fcac3a7ffcc556649e034a1802aedf795e64227eaa7194d207b01eaf26454c4 + md5: 4c4fd67c18619be5aa65dc5b6c72e490 license: ISC purls: [] - size: 154943 - timestamp: 1720077592592 + size: 158773 + timestamp: 1725019107649 - kind: conda name: ca-certificates - version: 2024.7.4 + version: 2024.8.30 build: h8857fd0_0 subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.7.4-h8857fd0_0.conda - sha256: d16f46c489cb3192305c7d25b795333c5fc17bb0986de20598ed519f8c9cc9e4 - md5: 7df874a4b05b2d2b82826190170eaa0f + url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda + sha256: 593f302d0f44c2c771e1614ee6d56fffdc7d616e6f187669c8b0e34ffce3e1ae + md5: b7e5424e7f06547a903d28e4651dbb21 license: ISC purls: [] - size: 154473 - timestamp: 1720077510541 + size: 158665 + timestamp: 1725019059295 - kind: conda name: ca-certificates - version: 2024.7.4 + version: 2024.8.30 build: hbcca054_0 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.7.4-hbcca054_0.conda - sha256: c1548a3235376f464f9931850b64b02492f379b2f2bb98bc786055329b080446 - md5: 23ab7665c5f63cfb9f1f6195256daac6 + url: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda + sha256: afee721baa6d988e27fef1832f68d6f32ac8cc99cdf6015732224c2841a09cea + md5: c27d1c142233b5bc9ca570c6e2e0c244 license: ISC purls: [] - size: 154853 - timestamp: 1720077432978 + size: 159003 + timestamp: 1725018903918 - kind: conda name: ca-certificates - version: 2024.7.4 + version: 2024.8.30 build: hf0a4a13_0 subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.7.4-hf0a4a13_0.conda - sha256: 33a61116dae7f369b6ce92a7f2a1ff361ae737c675a493b11feb5570b89e0e3b - md5: 21f9a33e5fe996189e470c19c5354dbe + url: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda + sha256: 2db1733f4b644575dbbdd7994a8f338e6ef937f5ebdb74acd557e9dda0211709 + md5: 40dec13fd8348dbe303e57be74bd3d35 license: ISC purls: [] - size: 154517 - timestamp: 1720077468981 + size: 158482 + timestamp: 1725019034582 - kind: pypi name: cfgv version: 3.4.0 @@ -419,14 +439,14 @@ packages: requires_python: '>=3.5' - kind: pypi name: distlib - version: 0.3.8 - url: https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl - sha256: 034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 + version: 0.3.9 + url: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl + sha256: 47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 - kind: pypi name: executing - version: 2.0.1 - url: https://files.pythonhosted.org/packages/80/03/6ea8b1b2a5ab40a7a60dc464d3daa7aa546e0a74d74a9f8ff551ea7905db/executing-2.0.1-py2.py3-none-any.whl - sha256: eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc + version: 2.1.0 + url: 
https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl + sha256: 8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf requires_dist: - asttokens>=2.1.0 ; extra == 'tests' - ipython ; extra == 'tests' @@ -435,26 +455,135 @@ packages: - coverage-enable-subprocess ; extra == 'tests' - littleutils ; extra == 'tests' - rich ; python_full_version >= '3.11' and extra == 'tests' - requires_python: '>=3.5' + requires_python: '>=3.8' - kind: pypi name: filelock - version: 3.15.4 - url: https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl - sha256: 6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 + version: 3.16.1 + url: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl + sha256: 2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 requires_dist: - - furo>=2023.9.10 ; extra == 'docs' - - sphinx-autodoc-typehints!=1.23.4,>=1.25.2 ; extra == 'docs' - - sphinx>=7.2.6 ; extra == 'docs' + - furo>=2024.8.6 ; extra == 'docs' + - sphinx-autodoc-typehints>=2.4.1 ; extra == 'docs' + - sphinx>=8.0.2 ; extra == 'docs' - covdefaults>=2.3 ; extra == 'testing' - - coverage>=7.3.2 ; extra == 'testing' - - diff-cover>=8.0.1 ; extra == 'testing' - - pytest-asyncio>=0.21 ; extra == 'testing' - - pytest-cov>=4.1 ; extra == 'testing' - - pytest-mock>=3.12 ; extra == 'testing' - - pytest-timeout>=2.2 ; extra == 'testing' - - pytest>=7.4.3 ; extra == 'testing' - - virtualenv>=20.26.2 ; extra == 'testing' - - typing-extensions>=4.8 ; python_full_version < '3.11' and extra == 'typing' + - coverage>=7.6.1 ; extra == 'testing' + - diff-cover>=9.2 ; extra == 'testing' + - pytest-asyncio>=0.24 ; extra == 'testing' + - pytest-cov>=5 ; extra == 'testing' + - pytest-mock>=3.14 ; extra == 'testing' + - pytest-timeout>=2.3.1 ; extra == 'testing' + - pytest>=8.3.3 ; extra == 'testing' + - virtualenv>=20.26.4 ; extra == 'testing' + - typing-extensions>=4.12.2 ; python_full_version < '3.11' and extra == 'typing' + requires_python: '>=3.8' +- kind: pypi + name: fsspec + version: 2024.10.0 + url: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl + sha256: 03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 + requires_dist: + - adlfs ; extra == 'abfs' + - adlfs ; extra == 'adl' + - pyarrow>=1 ; extra == 'arrow' + - dask ; extra == 'dask' + - distributed ; extra == 'dask' + - pre-commit ; extra == 'dev' + - ruff ; extra == 'dev' + - numpydoc ; extra == 'doc' + - sphinx ; extra == 'doc' + - sphinx-design ; extra == 'doc' + - sphinx-rtd-theme ; extra == 'doc' + - yarl ; extra == 'doc' + - dropbox ; extra == 'dropbox' + - dropboxdrivefs ; extra == 'dropbox' + - requests ; extra == 'dropbox' + - adlfs ; extra == 'full' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'full' + - dask ; extra == 'full' + - distributed ; extra == 'full' + - dropbox ; extra == 'full' + - dropboxdrivefs ; extra == 'full' + - fusepy ; extra == 'full' + - gcsfs ; extra == 'full' + - libarchive-c ; extra == 'full' + - ocifs ; extra == 'full' + - panel ; extra == 'full' + - paramiko ; extra == 'full' + - pyarrow>=1 ; extra == 'full' + - pygit2 ; extra == 'full' + - requests ; extra == 'full' + - s3fs ; extra == 'full' + - smbprotocol ; extra == 'full' + - tqdm ; extra == 
'full' + - fusepy ; extra == 'fuse' + - gcsfs ; extra == 'gcs' + - pygit2 ; extra == 'git' + - requests ; extra == 'github' + - gcsfs ; extra == 'gs' + - panel ; extra == 'gui' + - pyarrow>=1 ; extra == 'hdfs' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'http' + - libarchive-c ; extra == 'libarchive' + - ocifs ; extra == 'oci' + - s3fs ; extra == 's3' + - paramiko ; extra == 'sftp' + - smbprotocol ; extra == 'smb' + - paramiko ; extra == 'ssh' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test' + - numpy ; extra == 'test' + - pytest ; extra == 'test' + - pytest-asyncio!=0.22.0 ; extra == 'test' + - pytest-benchmark ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-mock ; extra == 'test' + - pytest-recording ; extra == 'test' + - pytest-rerunfailures ; extra == 'test' + - requests ; extra == 'test' + - aiobotocore<3.0.0,>=2.5.4 ; extra == 'test-downstream' + - dask-expr ; extra == 'test-downstream' + - dask[dataframe,test] ; extra == 'test-downstream' + - moto[server]<5,>4 ; extra == 'test-downstream' + - pytest-timeout ; extra == 'test-downstream' + - xarray ; extra == 'test-downstream' + - adlfs ; extra == 'test-full' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test-full' + - cloudpickle ; extra == 'test-full' + - dask ; extra == 'test-full' + - distributed ; extra == 'test-full' + - dropbox ; extra == 'test-full' + - dropboxdrivefs ; extra == 'test-full' + - fastparquet ; extra == 'test-full' + - fusepy ; extra == 'test-full' + - gcsfs ; extra == 'test-full' + - jinja2 ; extra == 'test-full' + - kerchunk ; extra == 'test-full' + - libarchive-c ; extra == 'test-full' + - lz4 ; extra == 'test-full' + - notebook ; extra == 'test-full' + - numpy ; extra == 'test-full' + - ocifs ; extra == 'test-full' + - pandas ; extra == 'test-full' + - panel ; extra == 'test-full' + - paramiko ; extra == 'test-full' + - pyarrow ; extra == 'test-full' + - pyarrow>=1 ; extra == 'test-full' + - pyftpdlib ; extra == 'test-full' + - pygit2 ; extra == 'test-full' + - pytest ; extra == 'test-full' + - pytest-asyncio!=0.22.0 ; extra == 'test-full' + - pytest-benchmark ; extra == 'test-full' + - pytest-cov ; extra == 'test-full' + - pytest-mock ; extra == 'test-full' + - pytest-recording ; extra == 'test-full' + - pytest-rerunfailures ; extra == 'test-full' + - python-snappy ; extra == 'test-full' + - requests ; extra == 'test-full' + - smbprotocol ; extra == 'test-full' + - tqdm ; extra == 'test-full' + - urllib3 ; extra == 'test-full' + - zarr ; extra == 'test-full' + - zstandard ; extra == 'test-full' + - tqdm ; extra == 'tqdm' requires_python: '>=3.8' - kind: pypi name: hatchling @@ -470,17 +599,17 @@ packages: requires_python: '>=3.8' - kind: pypi name: identify - version: 2.6.0 - url: https://files.pythonhosted.org/packages/24/6c/a4f39abe7f19600b74528d0c717b52fff0b300bb0161081510d39c53cb00/identify-2.6.0-py2.py3-none-any.whl - sha256: e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0 + version: 2.6.1 + url: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl + sha256: 53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0 requires_dist: - ukkonen ; extra == 'license' requires_python: '>=3.8' - kind: pypi name: ipython - version: 8.26.0 - url: https://files.pythonhosted.org/packages/73/48/4d2818054671bb272d1b12ca65748a4145dc602a463683b5c21b260becee/ipython-8.26.0-py3-none-any.whl - sha256: e6b347c27bdf9c32ee9d31ae85defc525755a1869f14057e900675b9e8d6e6ff + version: 8.28.0 + url: 
https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl + sha256: 530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35 requires_dist: - decorator - jedi>=0.16 @@ -572,82 +701,22 @@ packages: requires_python: '>=3.6' - kind: conda name: ld_impl_linux-64 - version: '2.40' - build: hf3520f5_7 - build_number: 7 + version: '2.43' + build: h712a8e2_1 + build_number: 1 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-hf3520f5_7.conda - sha256: 764b6950aceaaad0c67ef925417594dd14cd2e22fff864aeef455ac259263d15 - md5: b80f2f396ca2c28b8c14c437a4ed1e74 + url: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda + sha256: 0c21387f9a411e3d1f7f2969026bacfece133c8f1e72faea9cde29c0c19e1f3a + md5: 83e1364586ceb8d0739fbc85b5c95837 + depends: + - __glibc >=2.17,<3.0.a0 constrains: - - binutils_impl_linux-64 2.40 + - binutils_impl_linux-64 2.43 license: GPL-3.0-only license_family: GPL purls: [] - size: 707602 - timestamp: 1718625640445 -- kind: conda - name: libexpat - version: 2.6.2 - build: h59595ed_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.2-h59595ed_0.conda - sha256: 331bb7c7c05025343ebd79f86ae612b9e1e74d2687b8f3179faec234f986ce19 - md5: e7ba12deb7020dd080c6c70e7b6f6a3d - depends: - - libgcc-ng >=12 - constrains: - - expat 2.6.2.* - license: MIT - license_family: MIT - purls: [] - size: 73730 - timestamp: 1710362120304 -- kind: conda - name: libexpat - version: 2.6.2 - build: h63175ca_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.6.2-h63175ca_0.conda - sha256: 79f612f75108f3e16bbdc127d4885bb74729cf66a8702fca0373dad89d40c4b7 - md5: bc592d03f62779511d392c175dcece64 - constrains: - - expat 2.6.2.* - license: MIT - license_family: MIT - purls: [] - size: 139224 - timestamp: 1710362609641 -- kind: conda - name: libexpat - version: 2.6.2 - build: h73e2aa4_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.2-h73e2aa4_0.conda - sha256: a188a77b275d61159a32ab547f7d17892226e7dac4518d2c6ac3ac8fc8dfde92 - md5: 3d1d51c8f716d97c864d12f7af329526 - constrains: - - expat 2.6.2.* - license: MIT - license_family: MIT - purls: [] - size: 69246 - timestamp: 1710362566073 -- kind: conda - name: libexpat - version: 2.6.2 - build: hebf3989_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.6.2-hebf3989_0.conda - sha256: ba7173ac30064ea901a4c9fb5a51846dcc25512ceb565759be7d18cbf3e5415e - md5: e3cde7cfa87f82f7cb13d482d5e0ad09 - constrains: - - expat 2.6.2.* - license: MIT - license_family: MIT - purls: [] - size: 63655 - timestamp: 1710362424980 + size: 669616 + timestamp: 1727304687962 - kind: conda name: libffi version: 3.4.2 @@ -710,38 +779,57 @@ packages: size: 42063 timestamp: 1636489106777 - kind: conda - name: libgcc-ng - version: 14.1.0 - build: h77fa898_0 + name: libgcc + version: 14.2.0 + build: h77fa898_1 + build_number: 1 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.1.0-h77fa898_0.conda - sha256: b8e869ac96591cda2704bf7e77a301025e405227791a0bddf14a3dac65125538 - md5: ca0fad6a41ddaef54a153b78eccb5037 + url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda + sha256: 53eb8a79365e58849e7b1a068d31f4f9e718dc938d6f2c03e960345739a03569 + md5: 3cb76c3f10d3bc7f1105b2fc9db984df depends: - _libgcc_mutex 0.1 conda_forge - _openmp_mutex >=4.5 
constrains: - - libgomp 14.1.0 h77fa898_0 + - libgomp 14.2.0 h77fa898_1 + - libgcc-ng ==14.2.0=*_1 license: GPL-3.0-only WITH GCC-exception-3.1 license_family: GPL purls: [] - size: 842109 - timestamp: 1719538896937 + size: 848745 + timestamp: 1729027721139 +- kind: conda + name: libgcc-ng + version: 14.2.0 + build: h69a702a_1 + build_number: 1 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda + sha256: 3a76969c80e9af8b6e7a55090088bc41da4cffcde9e2c71b17f44d37b7cb87f7 + md5: e39480b9ca41323497b05492a63bc35b + depends: + - libgcc 14.2.0 h77fa898_1 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 54142 + timestamp: 1729027726517 - kind: conda name: libgomp - version: 14.1.0 - build: h77fa898_0 + version: 14.2.0 + build: h77fa898_1 + build_number: 1 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.1.0-h77fa898_0.conda - sha256: 7699df61a1f6c644b3576a40f54791561f2845983120477a16116b951c9cdb05 - md5: ae061a5ed5f05818acdf9adab72c146d + url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda + sha256: 1911c29975ec99b6b906904040c855772ccb265a1c79d5d75c8ceec4ed89cd63 + md5: cc3573974587f12dda90d96e3e55a702 depends: - _libgcc_mutex 0.1 conda_forge license: GPL-3.0-only WITH GCC-exception-3.1 license_family: GPL purls: [] - size: 456925 - timestamp: 1719538796073 + size: 460992 + timestamp: 1729027639220 - kind: conda name: libnsl version: 2.0.1 @@ -759,65 +847,66 @@ packages: timestamp: 1697359010159 - kind: conda name: libsqlite - version: 3.46.0 - build: h1b8f9f3_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.0-h1b8f9f3_0.conda - sha256: 63af1a9e3284c7e4952364bafe7267e41e2d9d8bcc0e85a4ea4b0ec02d3693f6 - md5: 5dadfbc1a567fe6e475df4ce3148be09 - depends: - - __osx >=10.13 - - libzlib >=1.2.13,<2.0a0 - license: Unlicense - purls: [] - size: 908643 - timestamp: 1718050720117 -- kind: conda - name: libsqlite - version: 3.46.0 + version: 3.46.1 build: h2466b09_0 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.0-h2466b09_0.conda - sha256: 662bd7e0d63c5b8c31cca19b91649e798319b93568a2ba8d1375efb91eeb251b - md5: 951b0a3a463932e17414cd9f047fa03d + url: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda + sha256: ef83f90961630bc54a95e48062b05cf9c9173a822ea01784288029613a45eea4 + md5: 8a7c1ad01f58623bfbae8d601db7cf3b depends: - ucrt >=10.0.20348.0 - vc >=14.2,<15 - vc14_runtime >=14.29.30139 license: Unlicense purls: [] - size: 876677 - timestamp: 1718051113874 + size: 876666 + timestamp: 1725354171439 +- kind: conda + name: libsqlite + version: 3.46.1 + build: h4b8f8c9_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda + sha256: 1d075cb823f0cad7e196871b7c57961d669cbbb6cd0e798bf50cbf520dda65fb + md5: 84de0078b58f899fc164303b0603ff0e + depends: + - __osx >=10.13 + - libzlib >=1.3.1,<2.0a0 + license: Unlicense + purls: [] + size: 908317 + timestamp: 1725353652135 - kind: conda name: libsqlite - version: 3.46.0 - build: hde9e2c9_0 + version: 3.46.1 + build: hadc24fc_0 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.0-hde9e2c9_0.conda - sha256: daee3f68786231dad457d0dfde3f7f1f9a7f2018adabdbb864226775101341a8 - md5: 18aa975d2094c34aef978060ae7da7d8 + url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda + sha256: 
9851c049abafed3ee329d6c7c2033407e2fc269d33a75c071110ab52300002b0 + md5: 36f79405ab16bf271edb55b213836dac depends: - - libgcc-ng >=12 - - libzlib >=1.2.13,<2.0a0 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 865346 - timestamp: 1718050628718 + size: 865214 + timestamp: 1725353659783 - kind: conda name: libsqlite - version: 3.46.0 - build: hfb93653_0 + version: 3.46.1 + build: hc14010f_0 subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.0-hfb93653_0.conda - sha256: 73048f9cb8647d3d3bfe6021c0b7d663e12cffbe9b4f31bd081e713b0a9ad8f9 - md5: 12300188028c9bc02da965128b91b517 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda + sha256: 3725f962f490c5d44dae326d5f5b2e3c97f71a6322d914ccc85b5ddc2e50d120 + md5: 58050ec1724e58668d0126a1615553fa depends: - __osx >=11.0 - - libzlib >=1.2.13,<2.0a0 + - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 830198 - timestamp: 1718050644825 + size: 829500 + timestamp: 1725353720793 - kind: conda name: libuuid version: 2.38.1 @@ -833,95 +922,81 @@ packages: purls: [] size: 33601 timestamp: 1680112270483 -- kind: conda - name: libxcrypt - version: 4.4.36 - build: hd590300_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c - md5: 5aa797f8787fe7a17d1b0821485b5adc - depends: - - libgcc-ng >=12 - license: LGPL-2.1-or-later - purls: [] - size: 100393 - timestamp: 1702724383534 - kind: conda name: libzlib version: 1.3.1 - build: h2466b09_1 - build_number: 1 + build: h2466b09_2 + build_number: 2 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_1.conda - sha256: b13846a54a15243e15f96fec06b526d8155adc6a1ac2b6ed47a88f6a71a94b68 - md5: d4483ca8afc57ddf1f6dded53b36c17f + url: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda + sha256: ba945c6493449bed0e6e29883c4943817f7c79cbff52b83360f7b341277c6402 + md5: 41fbfac52c601159df6c01f875de31b9 depends: - ucrt >=10.0.20348.0 - vc >=14.2,<15 - vc14_runtime >=14.29.30139 constrains: - - zlib 1.3.1 *_1 + - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 56186 - timestamp: 1716874730539 + size: 55476 + timestamp: 1727963768015 - kind: conda name: libzlib version: 1.3.1 - build: h4ab18f5_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda - sha256: adf6096f98b537a11ae3729eaa642b0811478f0ea0402ca67b5108fe2cb0010d - md5: 57d7dc60e9325e3de37ff8dffd18e814 + build: h8359307_2 + build_number: 2 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda + sha256: ce34669eadaba351cd54910743e6a2261b67009624dbc7daeeafdef93616711b + md5: 369964e85dc26bfe78f41399b366c435 depends: - - libgcc-ng >=12 + - __osx >=11.0 constrains: - - zlib 1.3.1 *_1 + - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 61574 - timestamp: 1716874187109 + size: 46438 + timestamp: 1727963202283 - kind: conda name: libzlib version: 1.3.1 - build: h87427d6_1 - build_number: 1 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-h87427d6_1.conda - sha256: 80a62db652b1da0ccc100812a1d86e94f75028968991bfb17f9536f3aa72d91d - md5: b7575b5aa92108dcc9aaab0f05f2dbce + build: hb9d3cd8_2 + build_number: 2 + subdir: linux-64 + url: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 depends: - - __osx >=10.13 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 constrains: - - zlib 1.3.1 *_1 + - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 57372 - timestamp: 1716874211519 + size: 60963 + timestamp: 1727963148474 - kind: conda name: libzlib version: 1.3.1 - build: hfb2fe0b_1 - build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-hfb2fe0b_1.conda - sha256: c34365dd37b0eab27b9693af32a1f7f284955517c2cc91f1b88a7ef4738ff03e - md5: 636077128927cf79fd933276dc3aed47 + build: hd23fc13_2 + build_number: 2 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda + sha256: 8412f96504fc5993a63edf1e211d042a1fd5b1d51dedec755d2058948fcced09 + md5: 003a54a4e32b02f7355b50a837e699da depends: - - __osx >=11.0 + - __osx >=10.13 constrains: - - zlib 1.3.1 *_1 + - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 46921 - timestamp: 1716874262512 + size: 57133 + timestamp: 1727963183990 - kind: pypi name: matplotlib-inline version: 0.1.7 @@ -932,9 +1007,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: mypy - version: 1.11.1 - url: https://files.pythonhosted.org/packages/1c/21/a6b46c91b4c9d1918ee59c305f46850cde7cbea748635a352e7c3c8ed204/mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl - sha256: b868d3bcff720dd7217c383474008ddabaf048fad8d78ed948bb4b624870a417 + version: 1.12.1 + url: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl + sha256: 1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735 requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 @@ -946,9 +1021,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: mypy - version: 1.11.1 - url: https://files.pythonhosted.org/packages/1e/b7/3a50f318979c8c541428c2f1ee973cda813bcc89614de982dafdd0df2b3e/mypy-1.11.1-cp312-cp312-win_amd64.whl - sha256: 64f4a90e3ea07f590c5bcf9029035cf0efeae5ba8be511a8caada1a4893f5525 + version: 1.12.1 + url: https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl + sha256: a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6 requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 @@ -960,9 +1035,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: mypy - version: 1.11.1 - url: https://files.pythonhosted.org/packages/3a/34/69638cee2e87303f19a0c35e80d42757e14d9aba328f272fdcdc0bf3c9b8/mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl - sha256: f39918a50f74dc5969807dcfaecafa804fa7f90c9d60506835036cc1bc891dc8 + version: 1.12.1 + url: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl + sha256: 673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811 requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 @@ -974,9 +1049,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: mypy - version: 1.11.1 - url: 
https://files.pythonhosted.org/packages/c4/3c/3e0611348fc53a4a7c80485959478b4f6eae706baf3b7c03cafa22639216/mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl - sha256: 0bc71d1fb27a428139dd78621953effe0d208aed9857cb08d002280b0422003a + version: 1.12.1 + url: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl + sha256: 02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66 requires_dist: - typing-extensions>=4.6.0 - mypy-extensions>=1.0.0 @@ -995,150 +1070,170 @@ packages: - kind: conda name: ncurses version: '6.5' - build: h5846eda_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-h5846eda_0.conda - sha256: 6ecc73db0e49143092c0934355ac41583a5d5a48c6914c5f6ca48e562d3a4b79 - md5: 02a888433d165c99bf09784a7b14d900 + build: h7bae524_1 + build_number: 1 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda + sha256: 27d0b9ff78ad46e1f3a6c96c479ab44beda5f96def88e2fe626e0a49429d8afc + md5: cb2b0ea909b97b3d70cd3921d1445e1a + depends: + - __osx >=11.0 license: X11 AND BSD-3-Clause purls: [] - size: 823601 - timestamp: 1715195267791 + size: 802321 + timestamp: 1724658775723 - kind: conda name: ncurses version: '6.5' - build: h59595ed_0 + build: he02047a_1 + build_number: 1 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda - sha256: 4fc3b384f4072b68853a0013ea83bdfd3d66b0126e2238e1d6e1560747aa7586 - md5: fcea371545eda051b6deafb24889fc69 + url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda + sha256: 6a1d5d8634c1a07913f1c525db6455918cbc589d745fac46d9d6e30340c8731a + md5: 70caf8bb6cf39a0b6b7efc885f51c0fe depends: + - __glibc >=2.17,<3.0.a0 - libgcc-ng >=12 license: X11 AND BSD-3-Clause purls: [] - size: 887465 - timestamp: 1715194722503 + size: 889086 + timestamp: 1724658547447 - kind: conda name: ncurses version: '6.5' - build: hb89a1cb_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-hb89a1cb_0.conda - sha256: 87d7cf716d9d930dab682cb57b3b8d3a61940b47d6703f3529a155c938a6990a - md5: b13ad5724ac9ae98b6b4fd87e4500ba4 + build: hf036a51_1 + build_number: 1 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda + sha256: b0b3180039ef19502525a2abd5833c00f9624af830fd391f851934d57bffb9af + md5: e102bbf8a6ceeaf429deab8032fc8977 + depends: + - __osx >=10.13 license: X11 AND BSD-3-Clause purls: [] - size: 795131 - timestamp: 1715194898402 + size: 822066 + timestamp: 1724658603042 - kind: pypi name: nodeenv version: 1.9.1 url: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl sha256: ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' +- kind: pypi + name: nodejs-wheel-binaries + version: 20.18.0 + url: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl + sha256: f95fb0989dfc54fd6932850e589000a8d6fc902527cebe7afd747696561d94b8 + requires_python: '>=3.7' +- kind: pypi + name: nodejs-wheel-binaries + version: 20.18.0 + url: 
https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 33b138288dbeb9aafc6d54f43fbca6545b37e8fd9cbb8f68275ff2a47d4fed07 + requires_python: '>=3.7' +- kind: pypi + name: nodejs-wheel-binaries + version: 20.18.0 + url: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl + sha256: 74273eab1c2423c04d034d3f707f517da32d3a2b20ca244b5667f3a4e38003ac + requires_python: '>=3.7' +- kind: pypi + name: nodejs-wheel-binaries + version: 20.18.0 + url: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl + sha256: 51c0cecb429a111351a54346909e672a57b96233a363c79cc0a2bbdbfa397304 + requires_python: '>=3.7' - kind: pypi name: numpy - version: 2.0.1 - url: https://files.pythonhosted.org/packages/08/61/460fb524bb2d1a8bd4bbcb33d9b0971f9837fdedcfda8478d4c8f5cfd7ee/numpy-2.0.1-cp312-cp312-macosx_11_0_arm64.whl - sha256: 7d6fddc5fe258d3328cd8e3d7d3e02234c5d70e01ebe377a6ab92adb14039cb4 - requires_python: '>=3.9' + version: 2.1.2 + url: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl + sha256: faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1 + requires_python: '>=3.10' - kind: pypi name: numpy - version: 2.0.1 - url: https://files.pythonhosted.org/packages/2c/f3/61eeef119beb37decb58e7cb29940f19a1464b8608f2cab8a8616aba75fd/numpy-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - sha256: 6790654cb13eab303d8402354fabd47472b24635700f631f041bd0b65e37298a - requires_python: '>=3.9' + version: 2.1.2 + url: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: e2b49c3c0804e8ecb05d59af8386ec2f74877f7ca8fd9c1e00be2672e4d399b1 + requires_python: '>=3.10' - kind: pypi name: numpy - version: 2.0.1 - url: https://files.pythonhosted.org/packages/64/1c/401489a7e92c30db413362756c313b9353fb47565015986c55582593e2ae/numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl - sha256: 6bf4e6f4a2a2e26655717a1983ef6324f2664d7011f6ef7482e8c0b3d51e82ac - requires_python: '>=3.9' + version: 2.1.2 + url: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl + sha256: b42a1a511c81cc78cbc4539675713bbcf9d9c3913386243ceff0e9429ca892fe + requires_python: '>=3.10' - kind: pypi name: numpy - version: 2.0.1 - url: https://files.pythonhosted.org/packages/b5/59/f6ad30785a6578ad85ed9c2785f271b39c3e5b6412c66e810d2c60934c9f/numpy-2.0.1-cp312-cp312-win_amd64.whl - sha256: bb2124fdc6e62baae159ebcfa368708867eb56806804d005860b6007388df171 - requires_python: '>=3.9' + version: 2.1.2 + url: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl + sha256: f1eb068ead09f4994dec71c24b2844f1e4e4e013b9629f812f292f04bd1510d9 + requires_python: '>=3.10' - kind: conda name: openssl - version: 3.3.1 - build: h2466b09_2 - build_number: 2 + version: 3.3.2 + build: h2466b09_0 subdir: win-64 - url: 
https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.1-h2466b09_2.conda - sha256: d86c4fa31294ad9068717788197e97e5637e056c82745ffb6d0e88fd1fef1a9d - md5: 375dbc2a4d5a2e4c738703207e8e368b + url: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda + sha256: a45c42f3577294e22ac39ddb6ef5a64fd5322e8a6725afefbf4f2b4109340bf9 + md5: 1dc86753693df5e3326bb8a85b74c589 depends: - ca-certificates - ucrt >=10.0.20348.0 - vc >=14.2,<15 - vc14_runtime >=14.29.30139 - constrains: - - pyopenssl >=22.1 license: Apache-2.0 license_family: Apache purls: [] - size: 8385012 - timestamp: 1721197465883 + size: 8396053 + timestamp: 1725412961673 - kind: conda name: openssl - version: 3.3.1 - build: h4bc722e_2 - build_number: 2 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4bc722e_2.conda - sha256: b294b3cc706ad1048cdb514f0db3da9f37ae3fcc0c53a7104083dd0918adb200 - md5: e1b454497f9f7c1147fdde4b53f1b512 + version: 3.3.2 + build: h8359307_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda + sha256: 940fa01c4dc6152158fe8943e05e55a1544cab639df0994e3b35937839e4f4d1 + md5: 1773ebccdc13ec603356e8ff1db9e958 depends: - - __glibc >=2.17,<3.0.a0 + - __osx >=11.0 - ca-certificates - - libgcc-ng >=12 - constrains: - - pyopenssl >=22.1 license: Apache-2.0 license_family: Apache purls: [] - size: 2895213 - timestamp: 1721194688955 + size: 2882450 + timestamp: 1725410638874 - kind: conda name: openssl - version: 3.3.1 - build: h87427d6_2 - build_number: 2 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.1-h87427d6_2.conda - sha256: 3cb0c05fbfd8cdb9b767396fc0e0af2d78eb4d68592855481254104330d4a4eb - md5: 3f3dbeedbee31e257866407d9dea1ff5 + version: 3.3.2 + build: hb9d3cd8_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda + sha256: cee91036686419f6dd6086902acf7142b4916e1c4ba042e9ca23e151da012b6d + md5: 4d638782050ab6faa27275bed57e9b4e depends: - - __osx >=10.13 + - __glibc >=2.17,<3.0.a0 - ca-certificates - constrains: - - pyopenssl >=22.1 + - libgcc >=13 license: Apache-2.0 license_family: Apache purls: [] - size: 2552939 - timestamp: 1721194674491 + size: 2891789 + timestamp: 1725410790053 - kind: conda name: openssl - version: 3.3.1 - build: hfb2fe0b_2 - build_number: 2 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.1-hfb2fe0b_2.conda - sha256: dd7d988636f74473ebdfe15e05c5aabdb53a1d2a846c839d62289b0c37f81548 - md5: 9b551a504c1cc8f8b7b22c01814da8ba + version: 3.3.2 + build: hd23fc13_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda + sha256: 2b75d4b56e45992adf172b158143742daeb316c35274b36f385ccb6644e93268 + md5: 2ff47134c8e292868a4609519b1ea3b6 depends: - - __osx >=11.0 + - __osx >=10.13 - ca-certificates - constrains: - - pyopenssl >=22.1 license: Apache-2.0 license_family: Apache purls: [] - size: 2899682 - timestamp: 1721194599446 + size: 2544654 + timestamp: 1725410973572 - kind: pypi name: packaging version: '24.1' @@ -1147,13 +1242,13 @@ packages: requires_python: '>=3.8' - kind: pypi name: pandas-stubs - version: 2.2.2.240807 - url: https://files.pythonhosted.org/packages/0a/f9/22c91632ea1b4c6165952f677bf9ad95f9ac36ffd7ef3e6450144e6d8b1a/pandas_stubs-2.2.2.240807-py3-none-any.whl - sha256: 893919ad82be4275f0d07bb47a95d08bae580d3fdea308a7acfcb3f02e76186e + version: 2.2.3.241009 + url: 
https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl + sha256: 3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa requires_dist: - numpy>=1.23.5 - types-pytz>=2022.1.1 - requires_python: '>=3.9' + requires_python: '>=3.10' - kind: pypi name: parso version: 0.8.4 @@ -1179,22 +1274,42 @@ packages: sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 requires_dist: - ptyprocess>=0.5 +- kind: conda + name: pip + version: '24.2' + build: pyh8b19718_1 + build_number: 1 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda + sha256: d820e5358bcb117fa6286e55d4550c60b0332443df62121df839eab2d11c890b + md5: 6c78fbb8ddfd64bcb55b5cbafd2d2c43 + depends: + - python >=3.8,<3.13.0a0 + - setuptools + - wheel + license: MIT + license_family: MIT + purls: + - pkg:pypi/pip?source=hash-mapping + size: 1237976 + timestamp: 1724954490262 - kind: pypi name: platformdirs - version: 4.2.2 - url: https://files.pythonhosted.org/packages/68/13/2aa1f0e1364feb2c9ef45302f387ac0bd81484e9c9a4c5688a322fbdfd08/platformdirs-4.2.2-py3-none-any.whl - sha256: 2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee + version: 4.3.6 + url: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl + sha256: 73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb requires_dist: - - furo>=2023.9.10 ; extra == 'docs' - - proselint>=0.13 ; extra == 'docs' - - sphinx-autodoc-typehints>=1.25.2 ; extra == 'docs' - - sphinx>=7.2.6 ; extra == 'docs' + - furo>=2024.8.6 ; extra == 'docs' + - proselint>=0.14 ; extra == 'docs' + - sphinx-autodoc-typehints>=2.4 ; extra == 'docs' + - sphinx>=8.0.2 ; extra == 'docs' - appdirs==1.4.4 ; extra == 'test' - covdefaults>=2.3 ; extra == 'test' - - pytest-cov>=4.1 ; extra == 'test' - - pytest-mock>=3.12 ; extra == 'test' - - pytest>=7.4.3 ; extra == 'test' - - mypy>=1.8 ; extra == 'type' + - pytest-cov>=5 ; extra == 'test' + - pytest-mock>=3.14 ; extra == 'test' + - pytest>=8.3.2 ; extra == 'test' + - mypy>=1.11.2 ; extra == 'type' requires_python: '>=3.8' - kind: pypi name: pluggy @@ -1209,9 +1324,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pre-commit - version: 3.8.0 - url: https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl - sha256: 9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f + version: 4.0.1 + url: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl + sha256: efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878 requires_dist: - cfgv>=2.0.0 - identify>=1.0.0 @@ -1221,9 +1336,9 @@ packages: requires_python: '>=3.9' - kind: pypi name: prompt-toolkit - version: 3.0.47 - url: https://files.pythonhosted.org/packages/e8/23/22750c4b768f09386d1c3cc4337953e8936f48a888fa6dddfb669b2c9088/prompt_toolkit-3.0.47-py3-none-any.whl - sha256: 0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10 + version: 3.0.48 + url: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl + sha256: f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e requires_dist: - wcwidth requires_python: '>=3.7.0' @@ 
-1242,8 +1357,8 @@ packages: - kind: pypi name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl - sha256: f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053 + url: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl + sha256: a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03 requires_dist: - numpy>=1.16.6 - pytest ; extra == 'test' @@ -1255,8 +1370,8 @@ packages: - kind: pypi name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl - sha256: 392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7 + url: https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl + sha256: e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4 requires_dist: - numpy>=1.16.6 - pytest ; extra == 'test' @@ -1268,8 +1383,8 @@ packages: - kind: pypi name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl - sha256: 9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22 + url: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl + sha256: 2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3 requires_dist: - numpy>=1.16.6 - pytest ; extra == 'test' @@ -1281,8 +1396,8 @@ packages: - kind: pypi name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl - sha256: b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b + url: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl + sha256: 1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977 requires_dist: - numpy>=1.16.6 - pytest ; extra == 'test' @@ -1295,7 +1410,7 @@ packages: name: pyarrow-stubs version: '17.9' path: . 
- sha256: 78ad9aa13f92b8ac322f4c09edeb064b3e3c412b70d661e192ef6aabe143f0e5 + sha256: 7a6c58e69d86f0d33726eec03e3c168c9bf7cabbef3a20cd64419de2f7489a69 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' @@ -1308,142 +1423,149 @@ packages: requires_dist: - colorama>=0.4.6 ; extra == 'windows-terminal' requires_python: '>=3.8' +- kind: pypi + name: pyright + version: 1.1.385 + url: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl + sha256: e5b9a1b8d492e13004d822af94d07d235f2c7c158457293b51ab2214c8c5b375 + requires_dist: + - nodeenv>=1.6.0 + - typing-extensions>=4.1 + - twine>=3.4.1 ; extra == 'all' + - nodejs-wheel-binaries ; extra == 'all' + - twine>=3.4.1 ; extra == 'dev' + - nodejs-wheel-binaries ; extra == 'nodejs' + requires_python: '>=3.7' - kind: conda name: python - version: 3.12.5 - build: h2ad013b_0_cpython - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.5-h2ad013b_0_cpython.conda - sha256: e2aad83838988725d4ffba4e9717b9328054fd18a668cff3377e0c50f109e8bd - md5: 9c56c4df45f6571b13111d8df2448692 + version: 3.11.0 + build: h3ba56d0_1_cpython + build_number: 1 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda + sha256: 28a54d78cd2624a12bd2ceb0f1816b0cba9b4fd97df846b5843b3c1d51642ab2 + md5: 2aa7ca3702d9afd323ca34a9d98879d1 depends: - - __glibc >=2.17,<3.0.a0 - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - - libexpat >=2.6.2,<3.0a0 - libffi >=3.4,<4.0a0 - - libgcc-ng >=12 - - libnsl >=2.0.1,<2.1.0a0 - - libsqlite >=3.46.0,<4.0a0 - - libuuid >=2.38.1,<3.0a0 - - libxcrypt >=4.4.36 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.3.1,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 + - libsqlite >=3.40.0,<4.0a0 + - libzlib >=1.2.13,<2.0.0a0 + - ncurses >=6.3,<7.0a0 + - openssl >=3.0.7,<4.0a0 + - readline >=8.1.2,<9.0a0 + - tk >=8.6.12,<8.7.0a0 - tzdata - xz >=5.2.6,<6.0a0 constrains: - - python_abi 3.12.* *_cp312 + - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 31663253 - timestamp: 1723143721353 + size: 14492975 + timestamp: 1673699560906 - kind: conda name: python - version: 3.12.5 - build: h30c5eda_0_cpython - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.5-h30c5eda_0_cpython.conda - sha256: 1319e918fb54c9491832a9731cad00235a76f61c6f9b23fc0f70cdfb74c950ea - md5: 5e315581e2948dfe3bcac306540e9803 + version: 3.11.0 + build: hcf16a7b_0_cpython + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 + sha256: 20d1f1b5dc620b745c325844545fd5c0cdbfdb2385a0e27ef1507399844c8c6d + md5: 13ee3577afc291dabd2d9edc59736688 depends: - - __osx >=11.0 - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.6.2,<3.0a0 - - libffi >=3.4,<4.0a0 - - libsqlite >=3.46.0,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.3.1,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 + - libffi >=3.4.2,<3.5.0a0 + - libsqlite >=3.39.4,<4.0a0 + - libzlib >=1.2.13,<2.0.0a0 + - openssl >=3.0.5,<4.0a0 + - tk >=8.6.12,<8.7.0a0 - tzdata - - xz >=5.2.6,<6.0a0 + - vc >=14.1,<15 + - vs2015_runtime >=14.16.27033 + - xz >=5.2.6,<5.3.0a0 constrains: - - python_abi 3.12.* *_cp312 + - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 12926356 - timestamp: 1723142203193 + size: 19819816 + timestamp: 1666678800085 - kind: conda name: python - version: 3.12.5 - build: 
h37a9e06_0_cpython - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/python-3.12.5-h37a9e06_0_cpython.conda - sha256: c0f39e625b2fd65f70a9cc086fe4b25cc72228453dbbcd92cd5d140d080e38c5 - md5: 517cb4e16466f8d96ba2a72897d14c48 + version: 3.11.0 + build: he550d4f_1_cpython + build_number: 1 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda + sha256: 464f998e406b645ba34771bb53a0a7c2734e855ee78dd021aa4dedfdb65659b7 + md5: 8d14fc2aa12db370a443753c8230be1e depends: - - __osx >=10.13 - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.6.2,<3.0a0 + - ld_impl_linux-64 >=2.36.1 - libffi >=3.4,<4.0a0 - - libsqlite >=3.46.0,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.3.1,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 + - libgcc-ng >=12 + - libnsl >=2.0.0,<2.1.0a0 + - libsqlite >=3.40.0,<4.0a0 + - libuuid >=2.32.1,<3.0a0 + - libzlib >=1.2.13,<2.0.0a0 + - ncurses >=6.3,<7.0a0 + - openssl >=3.0.7,<4.0a0 + - readline >=8.1.2,<9.0a0 + - tk >=8.6.12,<8.7.0a0 - tzdata - xz >=5.2.6,<6.0a0 constrains: - - python_abi 3.12.* *_cp312 + - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 12173272 - timestamp: 1723142761765 + size: 31476523 + timestamp: 1673700777998 - kind: conda name: python - version: 3.12.5 - build: h889d299_0_cpython - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/python-3.12.5-h889d299_0_cpython.conda - sha256: 4cef304eb8877fd3094c14b57097ccc1b817b4afbf2223dd45d2b61e44064740 - md5: db056d8b140ab2edd56a2f9bdb203dcd + version: 3.11.0 + build: he7542f4_1_cpython + build_number: 1 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda + sha256: 5c069c9908e48a4490a56d3752c0bc93c2fc93ab8d8328efc869fdc707618e9f + md5: 9ecfa530b33aefd0d22e0272336f638a depends: - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.6.2,<3.0a0 - libffi >=3.4,<4.0a0 - - libsqlite >=3.46.0,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.3.1,<4.0a0 - - tk >=8.6.13,<8.7.0a0 + - libsqlite >=3.40.0,<4.0a0 + - libzlib >=1.2.13,<2.0.0a0 + - ncurses >=6.3,<7.0a0 + - openssl >=3.0.7,<4.0a0 + - readline >=8.1.2,<9.0a0 + - tk >=8.6.12,<8.7.0a0 - tzdata - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - xz >=5.2.6,<6.0a0 constrains: - - python_abi 3.12.* *_cp312 + - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 15897752 - timestamp: 1723141830317 + size: 15410083 + timestamp: 1673762717308 - kind: pypi name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl - sha256: 7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 + url: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: 3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 requires_python: '>=3.8' - kind: pypi name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl - sha256: c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab + url: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl + sha256: 
1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee requires_python: '>=3.8' - kind: pypi name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl - sha256: ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 + url: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl + sha256: e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 requires_python: '>=3.8' - kind: pypi name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - sha256: 80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 + url: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl + sha256: cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 requires_python: '>=3.8' - kind: conda name: readline @@ -1496,33 +1618,33 @@ packages: timestamp: 1679532707590 - kind: pypi name: ruff - version: 0.5.7 - url: https://files.pythonhosted.org/packages/3d/1d/c218ce83beb4394ba04d05e9aa2ae6ce9fba8405688fe878b0fdb40ce855/ruff-0.5.7-py3-none-macosx_11_0_arm64.whl - sha256: eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e + version: 0.7.0 + url: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl + sha256: ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2 requires_python: '>=3.7' - kind: pypi name: ruff - version: 0.5.7 - url: https://files.pythonhosted.org/packages/67/1c/4520c98bfc06b9c73cd1457686d4d3935d40046b1ddea08403e5a6deff51/ruff-0.5.7-py3-none-win_amd64.whl - sha256: 083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3 + version: 0.7.0 + url: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9 requires_python: '>=3.7' - kind: pypi name: ruff - version: 0.5.7 - url: https://files.pythonhosted.org/packages/a4/10/1be32aeaab8728f78f673e7a47dd813222364479b2d6573dbcf0085e83ea/ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl - sha256: 00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be + version: 0.7.0 + url: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl + sha256: 214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06 requires_python: '>=3.7' - kind: pypi name: ruff - version: 0.5.7 - url: https://files.pythonhosted.org/packages/c8/3b/2b683be597bbd02046678fc3fc1c199c641512b20212073b58f173822bb3/ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - sha256: 8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e + version: 0.7.0 + url: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl + sha256: 496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737 requires_python: '>=3.7' - kind: pypi name: scipy version: 1.14.1 - url: 
https://files.pythonhosted.org/packages/8e/ee/8a26858ca517e9c64f84b4c7734b89bda8e63bec85c3d2f432d225bb1886/scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - sha256: 8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 + url: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + sha256: fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 requires_dist: - numpy<2.3,>=1.23.5 - pytest ; extra == 'test' @@ -1563,8 +1685,8 @@ packages: - kind: pypi name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/aa/7d/43ab67228ef98c6b5dd42ab386eae2d7877036970a0d7e3dd3eb47a0d530/scipy-1.14.1-cp312-cp312-win_amd64.whl - sha256: 2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f + url: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl + sha256: c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 requires_dist: - numpy<2.3,>=1.23.5 - pytest ; extra == 'test' @@ -1605,8 +1727,8 @@ packages: - kind: pypi name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/c0/04/2bdacc8ac6387b15db6faa40295f8bd25eccf33f1f13e68a72dc3c60a99e/scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl - sha256: 631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d + url: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl + sha256: 2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 requires_dist: - numpy<2.3,>=1.23.5 - pytest ; extra == 'test' @@ -1647,8 +1769,8 @@ packages: - kind: pypi name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/c8/53/35b4d41f5fd42f5781dbd0dd6c05d35ba8aa75c84ecddc7d44756cd8da2e/scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl - sha256: af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 + url: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl + sha256: 716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 requires_dist: - numpy<2.3,>=1.23.5 - pytest ; extra == 'test' @@ -1686,6 +1808,23 @@ packages: - doit>=0.36.0 ; extra == 'dev' - pydevtool ; extra == 'dev' requires_python: '>=3.10' +- kind: conda + name: setuptools + version: 75.1.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda + sha256: 6725235722095c547edd24275053c615158d6163f396550840aebd6e209e4738 + md5: d5cd48392c67fb6849ba459c2c2b671f + depends: + - python >=3.8 + license: MIT + license_family: MIT + purls: + - pkg:pypi/setuptools?source=hash-mapping + size: 777462 + timestamp: 1727249510532 - kind: pypi name: six version: 1.16.0 @@ -1791,9 +1930,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: trove-classifiers - version: 2024.7.2 - url: https://files.pythonhosted.org/packages/0f/b0/09794439a62a7dc18bffdbf145aaf50297fd994890b11da27a13e376b947/trove_classifiers-2024.7.2-py3-none-any.whl - sha256: ccc57a33717644df4daca018e7ec3ef57a835c48e96a1e71fc07eb7edac67af6 + version: 2024.10.16 + url: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl + 
sha256: 9b02a4cb49bd2e85c13e728ee461f4f332d6334736b18d61254c964643687144 - kind: pypi name: types-cffi version: 1.16.0.20240331 @@ -1804,15 +1943,15 @@ packages: requires_python: '>=3.8' - kind: pypi name: types-pytz - version: 2024.1.0.20240417 - url: https://files.pythonhosted.org/packages/e8/8d/f5dc5239d59bb4a7b58e2b6d0dc6f2c2ba797b110f83cdda8479508c63dd/types_pytz-2024.1.0.20240417-py3-none-any.whl - sha256: 8335d443310e2db7b74e007414e74c4f53b67452c0cb0d228ca359ccfba59659 + version: 2024.2.0.20241003 + url: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl + sha256: 3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7 requires_python: '>=3.8' - kind: pypi name: types-setuptools - version: 71.1.0.20240806 - url: https://files.pythonhosted.org/packages/17/91/69c62223c0d6659414e9e126eee77902b83ac0444f92f475b84409953612/types_setuptools-71.1.0.20240806-py3-none-any.whl - sha256: 3bd8dd02039be0bb79ad880d8893b8eefcb022fabbeeb61245c61b20c9ab1ed0 + version: 75.2.0.20241019 + url: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl + sha256: 2e48ff3acd4919471e80d5e3f049cce5c177e108d5d36d2d4cee3fa4d4104258 requires_python: '>=3.8' - kind: pypi name: typing-extensions @@ -1822,73 +1961,73 @@ packages: requires_python: '>=3.8' - kind: conda name: tzdata - version: 2024a - build: h0c530f3_0 + version: 2024b + build: hc8b5060_0 subdir: noarch noarch: generic - url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - sha256: 7b2b69c54ec62a243eb6fba2391b5e443421608c3ae5dbff938ad33ca8db5122 - md5: 161081fc7cec0bfda0d86d7cb595f8d8 + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + md5: 8ac3367aafb1cc0a068483c580af8015 license: LicenseRef-Public-Domain purls: [] - size: 119815 - timestamp: 1706886945727 + size: 122354 + timestamp: 1728047496079 - kind: conda name: ucrt version: 10.0.22621.0 - build: h57928b3_0 + build: h57928b3_1 + build_number: 1 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2 - sha256: f29cdaf8712008f6b419b8b1a403923b00ab2504bfe0fb2ba8eb60e72d4f14c6 - md5: 72608f6cd3e5898229c3ea16deb1ac43 + url: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda + sha256: db8dead3dd30fb1a032737554ce91e2819b43496a0db09927edf01c32b577450 + md5: 6797b005cd0f439c4c5c9ac565783700 constrains: - vs2015_runtime >=14.29.30037 - license: LicenseRef-Proprietary - license_family: PROPRIETARY + license: LicenseRef-MicrosoftWindowsSDK10 purls: [] - size: 1283972 - timestamp: 1666630199266 + size: 559710 + timestamp: 1728377334097 - kind: conda name: vc version: '14.3' - build: h8a93ad2_20 - build_number: 20 + build: ha32ba9b_22 + build_number: 22 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h8a93ad2_20.conda - sha256: 23ac5feb15a9adf3ab2b8c4dcd63650f8b7ae860c5ceb073e49cf71d203eddef - md5: 8558f367e1d7700554f7cdb823c46faf + url: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda + sha256: 2a47c5bd8bec045959afada7063feacd074ad66b170c1ea92dd139b389fcf8fd + md5: 311c9ba1dfdd2895a8cb08346ff26259 depends: - - vc14_runtime >=14.40.33810 + - vc14_runtime >=14.38.33135 track_features: - vc14 license: BSD-3-Clause license_family: BSD purls: 
[] - size: 17391 - timestamp: 1717709040616 + size: 17447 + timestamp: 1728400826998 - kind: conda name: vc14_runtime version: 14.40.33810 - build: ha82c5b3_20 - build_number: 20 + build: hcc2c482_22 + build_number: 22 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda - sha256: af3cfa347e3d7c1277e9b964b0849a9a9f095bff61836cb3c3a89862fbc32e17 - md5: e39cc4c34c53654ec939558993d9dc5b + url: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda + sha256: 4c669c65007f88a7cdd560192f7e6d5679d191ac71610db724e18b2410964d64 + md5: ce23a4b980ee0556a118ed96550ff3f3 depends: - ucrt >=10.0.20348.0 constrains: - - vs2015_runtime 14.40.33810.* *_20 - license: LicenseRef-ProprietaryMicrosoft + - vs2015_runtime 14.40.33810.* *_22 + license: LicenseRef-MicrosoftVisualCpp2015-2022Runtime license_family: Proprietary purls: [] - size: 751934 - timestamp: 1717709031266 + size: 750719 + timestamp: 1728401055788 - kind: pypi name: virtualenv - version: 20.26.3 - url: https://files.pythonhosted.org/packages/07/4d/410156100224c5e2f0011d435e477b57aed9576fc7fe137abcf14ec16e11/virtualenv-20.26.3-py3-none-any.whl - sha256: 8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 + version: 20.27.0 + url: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl + sha256: 44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655 requires_dist: - distlib<1,>=0.3.7 - filelock<4,>=3.12.2 @@ -1913,23 +2052,23 @@ packages: - pytest>=7.4 ; extra == 'test' - setuptools>=68 ; extra == 'test' - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' - requires_python: '>=3.7' + requires_python: '>=3.8' - kind: conda name: vs2015_runtime version: 14.40.33810 - build: h3bf8584_20 - build_number: 20 + build: h3bf8584_22 + build_number: 22 subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda - sha256: 0c2803f7a788c51f28235a7228dc2ab3f107b4b16ab0845a3e595c8c51e50a7a - md5: c21f1b4a3a30bbc3ef35a50957578e0e + url: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda + sha256: 80aa9932203d65a96f817b8be4fafc176fb2b3fe6cf6899ede678b8f0317fbff + md5: 8c6b061d44cafdfc8e8c6eb5f100caf0 depends: - vc14_runtime >=14.40.33810 license: BSD-3-Clause license_family: BSD purls: [] - size: 17395 - timestamp: 1717709043353 + size: 17453 + timestamp: 1728400827536 - kind: pypi name: wcwidth version: 0.2.13 @@ -1937,6 +2076,23 @@ packages: sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 requires_dist: - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' +- kind: conda + name: wheel + version: 0.44.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda + sha256: d828764736babb4322b8102094de38074dedfc71f5ff405c9dfee89191c14ebc + md5: d44e3b085abcaef02983c6305b84b584 + depends: + - python >=3.8 + license: MIT + license_family: MIT + purls: + - pkg:pypi/wheel?source=hash-mapping + size: 58585 + timestamp: 1722797131787 - kind: conda name: xz version: 5.2.6 diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi index 2cc7fe8b4ac..8ec9a71bcd3 100644 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -2,7 +2,8 @@ import sys from collections.abc import Callable from io 
import IOBase -from os import PathLike + +from _typeshed import StrPath if sys.version_info >= (3, 11): from typing import Self @@ -12,6 +13,7 @@ if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias + from typing import Any, Literal, SupportsIndex, overload from pyarrow._stubs_typing import Compression, SupportPyBuffer @@ -61,8 +63,10 @@ class NativeFile(_Weakrefable): def read_buffer(self, nbytes: int | None = None) -> Buffer: ... def truncate(self) -> None: ... def writelines(self, lines: list[bytes]): ... - def download(self, stream_or_path: str | PathLike, buffer_size: int | None = None) -> None: ... - def upload(self, stream: str | PathLike, buffer_size: int | None) -> None: ... + def download( + self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None + ) -> None: ... + def upload(self, stream: IOBase, buffer_size: int | None) -> None: ... # ---------------------------------------------------------------------- # Python file-like objects @@ -158,14 +162,14 @@ class BufferReader(NativeFile): class CompressedInputStream(NativeFile): def __init__( self, - stream: str | PathLike | NativeFile | IOBase, + stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], ) -> None: ... class CompressedOutputStream(NativeFile): def __init__( self, - stream: str | PathLike | NativeFile | IOBase, + stream: StrPath | NativeFile | IOBase, compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], ) -> None: ... @@ -222,7 +226,7 @@ class CacheOptions(_Weakrefable): class Codec(_Weakrefable): def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... @classmethod - def detect(cls, path: str | PathLike) -> Self: ... + def detect(cls, path: StrPath) -> Self: ... @staticmethod def is_available(compression: Compression) -> bool: ... @staticmethod @@ -337,12 +341,12 @@ def decompress( memory_pool: MemoryPool | None = None, ) -> bytes: ... def input_stream( - source: str | PathLike | Buffer | IOBase, + source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, ) -> BufferReader: ... def output_stream( - source: str | PathLike | Buffer | IOBase, + source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, ) -> NativeFile: ... diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 9c3bd94e364..c2f71110134 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -1,7 +1,8 @@ from dataclasses import dataclass, field -from pathlib import Path from typing import IO, Callable, Literal +from _typeshed import StrPath + from . import lib @dataclass(kw_only=True) @@ -66,7 +67,7 @@ class CSVWriter(lib._CRecordBatchWriter): def __init__( self, # TODO: OutputStream - sink: str | Path | IO, + sink: StrPath | IO, schema: lib.Schema, write_options: WriteOptions | None = None, *, @@ -78,14 +79,14 @@ class CSVStreamingReader(lib.RecordBatchReader): ... ISO8601: lib._Weakrefable def open_csv( - input_file: str | Path | IO, + input_file: StrPath | IO, read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> CSVStreamingReader: ... 
def read_csv( - input_file: str | Path | IO, + input_file: StrPath | IO, read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, @@ -93,7 +94,7 @@ def read_csv( ) -> lib.Table: ... def write_csv( data: lib.RecordBatch | lib.Table, - output_file: str | Path | lib.NativeFile | IO, + output_file: StrPath | lib.NativeFile | IO, write_options: WriteOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> None: ... diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 3d1681ec941..6103a988aa7 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -1,7 +1,5 @@ import sys -from pathlib import Path - if sys.version_info >= (3, 11): from typing import Self else: @@ -18,6 +16,8 @@ from typing import ( overload, ) +from _typeshed import StrPath + from . import _csv, _json, _parquet, lib from ._fs import FileSelector, FileSystem, SupportedFileSystem from ._stubs_typing import Indices, JoinType, Order @@ -163,11 +163,11 @@ class FileWriteOptions(lib._Weakrefable): class FileFormat(lib._Weakrefable): def inspect( - self, file: str | Path | IO, filesystem: SupportedFileSystem | None = None + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None ) -> lib.Schema: ... def make_fragment( self, - file: str | Path | IO, + file: StrPath | IO, filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, *, @@ -507,7 +507,7 @@ class WrittenFile(lib._Weakrefable): def _filesystemdataset_write( data: Scanner, - base_dir: str | Path, + base_dir: StrPath, basename_template: str, filesystem: SupportedFileSystem, partitioning: Partitioning, diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index f5b2c93c7d7..2814fa8ed6f 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -1,7 +1,8 @@ from dataclasses import dataclass -from pathlib import Path from typing import IO, Any, Iterable, TypedDict +from _typeshed import StrPath + from ._compute import Expression from ._dataset import ( DatasetFactory, @@ -35,7 +36,7 @@ class ParquetFileFormat(FileFormat): def default_extname(self) -> str: ... def make_fragment( self, - file: IO | Path | str, + file: StrPath | IO, filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, row_groups: Iterable[int] | None = None, diff --git a/pyarrow-stubs/_feather.pyi b/pyarrow-stubs/_feather.pyi index 4abc96b55ae..8bb914ba45d 100644 --- a/pyarrow-stubs/_feather.pyi +++ b/pyarrow-stubs/_feather.pyi @@ -1,13 +1,14 @@ -from pathlib import Path from typing import IO +from _typeshed import StrPath + from .lib import Buffer, NativeFile, Table, _Weakrefable class FeatherError(Exception): ... def write_feather( table: Table, - dest: str | IO | Path | NativeFile, + dest: StrPath | IO | NativeFile, compression: str | None = None, compression_level: int | None = None, chunksize: int | None = None, @@ -17,7 +18,7 @@ def write_feather( class FeatherReader(_Weakrefable): def __init__( self, - source: str | IO | Path | NativeFile | Buffer, + source: StrPath | IO | NativeFile | Buffer, use_memory_map: bool, use_threads: bool, ) -> None: ... 
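For context: the stub changes in this patch replace str | Path style parameters with typeshed's StrPath alias, which is defined as str | os.PathLike[str], so both plain strings and pathlib.Path objects satisfy the annotations. A minimal sketch of the call shapes this allows for the feather stubs above (the file name is hypothetical and used only for illustration; pyarrow's runtime behaviour is unchanged, only the annotations are):

    from pathlib import Path

    import pyarrow as pa
    from pyarrow import feather

    table = pa.table({"a": [1, 2, 3]})

    # Both argument shapes satisfy dest: StrPath | IO | NativeFile in the stub.
    feather.write_feather(table, "example.feather")        # plain str
    feather.write_feather(table, Path("example.feather"))  # os.PathLike[str]

    assert feather.read_table(Path("example.feather")).equals(table)
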
diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 581b5096e9d..23ed3c27387 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -12,6 +12,7 @@ if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias + from typing import Union, overload from fsspec import AbstractFileSystem diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index 97af569908a..af8dc559501 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -1,4 +1,4 @@ -from pathlib import Path +from _typeshed import StrPath from ._fs import FileSystem @@ -12,7 +12,7 @@ class HadoopFileSystem(FileSystem): replication: int = 3, buffer_size: int = 0, default_block_size: int | None = None, - kerb_ticket: str | Path | None = None, + kerb_ticket: StrPath | None = None, extra_conf: dict | None = None, ): ... @staticmethod diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index 418f0ed8595..30329416731 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -1,6 +1,7 @@ -from pathlib import Path from typing import IO, Literal +from _typeshed import StrPath + from .lib import MemoryPool, Schema, Table, _Weakrefable class ReadOptions(_Weakrefable): @@ -22,7 +23,7 @@ class ParseOptions(_Weakrefable): def equals(self, other: ParseOptions) -> bool: ... def read_json( - input_file: str | Path | IO, + input_file: StrPath | IO, read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 7b7796dca12..5d187549c56 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -1,6 +1,7 @@ -from pathlib import Path from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict +from _typeshed import StrPath + from ._stubs_typing import Order from .lib import ( Buffer, @@ -273,7 +274,7 @@ class FileMetaData(_Weakrefable): def row_group(self, i: int) -> RowGroupMetaData: ... def set_file_path(self, path: str) -> None: ... def append_row_groups(self, other: FileMetaData) -> None: ... - def write_metadata_file(self, where: str | Path | Buffer | NativeFile | IO) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... class ParquetSchema(_Weakrefable): def __init__(self, container: FileMetaData) -> None: ... @@ -314,7 +315,7 @@ class ParquetReader(_Weakrefable): def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... def open( self, - source: str | Path | NativeFile | IO, + source: StrPath | NativeFile | IO, *, use_memory_map: bool = False, read_dictionary: Iterable[int] | Iterable[str] | None = None, diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 3473fe4dfce..4317b506ea6 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -1,6 +1,6 @@ -from pathlib import Path -from typing import Callable, Iterable, Literal, TypeAlias, overload +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload +from _typeshed import StrPath from pyarrow._dataset import ( CsvFileFormat, CsvFragmentScanOptions, @@ -151,7 +151,7 @@ def partitioning( dictionaries: dict[str, Array] | None = None, ) -> Partitioning: ... 
def parquet_dataset( - metadata_path: str | Path, + metadata_path: StrPath, schema: Schema | None = None, filesystem: SupportedFileSystem | None = None, format: ParquetFileFormat | None = None, @@ -160,7 +160,7 @@ def parquet_dataset( ) -> FileSystemDataset: ... @overload def dataset( - source: str | list[str] | Path | list[Path], + source: StrPath | Sequence[StrPath], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, filesystem: SupportedFileSystem | str | None = None, diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi index 361d726ae0b..9451ee15763 100644 --- a/pyarrow-stubs/feather.pyi +++ b/pyarrow-stubs/feather.pyi @@ -2,6 +2,7 @@ from typing import IO, Literal import pandas as pd +from _typeshed import StrPath from pyarrow._feather import FeatherError from pyarrow.lib import Table @@ -28,21 +29,21 @@ class FeatherDataset: def check_chunked_overflow(name: str, col) -> None: ... def write_feather( df: pd.DataFrame | Table, - dest: str, + dest: StrPath | IO, compression: Literal["zstd", "lz4", "uncompressed"] | None = None, compression_level: int | None = None, chunksize: int | None = None, version: Literal[1, 2] = 2, ) -> None: ... def read_feather( - source: str | IO, + source: StrPath | IO, columns: list[str] | None = None, use_threads: bool = True, memory_map: bool = False, **kwargs, ) -> pd.DataFrame: ... def read_table( - source: str | IO, + source: StrPath | IO, columns: list[str] | None = None, memory_map: bool = False, use_threads: bool = True, diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index 697100ecaf3..e128b5646f5 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -6,13 +6,15 @@ else: from typing_extensions import Self from typing import IO, Literal +from _typeshed import StrPath + from . import _orc from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: reader: _orc.ORCReader - def __init__(self, source: str | NativeFile | IO) -> None: ... + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... @property def metadata(self) -> KeyValueMetadata: ... @property @@ -55,7 +57,7 @@ class ORCWriter: is_open: bool def __init__( self, - where: str | NativeFile | IO, + where: StrPath | NativeFile | IO, *, file_version: str = "0.12", batch_size: int = 1024, @@ -75,7 +77,7 @@ class ORCWriter: def close(self) -> None: ... def read_table( - source: str | NativeFile | IO, + source: StrPath | NativeFile | IO, columns: list[str] | None = None, filesystem: SupportedFileSystem | None = None, ) -> Table: ... 
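The dataset() overload above now takes source: StrPath | Sequence[StrPath] instead of str | list[str] | Path | list[Path], so any sequence mixing strings and path objects type-checks. A small usage sketch, assuming the named Parquet files exist; only the argument shapes matter here:

    from pathlib import Path

    import pyarrow.dataset as ds

    # Hypothetical file names, shown only to illustrate the accepted shapes.
    one = ds.dataset("part-0.parquet", format="parquet")
    many = ds.dataset(["part-0.parquet", Path("part-1.parquet")], format="parquet")
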
diff --git a/pyproject.toml b/pyproject.toml index 1b53c166102..bbfc8203d5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,10 +32,17 @@ requires = ["hatchling"] [tool.hatch.build.targets.wheel] packages = ["pyarrow-stubs"] +[tool.isort] +profile = "black" + [tool.pixi.project] channels = ["conda-forge"] platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] +[tool.pixi.dependencies] +python = "3.11" +pip = "*" + [tool.pixi.pypi-dependencies] pyarrow-stubs = { path = ".", editable = true } ipython = "*" @@ -46,6 +53,12 @@ ruff = ">=0.5" types-cffi = "*" pandas-stubs = "*" hatchling = "*" +fsspec = "*" +pyright = { version = ">=1.1.385,<2", extras = ["nodejs"] } + +[tool.pixi.tasks] +pyright = { cmd = "pyright" } +pre-commit = { cmd = "pre-commit" } [tool.ruff] fix = true From 1195f94dc4e69cfd7b66a6fc4c60190c5420e3b3 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Tue, 22 Oct 2024 21:54:15 -0400 Subject: [PATCH 109/231] fix type hint for sort_by (#130) sort_by takes str or list[tuple(name, order)] as its argument where str is a field name not a sort order --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- pyarrow-stubs/_dataset.pyi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 9c1fb169e3c..3c39cc204e1 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -434,7 +434,7 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def schema(self) -> Schema: ... @property def nbytes(self) -> int: ... - def sort_by(self, sorting: Order | list[tuple[str, Order]], **kwargs) -> Self: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... def take(self, indices: Indices) -> Self: ... def filter( self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 6103a988aa7..f7e801d87d3 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -101,7 +101,7 @@ class Dataset(lib._Weakrefable): @property def schema(self) -> lib.Schema: ... def filter(self, expression: Expression) -> Self: ... - def sort_by(self, sorting: Order | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... def join( self, right_dataset: Dataset, From 0e7eed7b5ea1ce5c25b823232ce953b9d01f004e Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Tue, 22 Oct 2024 21:57:04 -0400 Subject: [PATCH 110/231] metadata on a schema can be passed as str (#128) For details see https://github.com/apache/arrow/blob/apache-arrow-17.0.0/python/pyarrow/types.pxi\#L2053-L2056 --- pyarrow-stubs/__lib_pxi/types.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 0d3e6101795..6462d45dc0c 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -586,7 +586,7 @@ def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... def schema( fields: Iterable[Field] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], - metadata: dict[bytes, bytes] | None = None, + metadata: dict[bytes | str, bytes | str] | None = None, ) -> Schema: ... def from_numpy_dtype(dtype: np.dtype) -> DataType: ... 
def is_boolean_value(obj: Any) -> bool: ... From e4b3749ef09ebf7f79edca918b44ae8a297ca227 Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Wed, 23 Oct 2024 04:15:27 +0200 Subject: [PATCH 111/231] Correct typevars for DictionaryType, MapType, RunEncodedType (#126) Correct type hints for Dictionary, RunEndEncoded and Map Signed-off-by: Jonas Dedden Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/__lib_pxi/array.pyi | 35 ++++++++++++------------ pyarrow-stubs/__lib_pxi/scalar.pyi | 16 +++++------ pyarrow-stubs/__lib_pxi/types.pyi | 43 +++++++++++++++++++----------- 3 files changed, 51 insertions(+), 43 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 8a3f079f821..741eae99df1 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -48,6 +48,7 @@ from .types import ( MapType, _AsPyType, _BasicDataType, + _BasicValueT, _DataType_CoT, _DataTypeT, _IndexT, @@ -793,15 +794,15 @@ def nulls( @overload def nulls( size: int, - types: types.DictionaryType[_IndexT, _ValueT], + types: types.DictionaryType[_IndexT, _BasicValueT], memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _ValueT]: ... +) -> DictionaryArray[_IndexT, _BasicValueT]: ... @overload def nulls( size: int, - types: types.RunEndEncodedType[_RunEndType, _ValueT], + types: types.RunEndEncodedType[_RunEndType, _BasicValueT], memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _ValueT]: ... +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... @overload def nulls( size: int, @@ -996,16 +997,16 @@ def repeat( ) -> MapArray[_MapKeyT, _MapItemT]: ... @overload def repeat( - value: scalar.DictionaryScalar[_IndexT, _ValueT], + value: scalar.DictionaryScalar[_IndexT, _BasicValueT], size: int, memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _ValueT]: ... +) -> DictionaryArray[_IndexT, _BasicValueT]: ... @overload def repeat( - value: scalar.RunEndEncodedScalar[_RunEndType, _ValueT], + value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], size: int, memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _ValueT]: ... +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... @overload def repeat( value: scalar.UnionScalar, @@ -1448,16 +1449,16 @@ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... -class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _ValueT]]): +class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): @staticmethod def from_buffers( # type: ignore[override] - type: _ValueT, + type: _BasicValueT, length: int, buffers: list[Buffer], dictionary: Array | np.ndarray | pd.Series, null_count: int = -1, offset: int = 0, - ) -> DictionaryArray[Any, _ValueT]: ... + ) -> DictionaryArray[Any, _BasicValueT]: ... @staticmethod def from_arrays( indices: Indices, @@ -1482,28 +1483,28 @@ class StructArray(Array[scalar.StructScalar]): ) -> StructArray: ... def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: ... -class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _ValueT]]): +class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): @overload @staticmethod def from_arrays( run_ends: Int16Array, values: Array, type: _ValueT | None = None, - ) -> RunEndEncodedArray[types.Int16Type, _ValueT]: ... + ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... 
@overload @staticmethod def from_arrays( run_ends: Int32Array, values: Array, type: _ValueT | None = None, - ) -> RunEndEncodedArray[types.Int32Type, _ValueT]: ... + ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... @overload @staticmethod def from_arrays( run_ends: Int64Array, values: Array, type: _ValueT | None = None, - ) -> RunEndEncodedArray[types.Int64Type, _ValueT]: ... + ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... @staticmethod def from_buffers( # type: ignore[override] type: _ValueT, @@ -1512,11 +1513,11 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _ValueT]] null_count: int = -1, offset=0, children: tuple[Array, Array] | None = None, - ) -> RunEndEncodedArray[Any, _ValueT]: ... + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... @property def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: ... @property - def values(self) -> Array[scalar.Scalar[_ValueT]]: ... + def values(self) -> Array[scalar.Scalar[_BasicValueT]]: ... def find_physical_offset(self) -> int: ... def find_physical_length(self) -> int: ... diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 1a80e4c8747..f55b1832b27 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -69,9 +69,7 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): def as_py( self: Scalar[ types.ListType[ - types.DictionaryType[ - types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV], Any - ] + types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] ] ], ) -> list[dict[_AsPyTypeK, _AsPyTypeV]]: ... @@ -83,9 +81,7 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): ) -> list[dict[Any, _AsPyTypeV]]: ... @overload def as_py( - self: Scalar[ - types.ListType[types.DictionaryType[types._BasicDataType[_AsPyTypeK], Any, Any]], - ], + self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], ) -> list[dict[_AsPyTypeK, Any]]: ... @overload def as_py( @@ -234,17 +230,17 @@ class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... -class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._ValueT]]): +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): @property def index(self) -> Scalar[types._IndexT]: ... @property - def value(self) -> Scalar[types._ValueT]: ... + def value(self) -> Scalar[types._BasicValueT]: ... @property def dictionary(self) -> Array: ... -class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._ValueT]]): +class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): @property - def value(self) -> tuple[int, int] | None: ... + def value(self) -> tuple[int, types._BasicValueT] | None: ... class UnionScalar(Scalar[types.UnionType]): @property diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 6462d45dc0c..7501c1ce88e 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -139,19 +139,30 @@ class FixedSizeListType(ListType[_DataType_CoT], Generic[_DataType_CoT, _Size]): class DictionaryMemo(_Weakrefable): ... 
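The concrete types these parameters track can be constructed as below (a sketch; the attribute names follow the stubs in this patch):

    import pyarrow as pa

    # the index type is one of the fixed-width integer types;
    # the value type carries its own data type
    dict_type = pa.dictionary(pa.int32(), pa.string())
    assert dict_type.index_type == pa.int32()
    assert dict_type.value_type == pa.string()

    # the run-end type is int16/int32/int64
    ree_type = pa.run_end_encoded(pa.int32(), pa.float64())
    assert ree_type.run_end_type == pa.int32()
    assert ree_type.value_type == pa.float64()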
-_IndexT = TypeVar("_IndexT", bound=_BasicDataType) -_ValueT = TypeVar("_ValueT", bound=_BasicDataType) +_IndexT = TypeVar( + "_IndexT", + Uint8Type, + Int8Type, + Uint16Type, + Int16Type, + Uint32Type, + Int32Type, + Uint64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) _Ordered = TypeVar("_Ordered", bound=Literal[True, False], default=Literal[False]) -class DictionaryType(DataType, Generic[_IndexT, _ValueT, _Ordered]): +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): @property def ordered(self) -> _Ordered: ... @property def index_type(self) -> _IndexT: ... @property - def value_type(self) -> _ValueT: ... + def value_type(self) -> _BasicValueT: ... -_K = TypeVar("_K", bound=_BasicDataType) +_K = TypeVar("_K", bound=DataType) class MapType(DataType, Generic[_K, _ValueT, _Ordered]): @property @@ -195,11 +206,11 @@ class DenseUnionType(UnionType): _RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) -class RunEndEncodedType(DataType, Generic[_RunEndType, _ValueT]): +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): @property def run_end_type(self) -> _RunEndType: ... @property - def value_type(self) -> _ValueT: ... + def value_type(self) -> _BasicValueT: ... _StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) @@ -401,19 +412,19 @@ def large_list_view(value_type: Field[_DataTypeT]) -> LargeListViewType[_DataTyp @overload def large_list_view(value_type: _DataTypeT) -> LargeListViewType[_DataTypeT]: ... @overload -def map_(key_type: _K, item_type: _IndexT) -> MapType[_K, _IndexT, Literal[False]]: ... +def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, Literal[False]]: ... @overload def map_( - key_type: _K, item_type: _IndexT, key_sorted: _Ordered -) -> MapType[_K, _IndexT, _Ordered]: ... + key_type: _K, item_type: _ValueT, key_sorted: _Ordered +) -> MapType[_K, _ValueT, _Ordered]: ... @overload def dictionary( - index_type: _IndexT, value_type: _ValueT -) -> DictionaryType[_IndexT, _ValueT, Literal[False]]: ... + index_type: _IndexT, value_type: _BasicValueT +) -> DictionaryType[_IndexT, _BasicValueT, Literal[False]]: ... @overload def dictionary( - index_type: _IndexT, value_type: _ValueT, ordered: _Ordered -) -> DictionaryType[_IndexT, _ValueT, _Ordered]: ... + index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... def struct( fields: Iterable[Field | tuple[str, Field]] | Mapping[str, Field], ) -> StructType: ... @@ -432,8 +443,8 @@ def union( child_fields: list[Field], mode: Literal["dense"], type_codes: list[int] | None = None ) -> DenseUnionType: ... def run_end_encoded( - run_end_type: _RunEndType, value_type: _ValueT -) -> RunEndEncodedType[_RunEndType, _ValueT]: ... + run_end_type: _RunEndType, value_type: _BasicValueT +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... def fixed_shape_tensor( value_type: _ValueT, shape: tuple[list[int], ...], From b6fb2fd680275d9507792f5323372df5cce4e8f3 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Wed, 23 Oct 2024 22:31:50 -0400 Subject: [PATCH 112/231] Add some more StrPath io parts that were overlooked. (#131) * Add some more StrPath io parts that were overlooked. Additionally, add the utility typealias `SingleOrList` that can be used in places where we want a concise type declaration but the there is a large union of types. 
* write_dataset(base_dir = ) can also take Path --- pyarrow-stubs/_parquet.pyi | 2 +- pyarrow-stubs/_stubs_typing.pyi | 5 ++++- pyarrow-stubs/dataset.pyi | 2 +- pyarrow-stubs/orc.pyi | 2 +- pyarrow-stubs/parquet/core.pyi | 7 ++++--- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 5d187549c56..03a6574a1e0 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -367,7 +367,7 @@ class ParquetReader(_Weakrefable): class ParquetWriter(_Weakrefable): def __init__( self, - where: str | NativeFile | IO, + where: StrPath | NativeFile | IO, schema: Schema, use_dictionary: bool | list[str] | None = None, compression: _Compression | dict[str, _Compression] | None = None, diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index 29946e88e42..8981dfa3c85 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -1,4 +1,4 @@ -from typing import Any, Collection, Literal, Protocol, TypeAlias +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar import numpy as np @@ -26,6 +26,9 @@ NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] Mask: TypeAlias = list[bool | None] | NDArray[np.bool_] | BooleanArray Indices: TypeAlias = list[int] | NDArray[np.integer] | IntegerArray +_T = TypeVar("_T") +SingleOrList: TypeAlias = list[_T] | _T + class SupportEq(Protocol): def __eq__(self, other) -> bool: ... diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 4317b506ea6..3a0197892c2 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -193,7 +193,7 @@ def dataset( ) -> InMemoryDataset: ... def write_dataset( data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], - base_dir: str, + base_dir: StrPath, *, basename_template: str | None = None, format: FileFormat | _DatasetFormat | None = None, diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index e128b5646f5..a4696a69297 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -83,7 +83,7 @@ def read_table( ) -> Table: ... def write_table( table: Table, - where: str | NativeFile | IO, + where: StrPath | NativeFile | IO, *, file_version: str = "0.12", batch_size: int = 1024, diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index d675fb4f916..b4c82eb26b5 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -13,6 +13,7 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias +from _typeshed import StrPath from pyarrow import _parquet from pyarrow._compute import Expression from pyarrow._fs import FileSystem, SupportedFileSystem @@ -29,7 +30,7 @@ from pyarrow._parquet import ( SortingColumn, Statistics, ) -from pyarrow._stubs_typing import FilterTuple +from pyarrow._stubs_typing import FilterTuple, SingleOrList from pyarrow.dataset import ParquetFileFragment, Partitioning from pyarrow.lib import NativeFile, RecordBatch, Schema, Table from typing_extensions import deprecated @@ -179,7 +180,7 @@ class ParquetWriter: class ParquetDataset: def __init__( self, - path_or_paths: str | list[str], + path_or_paths: SingleOrList[StrPath | NativeFile | IO], filesystem: SupportedFileSystem | None = None, schema: Schema | None = None, *, @@ -217,7 +218,7 @@ class ParquetDataset: def partitioning(self) -> Partitioning: ... 
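A short sketch of what `StrPath` and `SingleOrList` allow on the parquet side (file names are hypothetical):

    from pathlib import Path

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"x": [1, 2, 3]})

    # `where` accepts os.PathLike as well as str
    pq.write_table(table, Path("part-0.parquet"))
    pq.write_table(table, Path("part-1.parquet"))

    # SingleOrList[...]: a single path or a list of paths both type-check
    single = pq.ParquetDataset(Path("part-0.parquet"))
    many = pq.ParquetDataset([Path("part-0.parquet"), Path("part-1.parquet")])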
def read_table( - source: str | Path | NativeFile | IO, + source: SingleOrList[StrPath | NativeFile | IO], *, columns: list | None = None, use_threads: bool = True, From 717e64b6f338aa12e684663ba5e854f3bf98e9d7 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Wed, 23 Oct 2024 22:32:30 -0400 Subject: [PATCH 113/231] Support ChunkedArray in add/append methods in Table (#129) --- pyarrow-stubs/__lib_pxi/table.pyi | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 3c39cc204e1..d6dbc9c0214 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -412,7 +412,7 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): @classmethod def from_pydict( cls, - mapping: Mapping[str, Array | list], + mapping: Mapping[str, ChunkedArray | Array | list | np.ndarray], schema: Schema | None = None, metadata: Mapping | None = None, ) -> Self: ... @@ -444,8 +444,10 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... def remove_column(self, i: int) -> Self: ... def drop_columns(self, columns: str | list[str]) -> Self: ... - def add_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... - def append_column(self, field_: str | Field, column: Array | list) -> Self: ... + def add_column( + self, i: int, field_: str | Field, column: ChunkedArray | Array | list + ) -> Self: ... + def append_column(self, field_: str | Field, column: ChunkedArray | Array | list) -> Self: ... class RecordBatch(_Tabular[Array]): def validate(self, *, full: bool = False) -> None: ... From 090ea66357b246dd2f3e0b60127ba972ecf30334 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Wed, 23 Oct 2024 22:35:05 -0400 Subject: [PATCH 114/231] Add missing partitioning typing case (#132) This should now support the examples in the docstring for partitioning. --- pyarrow-stubs/_dataset.pyi | 6 ++++++ pyarrow-stubs/dataset.pyi | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index f7e801d87d3..21684e6fe8d 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -351,6 +351,12 @@ class DirectoryPartitioning(KeyValuePartitioning): schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", ) -> PartitioningFactory: ... + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... class HivePartitioning(KeyValuePartitioning): def __init__( diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 3a0197892c2..e9da4ec22b1 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -111,6 +111,10 @@ __all__ = [ _DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] +@overload +def partitioning( + schema: Schema, +) -> Partitioning: ... 
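Both changes above can be exercised with in-memory data only (a sketch):

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"a": [1, 2, 3]})

    # a ChunkedArray is accepted alongside Array and plain lists
    chunked = pa.chunked_array([[10, 20], [30]])
    table = table.append_column("b", chunked)
    table = table.add_column(0, "c", pa.array([7, 8, 9]))

    # partitioning() called with only a schema (directory flavor)
    part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]))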
@overload def partitioning( schema: Schema, From e02cdb828a157b6a90e9a51c9aa50894ec9838b3 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 24 Oct 2024 10:44:06 +0800 Subject: [PATCH 115/231] fix: typo 'permissive' instead of 'premissive' (#133) --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index d6dbc9c0214..b1389abb95c 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -624,7 +624,7 @@ def table( def concat_tables( tables: list[Table], memory_pool: MemoryPool | None = None, - promote_options: Literal["none", "default", "premissive"] = "none", + promote_options: Literal["none", "default", "permissive"] = "none", **kwargs, ) -> Table: ... From f7f798ef08767515b8d1d9b2614625d04cb14cc0 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 24 Oct 2024 10:47:07 +0800 Subject: [PATCH 116/231] release 17.10 (#134) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 65bfb0e61dc..fc2909b2f64 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1408,9 +1408,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.9' + version: '17.10' path: . - sha256: 7a6c58e69d86f0d33726eec03e3c168c9bf7cabbef3a20cd64419de2f7489a69 + sha256: c8cbcc028764f8ec801ad818c50d8ba98d4e77c2b766a7c0fa058b2dd5c55904 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index bbfc8203d5f..9820513d0b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.9" +version = "17.10" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 61d316595e74d9952581abe214cb0084b31b42a6 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Thu, 24 Oct 2024 22:16:56 -0400 Subject: [PATCH 117/231] fix incorrect type hints for compute.sort_indices (#135) --- pyarrow-stubs/compute.pyi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index af4c92d3ff6..5cb73bee154 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1,6 +1,6 @@ # mypy: disable-error-code="misc,type-var,var-annotated" # ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence from collections.abc import Callable # Option classes @@ -2547,9 +2547,9 @@ def select_k_unstable( ) -> Expression: ... @overload def sort_indices( - array: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, /, - order: Literal["ascending", "descending"] = "ascending", + sort_keys: Sequence[tuple[str, Literal["ascending", "descending"]]], *, null_placement: Literal["at_start", "at_end"] = "at_end", options: SortOptions | None = None, @@ -2557,9 +2557,9 @@ def sort_indices( ) -> lib.UInt64Array: ... 
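Runtime call shapes corresponding to the two fixes above (a sketch; `promote_options` follows the pyarrow >= 14 spelling):

    import pyarrow as pa
    import pyarrow.compute as pc

    t1 = pa.table({"a": [1, 2]})
    t2 = pa.table({"a": [3, 4], "b": ["x", "y"]})

    # the accepted literal is "permissive"; missing columns are null-filled
    merged = pa.concat_tables([t1, t2], promote_options="permissive")

    # for tables, sort_indices takes sort_keys: a sequence of (name, order) pairs
    order = pc.sort_indices(t2, sort_keys=[("a", "descending")])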
@overload def sort_indices( - array: Expression, + input: Expression, /, - order: Literal["ascending", "descending"] = "ascending", + sort_keys: Sequence[tuple[str, Literal["ascending", "descending"]]], *, null_placement: Literal["at_start", "at_end"] = "at_end", options: SortOptions | None = None, From 359a7bf21685eb65e083db1bbaba8914390b2b8b Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 28 Oct 2024 22:02:37 -0400 Subject: [PATCH 118/231] disallow passing `names` as an argument to table when using dictionaries (#137) --- pyarrow-stubs/__lib_pxi/table.pyi | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index b1389abb95c..4a297092ca8 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -609,9 +609,16 @@ def record_batch( schema: Schema | None = None, metadata: Mapping | None = None, ) -> RecordBatch: ... +@overload def table( - data: dict[str, list | Array] - | list[Array | ChunkedArray] + data: dict[str, list | Array], + schema: Schema | None = None, + metadata: Mapping | None = None, + nthreads: int | None = None, +) -> Table: ... +@overload +def table( + data: list[Array | ChunkedArray] | pd.DataFrame | SupportArrowArray | SupportArrowStream From 5e97fc364e0d53fd489afd2ce732c69a73955363 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:03:02 +0800 Subject: [PATCH 119/231] [pre-commit.ci] pre-commit autoupdate (#138) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.0 → v0.7.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.0...v0.7.1) - [github.com/pre-commit/mirrors-mypy: v1.12.1 → v1.13.0](https://github.com/pre-commit/mirrors-mypy/compare/v1.12.1...v1.13.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8817432dc65..bf57674a404 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,13 +19,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.7.1 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.12.1 + rev: v1.13.0 hooks: - id: mypy From 98a01b9170450162617afba06187c198bfa9d03d Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Mon, 28 Oct 2024 22:03:59 -0400 Subject: [PATCH 120/231] Add missing type for FlightEndpoint (#136) --- pyarrow-stubs/_flight.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 89a8952cab7..5e23745e70c 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -145,7 +145,7 @@ class Location(_Weakrefable): def for_grpc_unix(path: str | bytes) -> Location: ... class FlightEndpoint(_Weakrefable): - def __init__(self, ticket: Ticket | str | bytes, locations: list[str]): ... + def __init__(self, ticket: Ticket | str | bytes, locations: list[str | Location]): ... @property def ticket(self) -> Ticket: ... 
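Sketches of the two call patterns involved here (the Flight example assumes a Flight-enabled pyarrow build; host and port are made up):

    import pyarrow as pa
    from pyarrow import flight

    arr = pa.array([1, 2, 3])

    # column names come from the dict keys; `names=` applies to sequence input only
    from_mapping = pa.table({"a": arr})
    from_sequence = pa.table([arr], names=["a"])

    # endpoint locations may mix URI strings and Location objects
    endpoint = flight.FlightEndpoint(
        b"ticket-bytes",
        ["grpc+tcp://localhost:8815", flight.Location.for_grpc_tcp("localhost", 8816)],
    )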
@property From 42498a02f2637fcf066d9fb62d477c7fa5371fcf Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 29 Oct 2024 10:05:48 +0800 Subject: [PATCH 121/231] release 17.11 (#139) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index fc2909b2f64..6b2c6c0b842 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1408,9 +1408,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.10' + version: '17.11' path: . - sha256: c8cbcc028764f8ec801ad818c50d8ba98d4e77c2b766a7c0fa058b2dd5c55904 + sha256: da130b39ec1fba1326b5d45ae583b14497780ac8d6b5f7d0d3901d44262cb6df requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 9820513d0b3..64cfc76e855 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.10" +version = "17.11" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 4e6deaa74227a59210c9e2b25482e9d139216b4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:31:13 +0800 Subject: [PATCH 122/231] [pre-commit.ci] pre-commit autoupdate (#140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.1 → v0.7.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.1...v0.7.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf57674a404..17d57dfba6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.1 + rev: v0.7.2 hooks: - id: ruff args: [--fix] From 7a4d8364febc9b73431cf13fe096d2adeab1c0f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:53:27 +0800 Subject: [PATCH 123/231] [pre-commit.ci] pre-commit autoupdate (#142) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.2 → v0.7.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.2...v0.7.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 17d57dfba6e..c1ed407f310 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.2 + rev: v0.7.3 hooks: - id: ruff args: [--fix] From 372e5bd397bf806bf8bca6e08908296ed897e417 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 12 Nov 2024 13:54:23 +0800 Subject: [PATCH 124/231] chore: Create FUNDING.yml (#143) Create FUNDING.yml --- .github/FUNDING.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000000..e78fdc9b020 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding 
model platforms + +github: [zen-xu] From b1ca580a693a5e4bb201b255cb27fe0a79b60aaf Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 13 Nov 2024 10:33:44 +0800 Subject: [PATCH 125/231] fix: `read_schema` should return Schema (#145) fix: read_schema should return Schema --- pyarrow-stubs/parquet/core.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index b4c82eb26b5..de2716a391b 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -304,4 +304,4 @@ def read_schema( memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, -) -> FileMetaData: ... +) -> Schema: ... From c4952305d6c78cadb9bf4e04c6e2ac6b0ef8be75 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 13 Nov 2024 10:38:41 +0800 Subject: [PATCH 126/231] release 17.12 (#146) --- pixi.lock | 48 ++++++++++++++++++++++++------------------------ pyproject.toml | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pixi.lock b/pixi.lock index 6b2c6c0b842..cf4a327d474 100644 --- a/pixi.lock +++ b/pixi.lock @@ -299,11 +299,11 @@ packages: requires_dist: - six>=1.12.0 - typing ; python_full_version < '3.5' - - astroid<2,>=1 ; python_full_version < '3' and extra == 'astroid' - - astroid<4,>=2 ; python_full_version >= '3' and extra == 'astroid' + - astroid>=1,<2 ; python_full_version < '3' and extra == 'astroid' + - astroid>=2,<4 ; python_full_version >= '3' and extra == 'astroid' - pytest ; extra == 'test' - - astroid<2,>=1 ; python_full_version < '3' and extra == 'test' - - astroid<4,>=2 ; python_full_version >= '3' and extra == 'test' + - astroid>=1,<2 ; python_full_version < '3' and extra == 'test' + - astroid>=2,<4 ; python_full_version >= '3' and extra == 'test' - kind: conda name: bzip2 version: 1.0.8 @@ -430,7 +430,7 @@ packages: version: 0.4.6 url: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl sha256: 4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 - requires_python: '!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7' + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' - kind: pypi name: decorator version: 5.1.1 @@ -539,10 +539,10 @@ packages: - pytest-recording ; extra == 'test' - pytest-rerunfailures ; extra == 'test' - requests ; extra == 'test' - - aiobotocore<3.0.0,>=2.5.4 ; extra == 'test-downstream' + - aiobotocore>=2.5.4,<3.0.0 ; extra == 'test-downstream' - dask-expr ; extra == 'test-downstream' - dask[dataframe,test] ; extra == 'test-downstream' - - moto[server]<5,>4 ; extra == 'test-downstream' + - moto[server]>4,<5 ; extra == 'test-downstream' - pytest-timeout ; extra == 'test-downstream' - xarray ; extra == 'test-downstream' - adlfs ; extra == 'test-full' @@ -614,7 +614,7 @@ packages: - decorator - jedi>=0.16 - matplotlib-inline - - prompt-toolkit<3.1.0,>=3.0.41 + - prompt-toolkit>=3.0.41,<3.1.0 - pygments>=2.4.0 - stack-data - traitlets>=5.13.0 @@ -664,7 +664,7 @@ packages: url: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl sha256: e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 requires_dist: - - parso<0.9.0,>=0.8.3 + - parso>=0.8.3,<0.9.0 - jinja2==2.11.3 ; extra == 'docs' - markupsafe==1.1.1 ; extra == 'docs' - pygments==2.8.1 ; extra == 
'docs' @@ -1408,9 +1408,9 @@ packages: requires_python: '>=3.8' - kind: pypi name: pyarrow-stubs - version: '17.11' + version: '17.12' path: . - sha256: da130b39ec1fba1326b5d45ae583b14497780ac8d6b5f7d0d3901d44262cb6df + sha256: 0438bdb5aab19874198fda9e04ff0fb7dacbe7f93164168be33a8331083bb05e requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' @@ -1646,7 +1646,7 @@ packages: url: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 requires_dist: - - numpy<2.3,>=1.23.5 + - numpy>=1.23.5,<2.3 - pytest ; extra == 'test' - pytest-cov ; extra == 'test' - pytest-timeout ; extra == 'test' @@ -1662,7 +1662,7 @@ packages: - cython ; extra == 'test' - meson ; extra == 'test' - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - sphinx-design>=0.4.0 ; extra == 'doc' - matplotlib>=3.5 ; extra == 'doc' @@ -1688,7 +1688,7 @@ packages: url: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl sha256: c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 requires_dist: - - numpy<2.3,>=1.23.5 + - numpy>=1.23.5,<2.3 - pytest ; extra == 'test' - pytest-cov ; extra == 'test' - pytest-timeout ; extra == 'test' @@ -1704,7 +1704,7 @@ packages: - cython ; extra == 'test' - meson ; extra == 'test' - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - sphinx-design>=0.4.0 ; extra == 'doc' - matplotlib>=3.5 ; extra == 'doc' @@ -1730,7 +1730,7 @@ packages: url: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl sha256: 2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 requires_dist: - - numpy<2.3,>=1.23.5 + - numpy>=1.23.5,<2.3 - pytest ; extra == 'test' - pytest-cov ; extra == 'test' - pytest-timeout ; extra == 'test' @@ -1746,7 +1746,7 @@ packages: - cython ; extra == 'test' - meson ; extra == 'test' - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - sphinx-design>=0.4.0 ; extra == 'doc' - matplotlib>=3.5 ; extra == 'doc' @@ -1772,7 +1772,7 @@ packages: url: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl sha256: 716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 requires_dist: - - numpy<2.3,>=1.23.5 + - numpy>=1.23.5,<2.3 - pytest ; extra == 'test' - pytest-cov ; extra == 'test' - pytest-timeout ; extra == 'test' @@ -1788,7 +1788,7 @@ packages: - cython ; extra == 'test' - meson ; extra == 'test' - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx<=7.3.7,>=5.0.0 ; extra == 'doc' + - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - sphinx-design>=0.4.0 ; extra == 'doc' - matplotlib>=3.5 ; extra == 'doc' @@ -1926,7 +1926,7 @@ packages: - pre-commit ; extra == 'test' - pytest-mock ; extra == 'test' - 
pytest-mypy-testing ; extra == 'test' - - pytest<8.2,>=7.0 ; extra == 'test' + - pytest>=7.0,<8.2 ; extra == 'test' requires_python: '>=3.8' - kind: pypi name: trove-classifiers @@ -2029,13 +2029,13 @@ packages: url: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl sha256: 44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655 requires_dist: - - distlib<1,>=0.3.7 - - filelock<4,>=3.12.2 + - distlib>=0.3.7,<1 + - filelock>=3.12.2,<4 - importlib-metadata>=6.6 ; python_full_version < '3.8' - - platformdirs<5,>=3.9.1 + - platformdirs>=3.9.1,<5 - furo>=2023.7.26 ; extra == 'docs' - proselint>=0.13 ; extra == 'docs' - - sphinx!=7.3,>=7.1.2 ; extra == 'docs' + - sphinx>=7.1.2,!=7.3 ; extra == 'docs' - sphinx-argparse>=0.4 ; extra == 'docs' - sphinxcontrib-towncrier>=0.2.1a0 ; extra == 'docs' - towncrier>=23.6 ; extra == 'docs' diff --git a/pyproject.toml b/pyproject.toml index 64cfc76e855..feaeae596be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.11" +version = "17.12" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 16452984d44d84baab168e30ccc0666cdbccc7a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:25:45 +0800 Subject: [PATCH 127/231] [pre-commit.ci] pre-commit autoupdate (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.3 → v0.7.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.3...v0.7.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c1ed407f310..5cd92f5c76b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.3 + rev: v0.7.4 hooks: - id: ruff args: [--fix] From 0087319b37eeef3aad30400301fca58f11cd8371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20D=2E=20Batista?= Date: Wed, 4 Dec 2024 02:45:00 -0300 Subject: [PATCH 128/231] fix: `to_table` argument `columns` can be a dict of expressions (#149) --- pyarrow-stubs/_dataset.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 21684e6fe8d..b6828593783 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -54,7 +54,7 @@ class Dataset(lib._Weakrefable): ) -> Iterator[lib.RecordBatch]: ... 
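The hunk continuing below widens `columns` on `Dataset.to_table`; a minimal sketch of the projection it describes (in-memory data only):

    import pyarrow as pa
    import pyarrow.compute as pc
    import pyarrow.dataset as ds

    dataset = ds.dataset(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    # a plain list of column names...
    subset = dataset.to_table(columns=["a"])

    # ...or a mapping of output name -> Expression
    projected = dataset.to_table(columns={"a_plus_b": pc.field("a") + pc.field("b")})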
def to_table( self, - columns: list[str] | None = None, + columns: list[str] | dict[str, Expression] | None = None, filter: Expression | None = None, batch_size: int = ..., batch_readahead: int = 16, From e89dd28f0b840ae0edd76967e346b534dbbf1e85 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 02:42:59 +0000 Subject: [PATCH 129/231] [pre-commit.ci] pre-commit autoupdate (#148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.4 → v0.8.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.4...v0.8.1) * ruff: ignore PYI063 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: ZhengYu, Xu --- .pre-commit-config.yaml | 2 +- pixi.lock | 970 ++++++++++++---------------------------- pyproject.toml | 3 +- 3 files changed, 292 insertions(+), 683 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5cd92f5c76b..acd19e2ae14 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.4 + rev: v0.8.1 hooks: - id: ruff args: [--fix] diff --git a/pixi.lock b/pixi.lock index cf4a327d474..8a02a27c0b0 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1,4 +1,4 @@ -version: 5 +version: 6 environments: default: channels: @@ -260,25 +260,15 @@ environments: - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - pypi: . packages: -- kind: conda - name: _libgcc_mutex - version: '0.1' - build: conda_forge - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 md5: d7c89558ba9fa0495403155b64376d81 license: None purls: [] size: 2562 timestamp: 1578324546067 -- kind: conda - name: _openmp_mutex - version: '4.5' - build: 2_gnu +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 build_number: 16 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 md5: 73aaf86a425cc6e73fcf236a5a46396d depends: @@ -291,10 +281,9 @@ packages: purls: [] size: 23621 timestamp: 1650670423406 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl name: asttokens version: 2.4.1 - url: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 requires_dist: - six>=1.12.0 @@ -304,31 +293,7 @@ packages: - pytest ; extra == 'test' - astroid>=1,<2 ; python_full_version < '3' and extra == 'test' - astroid>=2,<4 ; python_full_version >= '3' and extra == 'test' -- kind: conda - name: bzip2 - version: 1.0.8 - build: h2466b09_7 - build_number: 7 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - sha256: 
35a5dad92e88fdd7fc405e864ec239486f4f31eec229e31686e61a140a8e573b - md5: 276e7ffe9ffe39688abc665ef0f45596 - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: bzip2-1.0.6 - license_family: BSD - purls: [] - size: 54927 - timestamp: 1720974860185 -- kind: conda - name: bzip2 - version: 1.0.8 - build: h4bc722e_7 - build_number: 7 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda sha256: 5ced96500d945fb286c9c838e54fa759aa04a7129c59800f0846b4335cee770d md5: 62ee74e96c5ebb0af99386de58cf9553 depends: @@ -339,13 +304,17 @@ packages: purls: [] size: 252783 timestamp: 1720974456583 -- kind: conda - name: bzip2 - version: 1.0.8 - build: h99b78c6_7 - build_number: 7 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda +- conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda + sha256: cad153608b81fb24fc8c509357daa9ae4e49dfc535b2cb49b91e23dbd68fc3c5 + md5: 7ed4301d437b59045be7e051a0308211 + depends: + - __osx >=10.13 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 134188 + timestamp: 1720974491916 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda sha256: adfa71f158cbd872a36394c56c3568e6034aa55c623634b37a4836bd036e6b91 md5: fc6948412dbbbe9a4c9ddbbcfe0a79ab depends: @@ -355,97 +324,68 @@ packages: purls: [] size: 122909 timestamp: 1720974522888 -- kind: conda - name: bzip2 - version: 1.0.8 - build: hfdf4475_7 - build_number: 7 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - sha256: cad153608b81fb24fc8c509357daa9ae4e49dfc535b2cb49b91e23dbd68fc3c5 - md5: 7ed4301d437b59045be7e051a0308211 +- conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda + sha256: 35a5dad92e88fdd7fc405e864ec239486f4f31eec229e31686e61a140a8e573b + md5: 276e7ffe9ffe39688abc665ef0f45596 depends: - - __osx >=10.13 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 license: bzip2-1.0.6 license_family: BSD purls: [] - size: 134188 - timestamp: 1720974491916 -- kind: conda - name: ca-certificates - version: 2024.8.30 - build: h56e8100_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda - sha256: 0fcac3a7ffcc556649e034a1802aedf795e64227eaa7194d207b01eaf26454c4 - md5: 4c4fd67c18619be5aa65dc5b6c72e490 + size: 54927 + timestamp: 1720974860185 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda + sha256: afee721baa6d988e27fef1832f68d6f32ac8cc99cdf6015732224c2841a09cea + md5: c27d1c142233b5bc9ca570c6e2e0c244 license: ISC purls: [] - size: 158773 - timestamp: 1725019107649 -- kind: conda - name: ca-certificates - version: 2024.8.30 - build: h8857fd0_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda + size: 159003 + timestamp: 1725018903918 +- conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda sha256: 593f302d0f44c2c771e1614ee6d56fffdc7d616e6f187669c8b0e34ffce3e1ae md5: b7e5424e7f06547a903d28e4651dbb21 license: ISC purls: [] size: 158665 timestamp: 1725019059295 -- kind: conda - name: ca-certificates - version: 2024.8.30 - build: hbcca054_0 - subdir: linux-64 - url: 
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - sha256: afee721baa6d988e27fef1832f68d6f32ac8cc99cdf6015732224c2841a09cea - md5: c27d1c142233b5bc9ca570c6e2e0c244 - license: ISC - purls: [] - size: 159003 - timestamp: 1725018903918 -- kind: conda - name: ca-certificates - version: 2024.8.30 - build: hf0a4a13_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda sha256: 2db1733f4b644575dbbdd7994a8f338e6ef937f5ebdb74acd557e9dda0211709 md5: 40dec13fd8348dbe303e57be74bd3d35 license: ISC purls: [] size: 158482 timestamp: 1725019034582 -- kind: pypi +- conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda + sha256: 0fcac3a7ffcc556649e034a1802aedf795e64227eaa7194d207b01eaf26454c4 + md5: 4c4fd67c18619be5aa65dc5b6c72e490 + license: ISC + purls: [] + size: 158773 + timestamp: 1725019107649 +- pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl name: cfgv version: 3.4.0 - url: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl sha256: b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl name: colorama version: 0.4.6 - url: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl sha256: 4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl name: decorator version: 5.1.1 - url: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl sha256: b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 requires_python: '>=3.5' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl name: distlib version: 0.3.9 - url: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl sha256: 47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl name: executing version: 2.1.0 - url: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl sha256: 8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf requires_dist: - asttokens>=2.1.0 ; extra == 'tests' @@ -456,10 +396,9 @@ packages: - littleutils ; extra == 'tests' - rich ; python_full_version >= '3.11' and extra == 'tests' requires_python: '>=3.8' -- kind: pypi +- pypi: 
https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl name: filelock version: 3.16.1 - url: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl sha256: 2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 requires_dist: - furo>=2024.8.6 ; extra == 'docs' @@ -476,10 +415,9 @@ packages: - virtualenv>=20.26.4 ; extra == 'testing' - typing-extensions>=4.12.2 ; python_full_version < '3.11' and extra == 'typing' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl name: fsspec version: 2024.10.0 - url: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl sha256: 03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 requires_dist: - adlfs ; extra == 'abfs' @@ -585,10 +523,9 @@ packages: - zstandard ; extra == 'test-full' - tqdm ; extra == 'tqdm' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl name: hatchling version: 1.25.0 - url: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl sha256: b47948e45d4d973034584dd4cb39c14b6a70227cf287ab7ec0ad7983408a882c requires_dist: - packaging>=23.2 @@ -597,18 +534,16 @@ packages: - tomli>=1.2.2 ; python_full_version < '3.11' - trove-classifiers requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl name: identify version: 2.6.1 - url: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl sha256: 53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0 requires_dist: - ukkonen ; extra == 'license' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl name: ipython version: 8.28.0 - url: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl sha256: 530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35 requires_dist: - decorator @@ -658,10 +593,9 @@ packages: - pandas ; extra == 'test-extra' - trio ; extra == 'test-extra' requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl name: jedi version: 0.19.1 - url: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl sha256: e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 requires_dist: - parso>=0.8.3,<0.9.0 @@ -699,13 +633,7 @@ packages: - docopt ; extra == 'testing' - pytest<7.0.0 ; extra == 'testing' requires_python: '>=3.6' -- kind: conda - name: ld_impl_linux-64 - version: '2.43' - build: h712a8e2_1 - build_number: 1 - subdir: linux-64 - url: 
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda sha256: 0c21387f9a411e3d1f7f2969026bacfece133c8f1e72faea9cde29c0c19e1f3a md5: 83e1364586ceb8d0739fbc85b5c95837 depends: @@ -717,13 +645,17 @@ packages: purls: [] size: 669616 timestamp: 1727304687962 -- kind: conda - name: libffi - version: 3.4.2 - build: h0d85af4_5 - build_number: 5 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 + sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e + md5: d645c6d2ac96843a2bfaccd2d62b3ac3 + depends: + - libgcc-ng >=9.4.0 + license: MIT + license_family: MIT + purls: [] + size: 58292 + timestamp: 1636488182923 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 sha256: 7a2d27a936ceee6942ea4d397f9c7d136f12549d86f7617e8b6bad51e01a941f md5: ccb34fb14960ad8b125962d3d79b31a9 license: MIT @@ -731,13 +663,7 @@ packages: purls: [] size: 51348 timestamp: 1636488394370 -- kind: conda - name: libffi - version: 3.4.2 - build: h3422bc3_5 - build_number: 5 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 sha256: 41b3d13efb775e340e4dba549ab5c029611ea6918703096b2eaa9c015c0750ca md5: 086914b672be056eb70fd4285b6783b6 license: MIT @@ -745,29 +671,7 @@ packages: purls: [] size: 39020 timestamp: 1636488587153 -- kind: conda - name: libffi - version: 3.4.2 - build: h7f98852_5 - build_number: 5 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 - sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e - md5: d645c6d2ac96843a2bfaccd2d62b3ac3 - depends: - - libgcc-ng >=9.4.0 - license: MIT - license_family: MIT - purls: [] - size: 58292 - timestamp: 1636488182923 -- kind: conda - name: libffi - version: 3.4.2 - build: h8ffe710_5 - build_number: 5 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 sha256: 1951ab740f80660e9bc07d2ed3aefb874d78c107264fd810f24a1a6211d4b1a5 md5: 2c96d1b6915b408893f9472569dee135 depends: @@ -778,13 +682,7 @@ packages: purls: [] size: 42063 timestamp: 1636489106777 -- kind: conda - name: libgcc - version: 14.2.0 - build: h77fa898_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda sha256: 53eb8a79365e58849e7b1a068d31f4f9e718dc938d6f2c03e960345739a03569 md5: 3cb76c3f10d3bc7f1105b2fc9db984df depends: @@ -798,13 +696,7 @@ packages: purls: [] size: 848745 timestamp: 1729027721139 -- kind: conda - name: libgcc-ng - version: 14.2.0 - build: h69a702a_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda sha256: 3a76969c80e9af8b6e7a55090088bc41da4cffcde9e2c71b17f44d37b7cb87f7 md5: e39480b9ca41323497b05492a63bc35b depends: @@ -814,13 +706,7 @@ packages: purls: [] size: 54142 timestamp: 1729027726517 -- kind: 
conda - name: libgomp - version: 14.2.0 - build: h77fa898_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda sha256: 1911c29975ec99b6b906904040c855772ccb265a1c79d5d75c8ceec4ed89cd63 md5: cc3573974587f12dda90d96e3e55a702 depends: @@ -830,12 +716,7 @@ packages: purls: [] size: 460992 timestamp: 1729027639220 -- kind: conda - name: libnsl - version: 2.0.1 - build: hd590300_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 depends: @@ -845,28 +726,18 @@ packages: purls: [] size: 33408 timestamp: 1697359010159 -- kind: conda - name: libsqlite - version: 3.46.1 - build: h2466b09_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda - sha256: ef83f90961630bc54a95e48062b05cf9c9173a822ea01784288029613a45eea4 - md5: 8a7c1ad01f58623bfbae8d601db7cf3b +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda + sha256: 9851c049abafed3ee329d6c7c2033407e2fc269d33a75c071110ab52300002b0 + md5: 36f79405ab16bf271edb55b213836dac depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 876666 - timestamp: 1725354171439 -- kind: conda - name: libsqlite - version: 3.46.1 - build: h4b8f8c9_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda + size: 865214 + timestamp: 1725353659783 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda sha256: 1d075cb823f0cad7e196871b7c57961d669cbbb6cd0e798bf50cbf520dda65fb md5: 84de0078b58f899fc164303b0603ff0e depends: @@ -876,28 +747,7 @@ packages: purls: [] size: 908317 timestamp: 1725353652135 -- kind: conda - name: libsqlite - version: 3.46.1 - build: hadc24fc_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda - sha256: 9851c049abafed3ee329d6c7c2033407e2fc269d33a75c071110ab52300002b0 - md5: 36f79405ab16bf271edb55b213836dac - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - libzlib >=1.3.1,<2.0a0 - license: Unlicense - purls: [] - size: 865214 - timestamp: 1725353659783 -- kind: conda - name: libsqlite - version: 3.46.1 - build: hc14010f_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda sha256: 3725f962f490c5d44dae326d5f5b2e3c97f71a6322d914ccc85b5ddc2e50d120 md5: 58050ec1724e58668d0126a1615553fa depends: @@ -907,12 +757,18 @@ packages: purls: [] size: 829500 timestamp: 1725353720793 -- kind: conda - name: libuuid - version: 2.38.1 - build: h0b41bf4_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda + sha256: ef83f90961630bc54a95e48062b05cf9c9173a822ea01784288029613a45eea4 + md5: 8a7c1ad01f58623bfbae8d601db7cf3b + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + 
license: Unlicense + purls: [] + size: 876666 + timestamp: 1725354171439 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 md5: 40b61aab5c7ba9ff276c41cfffe6b80b depends: @@ -922,93 +778,67 @@ packages: purls: [] size: 33601 timestamp: 1680112270483 -- kind: conda - name: libzlib - version: 1.3.1 - build: h2466b09_2 - build_number: 2 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda - sha256: ba945c6493449bed0e6e29883c4943817f7c79cbff52b83360f7b341277c6402 - md5: 41fbfac52c601159df6c01f875de31b9 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 constrains: - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 55476 - timestamp: 1727963768015 -- kind: conda - name: libzlib - version: 1.3.1 - build: h8359307_2 - build_number: 2 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda - sha256: ce34669eadaba351cd54910743e6a2261b67009624dbc7daeeafdef93616711b - md5: 369964e85dc26bfe78f41399b366c435 + size: 60963 + timestamp: 1727963148474 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda + sha256: 8412f96504fc5993a63edf1e211d042a1fd5b1d51dedec755d2058948fcced09 + md5: 003a54a4e32b02f7355b50a837e699da depends: - - __osx >=11.0 + - __osx >=10.13 constrains: - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 46438 - timestamp: 1727963202283 -- kind: conda - name: libzlib - version: 1.3.1 - build: hb9d3cd8_2 - build_number: 2 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 - md5: edb0dca6bc32e4f4789199455a1dbeb8 + size: 57133 + timestamp: 1727963183990 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda + sha256: ce34669eadaba351cd54910743e6a2261b67009624dbc7daeeafdef93616711b + md5: 369964e85dc26bfe78f41399b366c435 depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 + - __osx >=11.0 constrains: - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 60963 - timestamp: 1727963148474 -- kind: conda - name: libzlib - version: 1.3.1 - build: hd23fc13_2 - build_number: 2 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda - sha256: 8412f96504fc5993a63edf1e211d042a1fd5b1d51dedec755d2058948fcced09 - md5: 003a54a4e32b02f7355b50a837e699da + size: 46438 + timestamp: 1727963202283 +- conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda + sha256: ba945c6493449bed0e6e29883c4943817f7c79cbff52b83360f7b341277c6402 + md5: 41fbfac52c601159df6c01f875de31b9 depends: - - __osx >=10.13 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 constrains: - zlib 1.3.1 *_2 license: Zlib license_family: Other purls: [] - size: 57133 - timestamp: 1727963183990 -- kind: pypi + size: 55476 + timestamp: 1727963768015 +- pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl name: matplotlib-inline 
version: 0.1.7 - url: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl sha256: df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca requires_dist: - traitlets requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl name: mypy version: 1.12.1 - url: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl sha256: 1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735 requires_dist: - typing-extensions>=4.6.0 @@ -1019,10 +849,9 @@ packages: - setuptools>=50 ; extra == 'mypyc' - lxml ; extra == 'reports' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl name: mypy version: 1.12.1 - url: https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl sha256: a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6 requires_dist: - typing-extensions>=4.6.0 @@ -1033,10 +862,9 @@ packages: - setuptools>=50 ; extra == 'mypyc' - lxml ; extra == 'reports' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl name: mypy version: 1.12.1 - url: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl sha256: 673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811 requires_dist: - typing-extensions>=4.6.0 @@ -1047,10 +875,9 @@ packages: - setuptools>=50 ; extra == 'mypyc' - lxml ; extra == 'reports' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl name: mypy version: 1.12.1 - url: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl sha256: 02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66 requires_dist: - typing-extensions>=4.6.0 @@ -1061,34 +888,12 @@ packages: - setuptools>=50 ; extra == 'mypyc' - lxml ; extra == 'reports' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl name: mypy-extensions version: 1.0.0 - url: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl sha256: 4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d requires_python: '>=3.5' -- kind: conda - name: ncurses - version: '6.5' - build: h7bae524_1 - build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda - sha256: 27d0b9ff78ad46e1f3a6c96c479ab44beda5f96def88e2fe626e0a49429d8afc - md5: cb2b0ea909b97b3d70cd3921d1445e1a - depends: - - 
__osx >=11.0 - license: X11 AND BSD-3-Clause - purls: [] - size: 802321 - timestamp: 1724658775723 -- kind: conda - name: ncurses - version: '6.5' - build: he02047a_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda sha256: 6a1d5d8634c1a07913f1c525db6455918cbc589d745fac46d9d6e30340c8731a md5: 70caf8bb6cf39a0b6b7efc885f51c0fe depends: @@ -1098,13 +903,7 @@ packages: purls: [] size: 889086 timestamp: 1724658547447 -- kind: conda - name: ncurses - version: '6.5' - build: hf036a51_1 - build_number: 1 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda +- conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda sha256: b0b3180039ef19502525a2abd5833c00f9624af830fd391f851934d57bffb9af md5: e102bbf8a6ceeaf429deab8032fc8977 depends: @@ -1113,100 +912,61 @@ packages: purls: [] size: 822066 timestamp: 1724658603042 -- kind: pypi +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda + sha256: 27d0b9ff78ad46e1f3a6c96c479ab44beda5f96def88e2fe626e0a49429d8afc + md5: cb2b0ea909b97b3d70cd3921d1445e1a + depends: + - __osx >=11.0 + license: X11 AND BSD-3-Clause + purls: [] + size: 802321 + timestamp: 1724658775723 +- pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl name: nodeenv version: 1.9.1 - url: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl sha256: ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl name: nodejs-wheel-binaries version: 20.18.0 - url: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl sha256: f95fb0989dfc54fd6932850e589000a8d6fc902527cebe7afd747696561d94b8 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: nodejs-wheel-binaries version: 20.18.0 - url: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: 33b138288dbeb9aafc6d54f43fbca6545b37e8fd9cbb8f68275ff2a47d4fed07 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl name: nodejs-wheel-binaries version: 20.18.0 - url: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl sha256: 74273eab1c2423c04d034d3f707f517da32d3a2b20ca244b5667f3a4e38003ac requires_python: '>=3.7' -- kind: pypi +- pypi: 
https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl name: nodejs-wheel-binaries version: 20.18.0 - url: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl sha256: 51c0cecb429a111351a54346909e672a57b96233a363c79cc0a2bbdbfa397304 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl name: numpy version: 2.1.2 - url: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl sha256: faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1 requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: numpy version: 2.1.2 - url: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: e2b49c3c0804e8ecb05d59af8386ec2f74877f7ca8fd9c1e00be2672e4d399b1 requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl name: numpy version: 2.1.2 - url: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl sha256: b42a1a511c81cc78cbc4539675713bbcf9d9c3913386243ceff0e9429ca892fe requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl name: numpy version: 2.1.2 - url: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl sha256: f1eb068ead09f4994dec71c24b2844f1e4e4e013b9629f812f292f04bd1510d9 requires_python: '>=3.10' -- kind: conda - name: openssl - version: 3.3.2 - build: h2466b09_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda - sha256: a45c42f3577294e22ac39ddb6ef5a64fd5322e8a6725afefbf4f2b4109340bf9 - md5: 1dc86753693df5e3326bb8a85b74c589 - depends: - - ca-certificates - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: Apache-2.0 - license_family: Apache - purls: [] - size: 8396053 - timestamp: 1725412961673 -- kind: conda - name: openssl - version: 3.3.2 - build: h8359307_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda - sha256: 940fa01c4dc6152158fe8943e05e55a1544cab639df0994e3b35937839e4f4d1 - md5: 1773ebccdc13ec603356e8ff1db9e958 - depends: - - __osx >=11.0 - - ca-certificates - license: Apache-2.0 - license_family: Apache - purls: [] - size: 2882450 - timestamp: 1725410638874 -- kind: conda - name: openssl - version: 3.3.2 - build: hb9d3cd8_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda 
sha256: cee91036686419f6dd6086902acf7142b4916e1c4ba042e9ca23e151da012b6d md5: 4d638782050ab6faa27275bed57e9b4e depends: @@ -1218,12 +978,7 @@ packages: purls: [] size: 2891789 timestamp: 1725410790053 -- kind: conda - name: openssl - version: 3.3.2 - build: hd23fc13_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda +- conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda sha256: 2b75d4b56e45992adf172b158143742daeb316c35274b36f385ccb6644e93268 md5: 2ff47134c8e292868a4609519b1ea3b6 depends: @@ -1234,25 +989,46 @@ packages: purls: [] size: 2544654 timestamp: 1725410973572 -- kind: pypi +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda + sha256: 940fa01c4dc6152158fe8943e05e55a1544cab639df0994e3b35937839e4f4d1 + md5: 1773ebccdc13ec603356e8ff1db9e958 + depends: + - __osx >=11.0 + - ca-certificates + license: Apache-2.0 + license_family: Apache + purls: [] + size: 2882450 + timestamp: 1725410638874 +- conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda + sha256: a45c42f3577294e22ac39ddb6ef5a64fd5322e8a6725afefbf4f2b4109340bf9 + md5: 1dc86753693df5e3326bb8a85b74c589 + depends: + - ca-certificates + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 8396053 + timestamp: 1725412961673 +- pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl name: packaging version: '24.1' - url: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl sha256: 5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl name: pandas-stubs version: 2.2.3.241009 - url: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl sha256: 3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa requires_dist: - numpy>=1.23.5 - types-pytz>=2022.1.1 requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl name: parso version: 0.8.4 - url: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl sha256: a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18 requires_dist: - flake8==5.0.4 ; extra == 'qa' @@ -1261,27 +1037,18 @@ packages: - docopt ; extra == 'testing' - pytest ; extra == 'testing' requires_python: '>=3.6' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl name: pathspec version: 0.12.1 - url: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 requires_python: '>=3.8' -- kind: pypi +- pypi: 
https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl name: pexpect version: 4.9.0 - url: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 requires_dist: - ptyprocess>=0.5 -- kind: conda - name: pip - version: '24.2' - build: pyh8b19718_1 - build_number: 1 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda sha256: d820e5358bcb117fa6286e55d4550c60b0332443df62121df839eab2d11c890b md5: 6c78fbb8ddfd64bcb55b5cbafd2d2c43 depends: @@ -1294,10 +1061,9 @@ packages: - pkg:pypi/pip?source=hash-mapping size: 1237976 timestamp: 1724954490262 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl name: platformdirs version: 4.3.6 - url: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl sha256: 73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb requires_dist: - furo>=2024.8.6 ; extra == 'docs' @@ -1311,10 +1077,9 @@ packages: - pytest>=8.3.2 ; extra == 'test' - mypy>=1.11.2 ; extra == 'type' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl name: pluggy version: 1.5.0 - url: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl sha256: 44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 requires_dist: - pre-commit ; extra == 'dev' @@ -1322,10 +1087,9 @@ packages: - pytest ; extra == 'testing' - pytest-benchmark ; extra == 'testing' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl name: pre-commit version: 4.0.1 - url: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl sha256: efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878 requires_dist: - cfgv>=2.0.0 @@ -1334,30 +1098,26 @@ packages: - pyyaml>=5.1 - virtualenv>=20.10.0 requires_python: '>=3.9' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl name: prompt-toolkit version: 3.0.48 - url: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl sha256: f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e requires_dist: - wcwidth requires_python: '>=3.7.0' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl name: ptyprocess version: 0.7.0 - url: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl sha256: 
4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl name: pure-eval version: 0.2.3 - url: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 requires_dist: - pytest ; extra == 'tests' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl sha256: a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03 requires_dist: - numpy>=1.16.6 @@ -1367,10 +1127,9 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl sha256: e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4 requires_dist: - numpy>=1.16.6 @@ -1380,10 +1139,9 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl sha256: 2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3 requires_dist: - numpy>=1.16.6 @@ -1393,10 +1151,9 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl name: pyarrow version: 17.0.0 - url: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl sha256: 1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977 requires_dist: - numpy>=1.16.6 @@ -1406,27 +1163,24 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.8' -- kind: pypi +- pypi: . name: pyarrow-stubs version: '17.12' - path: . 
- sha256: 0438bdb5aab19874198fda9e04ff0fb7dacbe7f93164168be33a8331083bb05e + sha256: 2f77b4b3d47b6cb5662dbcf6035b5c5162910cfa5130405884b59071ed210bfa requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' editable: true -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl name: pygments version: 2.18.0 - url: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl sha256: b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a requires_dist: - colorama>=0.4.6 ; extra == 'windows-terminal' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl name: pyright version: 1.1.385 - url: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl sha256: e5b9a1b8d492e13004d822af94d07d235f2c7c158457293b51ab2214c8c5b375 requires_dist: - nodeenv>=1.6.0 @@ -1436,19 +1190,18 @@ packages: - twine>=3.4.1 ; extra == 'dev' - nodejs-wheel-binaries ; extra == 'nodejs' requires_python: '>=3.7' -- kind: conda - name: python - version: 3.11.0 - build: h3ba56d0_1_cpython +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda - sha256: 28a54d78cd2624a12bd2ceb0f1816b0cba9b4fd97df846b5843b3c1d51642ab2 - md5: 2aa7ca3702d9afd323ca34a9d98879d1 + sha256: 464f998e406b645ba34771bb53a0a7c2734e855ee78dd021aa4dedfdb65659b7 + md5: 8d14fc2aa12db370a443753c8230be1e depends: - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 - libffi >=3.4,<4.0a0 + - libgcc-ng >=12 + - libnsl >=2.0.0,<2.1.0a0 - libsqlite >=3.40.0,<4.0a0 + - libuuid >=2.32.1,<3.0a0 - libzlib >=1.2.13,<2.0.0a0 - ncurses >=6.3,<7.0a0 - openssl >=3.0.7,<4.0a0 @@ -1460,50 +1213,37 @@ packages: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 14492975 - timestamp: 1673699560906 -- kind: conda - name: python - version: 3.11.0 - build: hcf16a7b_0_cpython - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 - sha256: 20d1f1b5dc620b745c325844545fd5c0cdbfdb2385a0e27ef1507399844c8c6d - md5: 13ee3577afc291dabd2d9edc59736688 + size: 31476523 + timestamp: 1673700777998 +- conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda + build_number: 1 + sha256: 5c069c9908e48a4490a56d3752c0bc93c2fc93ab8d8328efc869fdc707618e9f + md5: 9ecfa530b33aefd0d22e0272336f638a depends: - bzip2 >=1.0.8,<2.0a0 - - libffi >=3.4.2,<3.5.0a0 - - libsqlite >=3.39.4,<4.0a0 + - libffi >=3.4,<4.0a0 + - libsqlite >=3.40.0,<4.0a0 - libzlib >=1.2.13,<2.0.0a0 - - openssl >=3.0.5,<4.0a0 + - ncurses >=6.3,<7.0a0 + - openssl >=3.0.7,<4.0a0 + - readline >=8.1.2,<9.0a0 - tk >=8.6.12,<8.7.0a0 - tzdata - - vc >=14.1,<15 - - vs2015_runtime >=14.16.27033 - - xz >=5.2.6,<5.3.0a0 + - xz >=5.2.6,<6.0a0 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 19819816 - timestamp: 1666678800085 -- kind: conda - name: python - version: 3.11.0 - build: he550d4f_1_cpython + size: 15410083 + timestamp: 1673762717308 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda 
build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda - sha256: 464f998e406b645ba34771bb53a0a7c2734e855ee78dd021aa4dedfdb65659b7 - md5: 8d14fc2aa12db370a443753c8230be1e + sha256: 28a54d78cd2624a12bd2ceb0f1816b0cba9b4fd97df846b5843b3c1d51642ab2 + md5: 2aa7ca3702d9afd323ca34a9d98879d1 depends: - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - libffi >=3.4,<4.0a0 - - libgcc-ng >=12 - - libnsl >=2.0.0,<2.1.0a0 - libsqlite >=3.40.0,<4.0a0 - - libuuid >=2.32.1,<3.0a0 - libzlib >=1.2.13,<2.0.0a0 - ncurses >=6.3,<7.0a0 - openssl >=3.0.7,<4.0a0 @@ -1515,65 +1255,49 @@ packages: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 31476523 - timestamp: 1673700777998 -- kind: conda - name: python - version: 3.11.0 - build: he7542f4_1_cpython - build_number: 1 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda - sha256: 5c069c9908e48a4490a56d3752c0bc93c2fc93ab8d8328efc869fdc707618e9f - md5: 9ecfa530b33aefd0d22e0272336f638a + size: 14492975 + timestamp: 1673699560906 +- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 + sha256: 20d1f1b5dc620b745c325844545fd5c0cdbfdb2385a0e27ef1507399844c8c6d + md5: 13ee3577afc291dabd2d9edc59736688 depends: - bzip2 >=1.0.8,<2.0a0 - - libffi >=3.4,<4.0a0 - - libsqlite >=3.40.0,<4.0a0 + - libffi >=3.4.2,<3.5.0a0 + - libsqlite >=3.39.4,<4.0a0 - libzlib >=1.2.13,<2.0.0a0 - - ncurses >=6.3,<7.0a0 - - openssl >=3.0.7,<4.0a0 - - readline >=8.1.2,<9.0a0 + - openssl >=3.0.5,<4.0a0 - tk >=8.6.12,<8.7.0a0 - tzdata - - xz >=5.2.6,<6.0a0 + - vc >=14.1,<15 + - vs2015_runtime >=14.16.27033 + - xz >=5.2.6,<5.3.0a0 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 15410083 - timestamp: 1673762717308 -- kind: pypi + size: 19819816 + timestamp: 1666678800085 +- pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: 3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl sha256: 1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl name: pyyaml version: 6.0.2 - url: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl sha256: e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl name: pyyaml version: 6.0.2 - url: 
https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl sha256: cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 requires_python: '>=3.8' -- kind: conda - name: readline - version: '8.2' - build: h8228510_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda sha256: 5435cf39d039387fbdc977b0a762357ea909a7694d9528ab40f005e9208744d7 md5: 47d31b792659ce70f470b5c82fdfb7a4 depends: @@ -1584,66 +1308,49 @@ packages: purls: [] size: 281456 timestamp: 1679532220005 -- kind: conda - name: readline - version: '8.2' - build: h92ec313_1 - build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda - sha256: a1dfa679ac3f6007362386576a704ad2d0d7a02e98f5d0b115f207a2da63e884 - md5: 8cbb776a2f641b943d413b3e19df71f4 +- conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda + sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568 + md5: f17f77f2acf4d344734bda76829ce14e depends: - ncurses >=6.3,<7.0a0 license: GPL-3.0-only license_family: GPL purls: [] - size: 250351 - timestamp: 1679532511311 -- kind: conda - name: readline - version: '8.2' - build: h9e318b2_1 - build_number: 1 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda - sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568 - md5: f17f77f2acf4d344734bda76829ce14e + size: 255870 + timestamp: 1679532707590 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda + sha256: a1dfa679ac3f6007362386576a704ad2d0d7a02e98f5d0b115f207a2da63e884 + md5: 8cbb776a2f641b943d413b3e19df71f4 depends: - ncurses >=6.3,<7.0a0 license: GPL-3.0-only license_family: GPL purls: [] - size: 255870 - timestamp: 1679532707590 -- kind: pypi + size: 250351 + timestamp: 1679532511311 +- pypi: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl name: ruff version: 0.7.0 - url: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl sha256: ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: ruff version: 0.7.0 - url: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl name: ruff version: 0.7.0 - url: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl sha256: 214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06 requires_python: '>=3.7' -- kind: pypi +- pypi: 
https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl name: ruff version: 0.7.0 - url: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl sha256: 496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737 requires_python: '>=3.7' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl sha256: fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 requires_dist: - numpy>=1.23.5,<2.3 @@ -1682,10 +1389,9 @@ packages: - doit>=0.36.0 ; extra == 'dev' - pydevtool ; extra == 'dev' requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl sha256: c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 requires_dist: - numpy>=1.23.5,<2.3 @@ -1724,10 +1430,9 @@ packages: - doit>=0.36.0 ; extra == 'dev' - pydevtool ; extra == 'dev' requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl sha256: 2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 requires_dist: - numpy>=1.23.5,<2.3 @@ -1766,10 +1471,9 @@ packages: - doit>=0.36.0 ; extra == 'dev' - pydevtool ; extra == 'dev' requires_python: '>=3.10' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl name: scipy version: 1.14.1 - url: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl sha256: 716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 requires_dist: - numpy>=1.23.5,<2.3 @@ -1808,13 +1512,7 @@ packages: - doit>=0.36.0 ; extra == 'dev' - pydevtool ; extra == 'dev' requires_python: '>=3.10' -- kind: conda - name: setuptools - version: 75.1.0 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda sha256: 6725235722095c547edd24275053c615158d6163f396550840aebd6e209e4738 md5: d5cd48392c67fb6849ba459c2c2b671f depends: @@ -1825,16 +1523,14 @@ packages: - pkg:pypi/setuptools?source=hash-mapping size: 777462 timestamp: 1727249510532 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl name: six 
version: 1.16.0 - url: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl sha256: 8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl name: stack-data version: 0.6.3 - url: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 requires_dist: - executing>=1.2.0 @@ -1845,13 +1541,18 @@ packages: - pygments ; extra == 'tests' - littleutils ; extra == 'tests' - cython ; extra == 'tests' -- kind: conda - name: tk - version: 8.6.13 - build: h1abcd95_1 - build_number: 1 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda + sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e + md5: d453b98d9c83e71da0741bb0ff4d76bc + depends: + - libgcc-ng >=12 + - libzlib >=1.2.13,<2.0.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3318875 + timestamp: 1699202167581 +- conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda sha256: 30412b2e9de4ff82d8c2a7e5d06a15f4f4fef1809a72138b6ccb53a33b26faf5 md5: bf830ba5afc507c6232d4ef0fb1a882d depends: @@ -1861,13 +1562,7 @@ packages: purls: [] size: 3270220 timestamp: 1699202389792 -- kind: conda - name: tk - version: 8.6.13 - build: h5083fa2_1 - build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda sha256: 72457ad031b4c048e5891f3f6cb27a53cb479db68a52d965f796910e71a403a8 md5: b50a57ba89c32b62428b71a875291c9b depends: @@ -1877,13 +1572,7 @@ packages: purls: [] size: 3145523 timestamp: 1699202432999 -- kind: conda - name: tk - version: 8.6.13 - build: h5226925_1 - build_number: 1 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda sha256: 2c4e914f521ccb2718946645108c9bd3fc3216ba69aea20c2c3cedbd8db32bb1 md5: fc048363eb8f03cd1737600a5d08aafe depends: @@ -1895,27 +1584,9 @@ packages: purls: [] size: 3503410 timestamp: 1699202577803 -- kind: conda - name: tk - version: 8.6.13 - build: noxft_h4845f30_101 - build_number: 101 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e - md5: d453b98d9c83e71da0741bb0ff4d76bc - depends: - - libgcc-ng >=12 - - libzlib >=1.2.13,<2.0.0a0 - license: TCL - license_family: BSD - purls: [] - size: 3318875 - timestamp: 1699202167581 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl name: traitlets version: 5.14.3 - url: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f requires_dist: - myst-parser ; extra == 'docs' 
@@ -1928,57 +1599,40 @@ packages: - pytest-mypy-testing ; extra == 'test' - pytest>=7.0,<8.2 ; extra == 'test' requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl name: trove-classifiers version: 2024.10.16 - url: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl sha256: 9b02a4cb49bd2e85c13e728ee461f4f332d6334736b18d61254c964643687144 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl name: types-cffi version: 1.16.0.20240331 - url: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl sha256: a363e5ea54a4eb6a4a105d800685fde596bc318089b025b27dee09849fe41ff0 requires_dist: - types-setuptools requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl name: types-pytz version: 2024.2.0.20241003 - url: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl sha256: 3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl name: types-setuptools version: 75.2.0.20241019 - url: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl sha256: 2e48ff3acd4919471e80d5e3f049cce5c177e108d5d36d2d4cee3fa4d4104258 requires_python: '>=3.8' -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl name: typing-extensions version: 4.12.2 - url: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl sha256: 04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d requires_python: '>=3.8' -- kind: conda - name: tzdata - version: 2024b - build: hc8b5060_0 - subdir: noarch - noarch: generic - url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf md5: 8ac3367aafb1cc0a068483c580af8015 license: LicenseRef-Public-Domain purls: [] size: 122354 timestamp: 1728047496079 -- kind: conda - name: ucrt - version: 10.0.22621.0 - build: h57928b3_1 - build_number: 1 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda sha256: db8dead3dd30fb1a032737554ce91e2819b43496a0db09927edf01c32b577450 md5: 6797b005cd0f439c4c5c9ac565783700 constrains: @@ -1987,13 +1641,7 @@ packages: purls: [] size: 559710 timestamp: 1728377334097 -- kind: conda - name: vc - version: '14.3' - build: ha32ba9b_22 - 
build_number: 22 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda sha256: 2a47c5bd8bec045959afada7063feacd074ad66b170c1ea92dd139b389fcf8fd md5: 311c9ba1dfdd2895a8cb08346ff26259 depends: @@ -2005,13 +1653,7 @@ packages: purls: [] size: 17447 timestamp: 1728400826998 -- kind: conda - name: vc14_runtime - version: 14.40.33810 - build: hcc2c482_22 - build_number: 22 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda sha256: 4c669c65007f88a7cdd560192f7e6d5679d191ac71610db724e18b2410964d64 md5: ce23a4b980ee0556a118ed96550ff3f3 depends: @@ -2023,10 +1665,9 @@ packages: purls: [] size: 750719 timestamp: 1728401055788 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl name: virtualenv version: 20.27.0 - url: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl sha256: 44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655 requires_dist: - distlib>=0.3.7,<1 @@ -2053,13 +1694,7 @@ packages: - setuptools>=68 ; extra == 'test' - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' requires_python: '>=3.8' -- kind: conda - name: vs2015_runtime - version: 14.40.33810 - build: h3bf8584_22 - build_number: 22 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda +- conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda sha256: 80aa9932203d65a96f817b8be4fafc176fb2b3fe6cf6899ede678b8f0317fbff md5: 8c6b061d44cafdfc8e8c6eb5f100caf0 depends: @@ -2069,20 +1704,13 @@ packages: purls: [] size: 17453 timestamp: 1728400827536 -- kind: pypi +- pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl name: wcwidth version: 0.2.13 - url: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 requires_dist: - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' -- kind: conda - name: wheel - version: 0.44.0 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda sha256: d828764736babb4322b8102094de38074dedfc71f5ff405c9dfee89191c14ebc md5: d44e3b085abcaef02983c6305b84b584 depends: @@ -2093,12 +1721,7 @@ packages: - pkg:pypi/wheel?source=hash-mapping size: 58585 timestamp: 1722797131787 -- kind: conda - name: xz - version: 5.2.6 - build: h166bdaf_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 sha256: 03a6d28ded42af8a347345f82f3eebdd6807a08526d47899a42d62d319609162 md5: 2161070d867d1b1204ea749c8eec4ef0 depends: @@ -2107,36 +1730,21 @@ packages: purls: [] size: 418368 timestamp: 1660346797927 -- kind: conda - name: xz - 
version: 5.2.6 - build: h57fd34a_0 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 - sha256: 59d78af0c3e071021cfe82dc40134c19dab8cdf804324b62940f5c8cd71803ec - md5: 39c6b54e94014701dd157f4f576ed211 - license: LGPL-2.1 and GPL-2.0 - purls: [] - size: 235693 - timestamp: 1660346961024 -- kind: conda - name: xz - version: 5.2.6 - build: h775f41a_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 sha256: eb09823f34cc2dd663c0ec4ab13f246f45dcd52e5b8c47b9864361de5204a1c8 md5: a72f9d4ea13d55d745ff1ed594747f10 license: LGPL-2.1 and GPL-2.0 purls: [] size: 238119 timestamp: 1660346964847 -- kind: conda - name: xz - version: 5.2.6 - build: h8d14728_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 + sha256: 59d78af0c3e071021cfe82dc40134c19dab8cdf804324b62940f5c8cd71803ec + md5: 39c6b54e94014701dd157f4f576ed211 + license: LGPL-2.1 and GPL-2.0 + purls: [] + size: 235693 + timestamp: 1660346961024 +- conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 sha256: 54d9778f75a02723784dc63aff4126ff6e6749ba21d11a6d03c1f4775f269fe0 md5: 515d77642eaa3639413c6b1bc3f94219 depends: diff --git a/pyproject.toml b/pyproject.toml index feaeae596be..711192833c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,8 +72,9 @@ extend-select = [ "PYI", # flake8-pyi ] ignore = [ - "PYI015", # assignment-default-in-stub "PYI011", # typed-argument-default-in-stub + "PYI015", # assignment-default-in-stub + "PYI063", # pep484-style-positional-only-parameter "N818", # error-suffix-on-exception-name ] From c5e18752502597ddecf3a8f9058bce6cad47eab9 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 6 Dec 2024 10:48:31 +0800 Subject: [PATCH 130/231] release 17.13 (#151) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 8a02a27c0b0..ec420d325b8 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . 
name: pyarrow-stubs - version: '17.12' - sha256: 2f77b4b3d47b6cb5662dbcf6035b5c5162910cfa5130405884b59071ed210bfa + version: '17.13' + sha256: ff54e71868ae632b2a283cf6b7bde18a113fe7a658f57bd8349872a9928ba7bc requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 711192833c0..84aca03a04b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.12" +version = "17.13" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From f0b575dd745bcba879d2527bd2ad748925c2b2f6 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 9 Dec 2024 10:22:18 +0800 Subject: [PATCH 131/231] fix: FileSystem metadata value should be str (#152) --- pyarrow-stubs/_fs.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 23ed3c27387..2484d1770d5 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -87,14 +87,14 @@ class FileSystem(_Weakrefable): path: str, compression: str | None = "detect", buffer_size: int | None = None, - metadata: dict[str, list[str]] | None = None, + metadata: dict[str, str] | None = None, ) -> NativeFile: ... def open_append_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None, - metadata: dict[str, list[str]] | None = None, + metadata: dict[str, str] | None = None, ): ... def normalize_path(self, path: str) -> str: ... From a2380d0bc6d3a2ea19261b3f2161d3ce191adf9b Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 9 Dec 2024 10:24:11 +0800 Subject: [PATCH 132/231] fix: FileSystemHandler metadata value should be str (#153) --- pyarrow-stubs/_fs.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 2484d1770d5..c17d987d16d 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -142,8 +142,8 @@ class FileSystemHandler(ABC): @abstractmethod def open_input_file(self, path: str) -> NativeFile: ... @abstractmethod - def open_output_stream(self, path: str, metadata: dict[str, list[str]]) -> NativeFile: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... @abstractmethod - def open_append_stream(self, path: str, metadata: dict[str, list[str]]) -> NativeFile: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... @abstractmethod def normalize_path(self, path: str) -> str: ... 
From 78cce86adec39c12ed3ee18c2843ff51221d4ea4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:18:23 +0800 Subject: [PATCH 133/231] [pre-commit.ci] pre-commit autoupdate (#154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.1 → v0.8.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.1...v0.8.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index acd19e2ae14..6b52bc58f25 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + rev: v0.8.2 hooks: - id: ruff args: [--fix] From 1432d63d4e97269c14645a42f77db4a6756d2f0c Mon Sep 17 00:00:00 2001 From: ben-freist <93315290+ben-freist@users.noreply.github.com> Date: Sat, 14 Dec 2024 09:45:42 +0100 Subject: [PATCH 134/231] improve coverage for pyarrow.struct typehint (#157) --- pyarrow-stubs/__lib_pxi/types.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 7501c1ce88e..05973a1db5e 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -426,7 +426,7 @@ def dictionary( index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered ) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... def struct( - fields: Iterable[Field | tuple[str, Field]] | Mapping[str, Field], + fields: Iterable[Field | tuple[str, Field] | tuple[str, DataType]] | Mapping[str, Field], ) -> StructType: ... def sparse_union( child_fields: list[Field], type_codes: list[int] | None = None From cedd690b5dc4def3b72c0b59f513a808f95e8718 Mon Sep 17 00:00:00 2001 From: Jiahao Yuan Date: Mon, 16 Dec 2024 10:25:47 +0800 Subject: [PATCH 135/231] fix: ipc typing (#159) --- pyarrow-stubs/__lib_pxi/ipc.pyi | 8 ++++++-- pyarrow-stubs/ipc.pyi | 14 +++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi index f768bb185cb..6400ac033fa 100644 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -41,9 +41,13 @@ class ReadStats(NamedTuple): class IpcReadOptions(_Weakrefable): ensure_native_endian: bool use_threads: bool - include_fields: list + included_fields: list[int] def __init__( - self, *, ensure_native_endian: bool = True, use_threads: bool = True, include_fields: list + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + included_fields: list[int] | None = None, ) -> None: ... class IpcWriteOptions(_Weakrefable): diff --git a/pyarrow-stubs/ipc.pyi b/pyarrow-stubs/ipc.pyi index bedcaecaa5b..c7f2af004d4 100644 --- a/pyarrow-stubs/ipc.pyi +++ b/pyarrow-stubs/ipc.pyi @@ -27,7 +27,7 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader): self, source: bytes | lib.Buffer | lib.NativeFile | IOBase, *, - options: IpcReadOptions | None, + options: IpcReadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> None: ... 
@@ -57,8 +57,8 @@ class RecordBatchFileWriter(lib._RecordBatchFileWriter): sink: str | lib.NativeFile | IOBase, schema: lib.Schema, *, - options: IpcReadOptions | None, - memory_pool: lib.MemoryPool | None = None, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, ) -> None: ... def new_stream( @@ -71,21 +71,21 @@ def new_stream( def open_stream( source: bytes | lib.Buffer | lib.NativeFile | IOBase, *, - options: IpcReadOptions | None, + options: IpcReadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchStreamReader: ... def new_file( sink: str | lib.NativeFile | IOBase, schema: lib.Schema, *, - options: IpcReadOptions | None, - memory_pool: lib.MemoryPool | None = None, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, ) -> RecordBatchFileWriter: ... def open_file( source: bytes | lib.Buffer | lib.NativeFile | IOBase, footer_offset: int | None = None, *, - options: IpcReadOptions | None, + options: IpcReadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> RecordBatchFileReader: ... def serialize_pandas( From b1edf35d7fe23ba97a0057a878340a13b7f77a9f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 16 Dec 2024 10:28:02 +0800 Subject: [PATCH 136/231] release 17.14 (#160) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index ec420d325b8..198de02e2cf 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . name: pyarrow-stubs - version: '17.13' - sha256: ff54e71868ae632b2a283cf6b7bde18a113fe7a658f57bd8349872a9928ba7bc + version: '17.14' + sha256: bc183632c251bc2debaa65d51232ee6e2e36ea19300688ee3ed870f1e6621f9b requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 84aca03a04b..8102f49352a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.13" +version = "17.14" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 51607ee9592fb63974fb394b030c291b12524df7 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 20 Dec 2024 10:04:39 +0800 Subject: [PATCH 137/231] fix: add missing param 'nbytes' to NativeFile.read (#163) --- pyarrow-stubs/__lib_pxi/io.pyi | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi index 8ec9a71bcd3..d14eaa3937b 100644 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -48,9 +48,7 @@ class NativeFile(_Weakrefable): def seek(self, position: int, whence: int = 0) -> int: ... def flush(self) -> None: ... def write(self, data: bytes | SupportPyBuffer) -> int: ... - def read( - self, - ) -> bytes: ... + def read(self, nbytes: int | None = None) -> bytes: ... def get_stream(self, file_offset: int, nbytes: int) -> Self: ... def read_at(self) -> bytes: ... def read1(self) -> bytes: ... From c8d236cf7464bdce26fa2af371552d26014f156f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 20 Dec 2024 10:06:38 +0800 Subject: [PATCH 138/231] release 17.15 (#164) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 198de02e2cf..8663eb75f70 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . 
name: pyarrow-stubs - version: '17.14' - sha256: bc183632c251bc2debaa65d51232ee6e2e36ea19300688ee3ed870f1e6621f9b + version: '17.15' + sha256: bd51960a88f4ac3c62c02baf6edb2c986407127ebf1c7c0b90c837602113e862 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 8102f49352a..0010f22b1bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.14" +version = "17.15" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 39e7e2169372081b318875f72c5d7ca9960514f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:07:50 +0800 Subject: [PATCH 139/231] [pre-commit.ci] pre-commit autoupdate (#161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.2 → v0.8.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.2...v0.8.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b52bc58f25..e891edfa0f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.2 + rev: v0.8.3 hooks: - id: ruff args: [--fix] From bd078682efb7d5bb574b9060aeb6736f67abc08d Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jan 2025 21:57:30 -0500 Subject: [PATCH 140/231] Add 'None' as a valid argument for partitioning to the various parquet reading functions (#166) --- pyarrow-stubs/parquet/core.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index de2716a391b..ac1d6db8aef 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -188,7 +188,7 @@ class ParquetDataset: read_dictionary: list[str] | None = None, memory_map: bool = False, buffer_size: int = 0, - partitioning: str | list[str] | Partitioning = "hive", + partitioning: str | list[str] | Partitioning | None = "hive", ignore_prefixes: list[str] | None = None, pre_buffer: bool = True, coerce_int96_timestamp_unit: str | None = None, @@ -227,7 +227,7 @@ def read_table( read_dictionary: list[str] | None = None, memory_map: bool = False, buffer_size: int = 0, - partitioning: str | list[str] | Partitioning = "hive", + partitioning: str | list[str] | Partitioning | None = "hive", filesystem: SupportedFileSystem | None = None, filters: Expression | FilterTuple | list[FilterTuple] | None = None, use_legacy_dataset: bool | None = None, From 2ab76bf1dd66e67cc2946dda93ca51c2d3f348b2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:58:10 +0800 Subject: [PATCH 141/231] [pre-commit.ci] pre-commit autoupdate (#165) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.3 → v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.3...v0.8.6) - [github.com/pre-commit/mirrors-mypy: v1.13.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.13.0...v1.14.1) Co-authored-by: 
pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e891edfa0f9..6cbd25ea09a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,13 +19,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.6 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 + rev: v1.14.1 hooks: - id: mypy From 08a68ca9f5af0858d1fc74ee5fdf5d7fa055cf7a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sun, 19 Jan 2025 15:08:29 +0800 Subject: [PATCH 142/231] fix: should use Collection[Array] instead list[Array] (#170) "List" is invariant -- see https://mypy.readthedocs.io/en/stable/common_issues.html#variance Consider using "Sequence" instead, which is covariant --- pyarrow-stubs/__lib_pxi/array.pyi | 7 ++++--- pyarrow-stubs/__lib_pxi/table.pyi | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 741eae99df1..19db0c33834 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -16,6 +16,7 @@ else: from typing_extensions import TypeAlias from typing import ( Any, + Collection, Generic, Iterable, Iterator, @@ -1095,7 +1096,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): buffers: list[Buffer], null_count: int = -1, offset=0, - children: list[Array[Scalar[_DataTypeT]]] | None = None, + children: Collection[Array[Scalar[_DataTypeT]]] | None = None, ) -> Array[Scalar[_DataTypeT]]: ... @property def null_count(self) -> int: ... @@ -1403,14 +1404,14 @@ class UnionArray(Array[scalar.UnionScalar]): def from_dense( types: Int8Array, value_offsets: Int32Array, - children: list[Array], + children: Collection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, ) -> UnionArray: ... @staticmethod def from_sparse( types: Int8Array, - children: list[Array], + children: Collection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, ) -> UnionArray: ... diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 4a297092ca8..733733ae5da 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -15,6 +15,7 @@ else: from typing_extensions import TypeAlias from typing import ( Any, + Collection, Generator, Generic, Iterable, @@ -469,7 +470,7 @@ class RecordBatch(_Tabular[Array]): @classmethod def from_arrays( cls, - arrays: list[Array] | list[ChunkedArray], + arrays: Collection[Array] | Collection[ChunkedArray], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, @@ -550,7 +551,7 @@ class Table(_Tabular[ChunkedArray]): @classmethod def from_arrays( cls, - arrays: list[Array] | list[ChunkedArray], + arrays: Collection[Array] | Collection[ChunkedArray], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, @@ -601,7 +602,7 @@ class Table(_Tabular[ChunkedArray]): def record_batch( data: dict[str, list | Array] - | list[Array] + | Collection[Array] | pd.DataFrame | SupportArrowArray | SupportArrowDeviceArray, @@ -618,7 +619,7 @@ def table( ) -> Table: ... 
@overload def table( - data: list[Array | ChunkedArray] + data: Collection[Array | ChunkedArray] | pd.DataFrame | SupportArrowArray | SupportArrowStream From 2015653e8c97b9756556b5f294d3a21d47b6fee1 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sun, 19 Jan 2025 15:20:35 +0800 Subject: [PATCH 143/231] fix: update type hints for path_or_paths and source parameters in ParquetDataset and read_table (#171) --- pyarrow-stubs/parquet/core.pyi | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index ac1d6db8aef..d3ac8f9f976 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -13,7 +13,6 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from _typeshed import StrPath from pyarrow import _parquet from pyarrow._compute import Expression from pyarrow._fs import FileSystem, SupportedFileSystem @@ -180,7 +179,10 @@ class ParquetWriter: class ParquetDataset: def __init__( self, - path_or_paths: SingleOrList[StrPath | NativeFile | IO], + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], filesystem: SupportedFileSystem | None = None, schema: Schema | None = None, *, @@ -218,7 +220,7 @@ class ParquetDataset: def partitioning(self) -> Partitioning: ... def read_table( - source: SingleOrList[StrPath | NativeFile | IO], + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], *, columns: list | None = None, use_threads: bool = True, From b39224fcd97509022aacf855b6fef029eb50cfdd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Jan 2025 15:20:53 +0800 Subject: [PATCH 144/231] [pre-commit.ci] pre-commit autoupdate (#167) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.6 → v0.9.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.6...v0.9.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6cbd25ea09a..48def005ade 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.6 + rev: v0.9.1 hooks: - id: ruff args: [--fix] From ea108cbe8132a2299d34728b9572bb95d03be386 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sun, 19 Jan 2025 15:25:56 +0800 Subject: [PATCH 145/231] release 17.16 (#172) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0010f22b1bc..f4d2a9ae130 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.15" +version = "17.16" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 080bb4ac875e2627107f666132489c6900b0b556 Mon Sep 17 00:00:00 2001 From: Pim de Haan Date: Wed, 5 Feb 2025 06:22:58 +0100 Subject: [PATCH 146/231] Fixed pa.fixed_shape_tensor (#175) --- pyarrow-stubs/__lib_pxi/types.pyi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 
05973a1db5e..049a3bce2d1 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -1,7 +1,7 @@ import datetime as dt import sys -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from decimal import Decimal if sys.version_info >= (3, 11): @@ -447,9 +447,9 @@ def run_end_encoded( ) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... def fixed_shape_tensor( value_type: _ValueT, - shape: tuple[list[int], ...], - dim_names: tuple[list[str], ...] | None = None, - permutation: tuple[list[int], ...] | None = None, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + permutation: Sequence[int] | None = None, ) -> FixedShapeTensorType[_ValueT]: ... @overload def type_for_alias(name: Literal["null"]) -> NullType: ... From d97063876720e6a5edda7eb15f4efe07c31b8296 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Feb 2025 13:23:23 +0800 Subject: [PATCH 147/231] [pre-commit.ci] pre-commit autoupdate (#173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.1 → v0.9.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.1...v0.9.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48def005ade..2c83fdbe8fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.1 + rev: v0.9.4 hooks: - id: ruff args: [--fix] From 5cad2791376480fd2e962244e8e38cc1569e99cd Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 20 Feb 2025 00:11:24 +0000 Subject: [PATCH 148/231] fix: Preserve generic in `ChunkedArray.type` (#177) --- pyarrow-stubs/__lib_pxi/table.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 733733ae5da..8a84a7f1db3 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -52,7 +52,7 @@ from .io import Buffer from .ipc import RecordBatchReader from .scalar import Int64Scalar, Scalar from .tensor import Tensor -from .types import DataType, _AsPyType, _BasicDataType, _DataTypeT +from .types import _AsPyType, _BasicDataType, _DataType_CoT, _DataTypeT _Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) @@ -60,7 +60,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @property def data(self) -> Self: ... @property - def type(self) -> DataType: ... + def type(self: ChunkedArray[Scalar[_DataType_CoT]]) -> _DataType_CoT: ... def length(self) -> int: ... __len__ = length def to_string( From c0cc658fe41ce9be02162ceafa0f1511fe83347f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 20 Feb 2025 09:58:15 +0800 Subject: [PATCH 149/231] release 17.17 (#178) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 8663eb75f70..7180f2960d8 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . 
name: pyarrow-stubs - version: '17.15' - sha256: bd51960a88f4ac3c62c02baf6edb2c986407127ebf1c7c0b90c837602113e862 + version: '17.17' + sha256: 96e9ee04674e9b9b12604c7c7952872e23624bb8de4b03d29a20abd5d58c90d8 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index f4d2a9ae130..1c27886f6ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.16" +version = "17.17" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From a0d47aa2ee59668b82ac1d2e0773b4d856b57705 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 10:24:33 +0800 Subject: [PATCH 150/231] [pre-commit.ci] pre-commit autoupdate (#176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.4 → v0.9.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.4...v0.9.6) - [github.com/pre-commit/mirrors-mypy: v1.14.1 → v1.15.0](https://github.com/pre-commit/mirrors-mypy/compare/v1.14.1...v1.15.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c83fdbe8fa..5a7cd9b68bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,13 +19,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.4 + rev: v0.9.6 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.14.1 + rev: v1.15.0 hooks: - id: mypy From c482d8f146a8cf2cff55cc4060148fe4883f9309 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Thu, 20 Feb 2025 10:24:44 +0800 Subject: [PATCH 151/231] fix: support to construct ListArray with primitive type (#179) --- pyarrow-stubs/__lib_pxi/array.pyi | 61 +++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 19db0c33834..55b71d2d9d7 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1227,7 +1227,7 @@ class ListArray(BaseListArray[_Scalar_CoT]): @classmethod def from_arrays( cls, - offsets: Int32Array, + offsets: Int32Array | list[int], values: Array[Scalar[_DataTypeT]], *, type: None = None, @@ -1238,8 +1238,63 @@ class ListArray(BaseListArray[_Scalar_CoT]): @classmethod def from_arrays( cls, - offsets: Int32Array, - values: Array, + offsets: Int32Array | list[int], + values: list[int], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[float], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[str], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.StringType]]: ... 
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[bytes], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list, + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array | list, *, type: _DataTypeT, pool: MemoryPool | None = None, From ef3896fdb942aed17dc3480987cb0d05aee87e16 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:43:41 +0000 Subject: [PATCH 152/231] fix: Avoid `chunked_array` overlapping overloads (#183) --- pyarrow-stubs/__lib_pxi/table.pyi | 53 ++++++++++++++----------------- pyarrow-stubs/_stubs_typing.pyi | 6 ++++ 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 8a84a7f1db3..3346239820e 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -37,6 +37,7 @@ from pyarrow._stubs_typing import ( NullEncoding, NullSelectionBehavior, Order, + PyScalar, SupportArrowArray, SupportArrowDeviceArray, SupportArrowStream, @@ -46,7 +47,7 @@ from pyarrow.interchange.dataframe import _PyArrowDataFrame from pyarrow.lib import Field, MemoryPool, MonthDayNano, Schema from . import scalar -from .array import Array, NullableIterable, StructArray, _CastAs, _PandasConvertible +from .array import Array, StructArray, _CastAs, _PandasConvertible from .device import DeviceAllocationType from .io import Buffer from .ipc import RecordBatchReader @@ -145,77 +146,71 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @classmethod def _import_from_c_capsule(cls, stream) -> Self: ... -@overload # type: ignore[overload-overlap] -def chunked_array( - values: NullableIterable[bool], - type: None = None, -) -> ChunkedArray[scalar.BooleanScalar]: ... -@overload -def chunked_array( - values: NullableIterable[int], - type: None = None, -) -> ChunkedArray[scalar.Int64Scalar]: ... @overload def chunked_array( - values: NullableIterable[float], + values: Iterable[bool] | Iterable[int] | Iterable[float], type: None = None, -) -> ChunkedArray[scalar.DoubleScalar]: ... +) -> ( + ChunkedArray[scalar.BooleanScalar] + | ChunkedArray[scalar.Int64Scalar] + | ChunkedArray[scalar.DoubleScalar] +): ... @overload def chunked_array( - values: NullableIterable[Decimal], + values: Iterable[Decimal], type: None = None, ) -> ChunkedArray[scalar.Decimal128Scalar]: ... @overload def chunked_array( - values: NullableIterable[dict[str, Any]], + values: Iterable[dict[str, Any]], type: None = None, ) -> ChunkedArray[scalar.StructScalar]: ... @overload def chunked_array( - values: NullableIterable[dt.datetime], + values: Iterable[dt.datetime] | Iterable[dt.date], type: None = None, -) -> ChunkedArray[scalar.TimestampScalar]: ... +) -> ChunkedArray[scalar.TimestampScalar] | ChunkedArray[scalar.Date32Scalar]: ... @overload def chunked_array( - values: NullableIterable[dt.date], - type: None = None, -) -> ChunkedArray[scalar.Date32Scalar]: ... -@overload -def chunked_array( - values: NullableIterable[dt.time], + values: Iterable[dt.time], type: None = None, ) -> ChunkedArray[scalar.Time64Scalar]: ... 
@overload def chunked_array( - values: NullableIterable[dt.timedelta], + values: Iterable[dt.timedelta], type: None = None, ) -> ChunkedArray[scalar.DurationScalar]: ... @overload def chunked_array( - values: NullableIterable[MonthDayNano], + values: Iterable[MonthDayNano], type: None = None, ) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... @overload def chunked_array( - values: NullableIterable[str], + values: Iterable[str], type: None = None, ) -> ChunkedArray[scalar.StringScalar]: ... @overload def chunked_array( - values: NullableIterable[bytearray], + values: Iterable[bytearray], type: None = None, ) -> ChunkedArray[scalar.BinaryScalar]: ... @overload def chunked_array( - values: NullableIterable[list], + values: Iterable[list], type: None = None, ) -> ChunkedArray[scalar.ListScalar]: ... @overload def chunked_array( - values: NullableIterable[_Scalar_CoT], + values: Iterable[_Scalar_CoT], type: None = None, ) -> ChunkedArray[_Scalar_CoT]: ... @overload +def chunked_array( + values: Iterable[PyScalar | None], + type: None = None, +) -> ChunkedArray[Any]: ... +@overload def chunked_array( values: Iterable | SupportArrowStream | SupportArrowArray, type: _DataTypeT, diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index 8981dfa3c85..8d665fca56f 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -1,3 +1,6 @@ +import datetime as dt + +from decimal import Decimal from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar import numpy as np @@ -25,6 +28,9 @@ NullEncoding: TypeAlias = Literal["mask", "encode"] NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] Mask: TypeAlias = list[bool | None] | NDArray[np.bool_] | BooleanArray Indices: TypeAlias = list[int] | NDArray[np.integer] | IntegerArray +PyScalar: TypeAlias = ( + bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta +) _T = TypeVar("_T") SingleOrList: TypeAlias = list[_T] | _T From c9e4858238c491bf719c137a1290cf573df1419b Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:45:02 +0000 Subject: [PATCH 153/231] fix: Add placeholder annotations to `pc.if_else` (#182) --- pyarrow-stubs/_stubs_typing.pyi | 1 + pyarrow-stubs/compute.pyi | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index 8d665fca56f..dbcb2c5647a 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -10,6 +10,7 @@ from numpy.typing import NDArray from .__lib_pxi.array import BooleanArray, IntegerArray ArrayLike: TypeAlias = Any +ScalarLike: TypeAlias = Any Order: TypeAlias = Literal["ascending", "descending"] JoinType: TypeAlias = Literal[ "left semi", diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 5cb73bee154..d764795f454 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -82,7 +82,7 @@ from pyarrow._compute import register_aggregate_function as register_aggregate_f from pyarrow._compute import register_scalar_function as register_scalar_function from pyarrow._compute import register_tabular_function as register_tabular_function from pyarrow._compute import register_vector_function as register_vector_function - +from pyarrow._stubs_typing import ArrayLike, ScalarLike from . 
import lib _P = ParamSpec("_P") @@ -1826,7 +1826,14 @@ def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... def coalesce( *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None ) -> _ScalarOrArrayT: ... -def if_else(cond, left, right, /, *, memory_pool: lib.MemoryPool | None = None): ... +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... # ========================= 2.21 Structural transforms ========================= From 9276d8cf5efbc591c230b29b8c225f55f1a862ea Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 3 Mar 2025 00:32:40 +0000 Subject: [PATCH 154/231] fix: Widen `Array` to `Array | ChunkedArray` (#181) --- pyarrow-stubs/_stubs_typing.pyi | 7 ++- pyarrow-stubs/compute.pyi | 102 +++++++++++--------------------- 2 files changed, 38 insertions(+), 71 deletions(-) diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index dbcb2c5647a..6c25aa214ca 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -1,5 +1,6 @@ import datetime as dt +from collections.abc import Sequence from decimal import Decimal from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar @@ -7,7 +8,7 @@ import numpy as np from numpy.typing import NDArray -from .__lib_pxi.array import BooleanArray, IntegerArray +from .compute import BooleanArray, IntegerArray ArrayLike: TypeAlias = Any ScalarLike: TypeAlias = Any @@ -27,8 +28,8 @@ Compression: TypeAlias = Literal[ ] NullEncoding: TypeAlias = Literal["mask", "encode"] NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] -Mask: TypeAlias = list[bool | None] | NDArray[np.bool_] | BooleanArray -Indices: TypeAlias = list[int] | NDArray[np.integer] | IntegerArray +Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray +Indices: TypeAlias = Sequence[int] | NDArray[np.integer] | IntegerArray PyScalar: TypeAlias = ( bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta ) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index d764795f454..db288d49ab1 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -94,27 +94,38 @@ def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... 
# ============= compute functions ============= _DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) -NumericScalar: TypeAlias = ( +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) +_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) +_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) +ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] +ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT + +SignedIntegerScalar: TypeAlias = ( lib.Scalar[lib.Int8Type] | lib.Scalar[lib.Int16Type] | lib.Scalar[lib.Int32Type] | lib.Scalar[lib.Int64Type] - | lib.Scalar[lib.Uint8Type] +) +UnsignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.Uint8Type] | lib.Scalar[lib.Uint16Type] | lib.Scalar[lib.Uint32Type] | lib.Scalar[lib.Uint64Type] - | lib.Scalar[lib.Float16Type] - | lib.Scalar[lib.Float32Type] - | lib.Scalar[lib.Float64Type] - | lib.Scalar[lib.Decimal128Type] - | lib.Scalar[lib.Decimal256Type] ) +IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar +FloatScalar: TypeAlias = ( + lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] +) +DecimalScalar: TypeAlias = lib.Scalar[lib.Decimal128Type] | lib.Scalar[lib.Decimal256Type] +NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar BinaryScalar: TypeAlias = ( lib.Scalar[lib.BinaryType] | lib.Scalar[lib.LargeBinaryType] | lib.Scalar[lib.FixedSizeBinaryType] ) StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar ListScalar: TypeAlias = ( lib.ListScalar[_DataTypeT] | lib.LargeListScalar[_DataTypeT] @@ -131,73 +142,35 @@ TemporalScalar: TypeAlias = ( | lib.DurationScalar | lib.MonthDayNanoIntervalScalar ) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar -_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + _NumericOrTemporalT = TypeVar("_NumericOrTemporalT", bound=NumericOrTemporalScalar) -NumericArray: TypeAlias = lib.NumericArray[_ScalarT] | lib.ChunkedArray[_ScalarT] -_NumericArrayT = TypeVar("_NumericArrayT", bound=lib.NumericArray) -NumericOrDurationArray: TypeAlias = ( - lib.NumericArray | lib.Array[lib.DurationScalar] | lib.ChunkedArray -) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) -NumericOrTemporalArray: TypeAlias = ( - lib.NumericArray | lib.Array[TemporalScalar] | lib.ChunkedArray[TemporalScalar] -) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalT] _NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) -BooleanArray: TypeAlias = lib.BooleanArray | lib.ChunkedArray[lib.BooleanScalar] -FloatScalar: TypeAlias = lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] -DecimalScalar: TypeAlias = lib.Scalar[lib.Decimal128Type] | 
lib.Scalar[lib.Decimal256Type] +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] _FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) -FloatArray: TypeAlias = ( - lib.NumericArray[lib.FloatScalar] - | lib.NumericArray[lib.DoubleScalar] - | lib.ChunkedArray[lib.FloatScalar] - | lib.ChunkedArray[lib.DoubleScalar] -) - +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] _FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) _StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) -StringArray: TypeAlias = ( - lib.StringArray - | lib.LargeStringArray - | lib.ChunkedArray[lib.StringScalar] - | lib.ChunkedArray[lib.LargeStringScalar] -) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] _StringArrayT = TypeVar("_StringArrayT", bound=StringArray) _BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) -BinaryArray: TypeAlias = ( - lib.BinaryArray - | lib.LargeBinaryArray - | lib.ChunkedArray[lib.BinaryScalar] - | lib.ChunkedArray[lib.LargeBinaryScalar] -) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] _BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) -StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar _StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) StringOrBinaryArray: TypeAlias = StringArray | BinaryArray _StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) _TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) -TemporalArray: TypeAlias = ( - lib.Date32Array - | lib.Date64Array - | lib.Time32Array - | lib.Time64Array - | lib.TimestampArray - | lib.DurationArray - | lib.MonthDayNanoIntervalArray - | lib.ChunkedArray[lib.Date32Scalar] - | lib.ChunkedArray[lib.Date64Scalar] - | lib.ChunkedArray[lib.Time32Scalar] - | lib.ChunkedArray[lib.Time64Scalar] - | lib.ChunkedArray[lib.DurationScalar] - | lib.ChunkedArray[lib.MonthDayNanoIntervalScalar] -) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] _TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) -_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) -_ArrayT = TypeVar("_ArrayT", bound=lib.Array) -_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar) # =============================== 1. Aggregation =============================== # ========================= 1.1 functions ========================= @@ -940,18 +913,11 @@ not_equal = _clone_signature(equal) @overload def max_element_wise( - *args: _ScalarT, - skip_nulls: bool = True, - options: ElementWiseAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: ... -@overload -def max_element_wise( - *args: _ArrayT, + *args: ScalarOrArray[_Scalar_CoT], skip_nulls: bool = True, options: ElementWiseAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _ArrayT: ... +) -> _Scalar_CoT: ... @overload def max_element_wise( *args: Expression, @@ -960,7 +926,7 @@ def max_element_wise( memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... 
-min_element_wise = _clone_signature(equal) +min_element_wise = _clone_signature(max_element_wise) # ========================= 2.6 Logical functions ========================= @overload From e9b6405cd151b38e2c6ea8e759e089ce795e4630 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 4 Mar 2025 03:53:20 +0000 Subject: [PATCH 155/231] fix: add `pc.fill_null` (#185) - https://arrow.apache.org/docs/python/generated/pyarrow.compute.fill_null.html - https://github.com/narwhals-dev/narwhals/blob/05e47b27ebe27b24196cee5956d07748d65a62ee/narwhals/_arrow/series.py#L675 --- pyarrow-stubs/compute.pyi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index db288d49ab1..f8442f8d664 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1792,6 +1792,9 @@ def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... def coalesce( *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None ) -> _ScalarOrArrayT: ... + +fill_null = coalesce + def if_else( cond: ArrayLike | ScalarLike, left: ArrayLike | ScalarLike, From f858257eeb1cce3fcd76af036bb094d8148016ef Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:07:02 +1100 Subject: [PATCH 156/231] fix: Allow Table.from_arrays to take a list containing a mix of Array and ChunkedArray (#187) Update table.pyi --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 3346239820e..d78dd573e71 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -465,7 +465,7 @@ class RecordBatch(_Tabular[Array]): @classmethod def from_arrays( cls, - arrays: Collection[Array] | Collection[ChunkedArray], + arrays: Collection[Array | ChunkedArray], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, From c41da36b45475392106d20edcd710e616c9ec2df Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 17 Mar 2025 11:09:50 +0800 Subject: [PATCH 157/231] release 17.18 (#188) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 7180f2960d8..93cfa4c0271 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . 
name: pyarrow-stubs - version: '17.17' - sha256: 96e9ee04674e9b9b12604c7c7952872e23624bb8de4b03d29a20abd5d58c90d8 + version: '17.18' + sha256: 1b33e996299e98c8db6a9a9b1dc282ef2dbb787364331a289fd9d15b662a2391 requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 1c27886f6ea..3de26cc16f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.17" +version = "17.18" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From dda1d1d92e91fd9b4dde85cf158e63eebdc2ceae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:11:05 +0800 Subject: [PATCH 158/231] [pre-commit.ci] pre-commit autoupdate (#180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.6 → v0.9.10](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.6...v0.9.10) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5a7cd9b68bf..b69291f1dc4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.6 + rev: v0.9.10 hooks: - id: ruff args: [--fix] From bd918229079076a224684d6dc75fea553ab75f97 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Tue, 18 Mar 2025 01:06:49 +1100 Subject: [PATCH 159/231] fix: from_arrays for both Table and RecordBatch (#189) --- pyarrow-stubs/__lib_pxi/table.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index d78dd573e71..95c05201e66 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -465,7 +465,7 @@ class RecordBatch(_Tabular[Array]): @classmethod def from_arrays( cls, - arrays: Collection[Array | ChunkedArray], + arrays: Collection[Array], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, @@ -546,7 +546,7 @@ class Table(_Tabular[ChunkedArray]): @classmethod def from_arrays( cls, - arrays: Collection[Array] | Collection[ChunkedArray], + arrays: Collection[Array | ChunkedArray], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, From 93f756d39cc1cb0470c244863f269b0d513d19ea Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:07:32 +0000 Subject: [PATCH 160/231] fix: resolve some `pa.compute` overlaps (#184) * fix: resolve overlapping `compute.(add|divide)` * fix: copy from non-cloned signature * fix: resolve overlapping `compute.exp` * fix: resolve overlapping `compute.power` * fix: resolve overlapping `compute.equal` * fix: resolve overlapping `compute.and_` --- pyarrow-stubs/compute.pyi | 208 +++++++++++++++++++++++++++++--------- 1 file changed, 162 insertions(+), 46 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index f8442f8d664..8b295631e90 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -118,6 +118,7 @@ FloatScalar: TypeAlias = ( 
lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] ) DecimalScalar: TypeAlias = lib.Scalar[lib.Decimal128Type] | lib.Scalar[lib.Decimal256Type] +NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar BinaryScalar: TypeAlias = ( lib.Scalar[lib.BinaryType] @@ -145,14 +146,14 @@ TemporalScalar: TypeAlias = ( NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar -_NumericOrTemporalT = TypeVar("_NumericOrTemporalT", bound=NumericOrTemporalScalar) +_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) _NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) -NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalT] +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] _NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] @@ -368,8 +369,12 @@ abs_checked = _clone_signature(abs) @overload def add( - x: _NumericOrTemporalT, y: _NumericOrTemporalT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrTemporalT: ... + x: _NumericOrTemporalScalarT, + y: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... @overload def add( x: _NumericOrTemporalArrayT, @@ -380,55 +385,87 @@ def add( ) -> _NumericOrTemporalArrayT: ... @overload def add( - x: NumericScalar, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> NumericScalar: ... + x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... @overload def add( - x: TemporalScalar, y: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> TemporalScalar: ... + x: NumericOrTemporalScalar, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... @overload def add( - x: NumericOrTemporalArray | NumericOrTemporalScalar, - y: NumericOrTemporalArray | NumericOrTemporalScalar, + x: _NumericOrTemporalArrayT, + y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None, -) -> NumericOrTemporalArray: ... +) -> _NumericOrTemporalArrayT: ... @overload def add( - x: Expression | Any, y: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None + x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... add_checked = _clone_signature(add) @overload def divide( - dividend: NumericScalar, - divisor: NumericScalar, + dividend: _NumericOrTemporalScalarT, + divisor: _NumericOrTemporalScalarT, /, *, memory_pool: lib.MemoryPool | None = None, -) -> NumericScalar: ... +) -> _NumericOrTemporalScalarT: ... 
@overload def divide( - dividend: TemporalScalar, - divisor: TemporalScalar, + dividend: _NumericOrTemporalArrayT, + divisor: _NumericOrTemporalArrayT, /, *, memory_pool: lib.MemoryPool | None = None, -) -> TemporalScalar: ... +) -> _NumericOrTemporalArrayT: ... @overload def divide( - dividend: NumericOrTemporalArray | NumericOrTemporalScalar, - divisor: NumericOrTemporalArray | NumericOrTemporalScalar, + dividend: Expression, + divisor: Expression, /, *, memory_pool: lib.MemoryPool | None = None, -) -> NumericArray: ... +) -> Expression: ... @overload def divide( - dividend: Expression | Any, - divisor: Expression | Any, + dividend: NumericOrTemporalScalar, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def divide( + dividend: Expression, + divisor: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None, @@ -438,25 +475,28 @@ divide_checked = _clone_signature(divide) @overload def exp( - exponent: lib.FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatArray: ... + exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT: ... @overload def exp( - exponent: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None + exponent: ArrayOrChunkedArray[NonFloatNumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> lib.DoubleArray: ... @overload def exp( - exponent: lib.FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar: ... + exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatScalarT: ... @overload def exp( - exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None + exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.DoubleScalar: ... @overload def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... multiply = _clone_signature(add) -multiply_checked = _clone_signature(multiply) +multiply_checked = _clone_signature(add) @overload def negate( @@ -481,11 +521,31 @@ def power( ) -> _NumericScalarT: ... @overload def power( - base: NumericScalar, exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> NumericScalar: ... + base: _NumericArrayT, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: Expression, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... @overload def power( base: _NumericArrayT, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, exponent: _NumericArrayT, /, *, @@ -493,16 +553,16 @@ def power( ) -> _NumericArrayT: ... @overload def power( - base: NumericScalar | NumericArray, - exponent: NumericScalar | NumericArray, + base: NumericScalar, + exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None, -) -> NumericArray: ... +) -> Expression: ... 
@overload def power( - base: Expression | Any, - exponent: Expression | Any, + base: Expression, + exponent: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None, @@ -534,7 +594,7 @@ def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expr sqrt_checked = _clone_signature(sqrt) subtract = _clone_signature(add) -subtract_checked = _clone_signature(subtract) +subtract_checked = _clone_signature(add) # ========================= 2.1 Bit-wise functions ========================= @overload @@ -890,16 +950,40 @@ def equal( ) -> lib.BooleanScalar: ... @overload def equal( - x: lib.Scalar | lib.Array | lib.ChunkedArray, - y: lib.Scalar | lib.Array | lib.ChunkedArray, + x: lib.Scalar, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... @overload def equal( - x: Expression | Any, - y: Expression | Any, + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: lib.Scalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: Expression, + y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None, @@ -935,16 +1019,48 @@ def and_( ) -> lib.BooleanScalar: ... @overload def and_( - x: lib.BooleanScalar | BooleanArray, - y: lib.BooleanScalar | BooleanArray, + x: BooleanArray, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: lib.BooleanScalar, + y: BooleanArray, /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... @overload def and_( - x: Expression | Any, - y: Expression | Any, + x: BooleanArray, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: lib.BooleanScalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: Expression, + y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None, From ecad016671777e8e2f9184df2e116ee1508afefe Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:08:17 +0000 Subject: [PATCH 161/231] fix: Include `Array` in `chunked_array` overload (#190) https://github.com/narwhals-dev/narwhals/pull/2113/commits/0237f7a97abd1f818b20638130d111904d6578f6 --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 95c05201e66..77dcbd744a4 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -202,7 +202,7 @@ def chunked_array( ) -> ChunkedArray[scalar.ListScalar]: ... @overload def chunked_array( - values: Iterable[_Scalar_CoT], + values: Iterable[_Scalar_CoT] | Iterable[Array[_Scalar_CoT]], type: None = None, ) -> ChunkedArray[_Scalar_CoT]: ... 
@overload From 90d373de8bbb99137c1071b18f8597f35733eb8a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 17 Mar 2025 22:14:19 +0800 Subject: [PATCH 162/231] release 17.19 (#191) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 93cfa4c0271..3fe618fedbe 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1165,8 +1165,8 @@ packages: requires_python: '>=3.8' - pypi: . name: pyarrow-stubs - version: '17.18' - sha256: 1b33e996299e98c8db6a9a9b1dc282ef2dbb787364331a289fd9d15b662a2391 + version: '17.19' + sha256: 40f1b52d277c0317b1a6c4fa404e2fcebcaa5be0927bdccd415b3cc606e3dbaa requires_dist: - pyarrow>=17 requires_python: '>=3.8,<4' diff --git a/pyproject.toml b/pyproject.toml index 3de26cc16f6..7343c2a6480 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.18" +version = "17.19" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 8f779099262a3923f1ce2cef7b53770bda45441e Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Wed, 2 Apr 2025 07:46:21 +0200 Subject: [PATCH 163/231] Add Scalar, Array and Type classes for Json & Uuid (#194) * Add Scalar, Array and Type classes for Json & Uuid * Formatting fixes --- pyarrow-stubs/__lib_pxi/array.pyi | 5 +++++ pyarrow-stubs/__lib_pxi/scalar.pyi | 12 +++++++++++- pyarrow-stubs/__lib_pxi/types.pyi | 5 +++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 55b71d2d9d7..5cf5521b24e 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1587,6 +1587,9 @@ class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): typ: types.BaseExtensionType, storage: _ArrayT ) -> ExtensionArray[_ArrayT]: ... +class JsonArray(ExtensionArray[_ArrayT]): ... +class UuidArray(ExtensionArray[_ArrayT]): ... + class FixedShapeTensorArray(ExtensionArray[_ArrayT]): def to_numpy_ndarray(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... @@ -1648,6 +1651,8 @@ __all__ = [ "StructArray", "RunEndEncodedArray", "ExtensionArray", + "JsonArray", + "UuidArray", "FixedShapeTensorArray", "concat_arrays", "_empty_array", diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index f55b1832b27..7d23f783769 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -4,6 +4,7 @@ import datetime as dt import sys from decimal import Decimal +from uuid import UUID if sys.version_info >= (3, 11): from typing import Self @@ -18,7 +19,7 @@ from typing import Any, Generic, Iterator, Mapping, overload import numpy as np from pyarrow._compute import CastOptions -from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, UuidType, _Weakrefable from typing_extensions import TypeVar from . import types @@ -60,6 +61,8 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... @overload + def as_py(self: Scalar[types.ExtensionType]) -> Any: ... + @overload def as_py(self: Scalar[types._BasicDataType[_AsPyType]]) -> _AsPyType: ... @overload def as_py( @@ -254,6 +257,11 @@ class ExtensionScalar(Scalar[types.ExtensionType]): @staticmethod def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... 
+class JsonScalar(ExtensionScalar): ... + +class UuidScalar(ExtensionScalar): + def as_py(self: Scalar[UuidType]) -> UUID | None: ... + class FixedShapeTensorScalar(ExtensionScalar): def to_numpy(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... @@ -453,6 +461,8 @@ __all__ = [ "RunEndEncodedScalar", "UnionScalar", "ExtensionScalar", + "JsonScalar", + "UuidScalar", "FixedShapeTensorScalar", "scalar", ] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 049a3bce2d1..66a423e7bd2 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -229,6 +229,9 @@ class ExtensionType(BaseExtensionType): @classmethod def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: ... +class JsonType(BaseExtensionType): ... +class UuidType(BaseExtensionType): ... + class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): @property def value_type(self) -> _ValueT: ... @@ -653,6 +656,8 @@ __all__ = [ "RunEndEncodedType", "BaseExtensionType", "ExtensionType", + "JsonType", + "UuidType", "FixedShapeTensorType", "PyExtensionType", "UnknownExtensionType", From e36f4ec12894c67b88b10c6e98b8e7fd2ae471ad Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Apr 2025 13:48:21 +0800 Subject: [PATCH 164/231] [pre-commit.ci] pre-commit autoupdate (#192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.10 → v0.11.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.10...v0.11.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b69291f1dc4..eca0c65f212 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.10 + rev: v0.11.2 hooks: - id: ruff args: [--fix] From 2dcbf664d85164f769149014c2f503afd649e94a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 2 Apr 2025 13:59:29 +0800 Subject: [PATCH 165/231] Revert "Add Scalar, Array and Type classes for Json & Uuid" (#195) Revert "Add Scalar, Array and Type classes for Json & Uuid (#194)" This reverts commit 8f779099262a3923f1ce2cef7b53770bda45441e. --- pyarrow-stubs/__lib_pxi/array.pyi | 5 ----- pyarrow-stubs/__lib_pxi/scalar.pyi | 12 +----------- pyarrow-stubs/__lib_pxi/types.pyi | 5 ----- 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 5cf5521b24e..55b71d2d9d7 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1587,9 +1587,6 @@ class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): typ: types.BaseExtensionType, storage: _ArrayT ) -> ExtensionArray[_ArrayT]: ... -class JsonArray(ExtensionArray[_ArrayT]): ... -class UuidArray(ExtensionArray[_ArrayT]): ... - class FixedShapeTensorArray(ExtensionArray[_ArrayT]): def to_numpy_ndarray(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... 
@@ -1651,8 +1648,6 @@ __all__ = [ "StructArray", "RunEndEncodedArray", "ExtensionArray", - "JsonArray", - "UuidArray", "FixedShapeTensorArray", "concat_arrays", "_empty_array", diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 7d23f783769..f55b1832b27 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -4,7 +4,6 @@ import datetime as dt import sys from decimal import Decimal -from uuid import UUID if sys.version_info >= (3, 11): from typing import Self @@ -19,7 +18,7 @@ from typing import Any, Generic, Iterator, Mapping, overload import numpy as np from pyarrow._compute import CastOptions -from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, UuidType, _Weakrefable +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable from typing_extensions import TypeVar from . import types @@ -61,8 +60,6 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... @overload - def as_py(self: Scalar[types.ExtensionType]) -> Any: ... - @overload def as_py(self: Scalar[types._BasicDataType[_AsPyType]]) -> _AsPyType: ... @overload def as_py( @@ -257,11 +254,6 @@ class ExtensionScalar(Scalar[types.ExtensionType]): @staticmethod def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... -class JsonScalar(ExtensionScalar): ... - -class UuidScalar(ExtensionScalar): - def as_py(self: Scalar[UuidType]) -> UUID | None: ... - class FixedShapeTensorScalar(ExtensionScalar): def to_numpy(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... @@ -461,8 +453,6 @@ __all__ = [ "RunEndEncodedScalar", "UnionScalar", "ExtensionScalar", - "JsonScalar", - "UuidScalar", "FixedShapeTensorScalar", "scalar", ] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 66a423e7bd2..049a3bce2d1 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -229,9 +229,6 @@ class ExtensionType(BaseExtensionType): @classmethod def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: ... -class JsonType(BaseExtensionType): ... -class UuidType(BaseExtensionType): ... - class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): @property def value_type(self) -> _ValueT: ... @@ -656,8 +653,6 @@ __all__ = [ "RunEndEncodedType", "BaseExtensionType", "ExtensionType", - "JsonType", - "UuidType", "FixedShapeTensorType", "PyExtensionType", "UnknownExtensionType", From 3cafcc7037e53515a1790a54cee68f11ba8adf5a Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 6 Apr 2025 01:19:03 +0100 Subject: [PATCH 166/231] fix: Add missing `pc.equal` overload (#196) --- pyarrow-stubs/compute.pyi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 8b295631e90..32aa2eafc03 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -965,6 +965,14 @@ def equal( memory_pool: lib.MemoryPool | None = None, ) -> lib.BooleanArray: ... @overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... 
+@overload def equal( x: Expression, y: Expression, From 24a9cb53fa43520b71c6d51cf63e03b81ddab89e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 7 Apr 2025 10:53:26 +0800 Subject: [PATCH 167/231] feat: support pyarrow 19.0 (#198) * build: upgrade pyarrow min version to 19.0 * feat: support pyarrow 19.0 * omit mypy bool8 override error --- pixi.lock | 52 ++++++++++++++---------------- pyarrow-stubs/__init__.pyi | 4 +++ pyarrow-stubs/__lib_pxi/array.pyi | 38 ++++++++++++++++++++-- pyarrow-stubs/__lib_pxi/scalar.pyi | 16 +++++++++ pyarrow-stubs/__lib_pxi/table.pyi | 12 ++++++- pyarrow-stubs/__lib_pxi/types.pyi | 46 ++++++++++++++++++++++++++ pyarrow-stubs/_cuda.pyi | 4 +++ pyarrow-stubs/_dataset.pyi | 5 ++- pyarrow-stubs/_flight.pyi | 23 +++++++++++-- pyarrow-stubs/_parquet.pyi | 2 ++ pyarrow-stubs/_substrait.pyi | 13 +++++++- pyarrow-stubs/compute.pyi | 7 +++- pyarrow-stubs/pandas_compat.pyi | 3 ++ pyarrow-stubs/substrait.pyi | 6 ++++ pyarrow-stubs/types.pyi | 2 ++ pyarrow-stubs/util.pyi | 2 ++ pyproject.toml | 10 +++--- 17 files changed, 202 insertions(+), 43 deletions(-) diff --git a/pixi.lock b/pixi.lock index 3fe618fedbe..91c3b168307 100644 --- a/pixi.lock +++ b/pixi.lock @@ -58,7 +58,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl @@ -119,7 +119,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl @@ -180,7 +180,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl @@ -242,7 +242,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl @@ -1115,61 +1115,57 @@ packages: sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 requires_dist: - pytest ; extra == 'tests' -- pypi: https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl +- pypi: https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl name: pyarrow - version: 17.0.0 - sha256: a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03 + version: 19.0.1 + sha256: 7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 requires_dist: - - numpy>=1.16.6 - pytest ; 
extra == 'test' - hypothesis ; extra == 'test' - cffi ; extra == 'test' - pytz ; extra == 'test' - pandas ; extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl name: pyarrow - version: 17.0.0 - sha256: e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4 + version: 19.0.1 + sha256: cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 requires_dist: - - numpy>=1.16.6 - pytest ; extra == 'test' - hypothesis ; extra == 'test' - cffi ; extra == 'test' - pytz ; extra == 'test' - pandas ; extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl name: pyarrow - version: 17.0.0 - sha256: 2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3 + version: 19.0.1 + sha256: 49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 requires_dist: - - numpy>=1.16.6 - pytest ; extra == 'test' - hypothesis ; extra == 'test' - cffi ; extra == 'test' - pytz ; extra == 'test' - pandas ; extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl name: pyarrow - version: 17.0.0 - sha256: 1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977 + version: 19.0.1 + sha256: 008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 requires_dist: - - numpy>=1.16.6 - pytest ; extra == 'test' - hypothesis ; extra == 'test' - cffi ; extra == 'test' - pytz ; extra == 'test' - pandas ; extra == 'test' - requires_python: '>=3.8' + requires_python: '>=3.9' - pypi: . 
name: pyarrow-stubs - version: '17.19' - sha256: 40f1b52d277c0317b1a6c4fa404e2fcebcaa5be0927bdccd415b3cc606e3dbaa + version: '19.0' + sha256: 11d773de16a71722518e21d5695a44c2193d5af6bed94c8788c6ccdfdd00a049 requires_dist: - - pyarrow>=17 - requires_python: '>=3.8,<4' + - pyarrow>=19 + requires_python: '>=3.9,<4' editable: true - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl name: pygments diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index 6f1edbc6d5e..11c20c53042 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -89,6 +89,8 @@ from pyarrow.lib import ( Time64Type, DurationType, FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, Decimal128Type, Decimal256Type, BaseExtensionType, @@ -403,6 +405,8 @@ __all__ = [ "Time64Type", "DurationType", "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", "Decimal128Type", "Decimal256Type", "BaseExtensionType", diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 55b71d2d9d7..08eb41bbcd7 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -38,7 +38,15 @@ from pyarrow._stubs_typing import ( SupportArrowArray, SupportArrowDeviceArray, ) -from pyarrow.lib import Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + MonthDayNano, + Tensor, + _Weakrefable, +) from . import scalar, types from .device import DeviceAllocationType @@ -673,6 +681,14 @@ def nulls( size: int, types: types.Float64Type, memory_pool: MemoryPool | None = None ) -> DoubleArray: ... @overload +def nulls( + size: int, types: types.Decimal32Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, types: types.Decimal64Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload def nulls( size: int, types: types.Decimal128Type, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @@ -1165,6 +1181,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @property def offset(self) -> int: ... def buffers(self) -> list[Buffer | None]: ... + def copy_to(self, destination: MemoryManager | Device) -> Self: ... def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... @classmethod def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... @@ -1214,6 +1231,8 @@ class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... +class Decimal32Array(FixedSizeBinaryArray): ... +class Decimal64Array(FixedSizeBinaryArray): ... class Decimal128Array(FixedSizeBinaryArray): ... class Decimal256Array(FixedSizeBinaryArray): ... @@ -1536,6 +1555,7 @@ class StructArray(Array[scalar.StructScalar]): fields: list[Field] | None = None, mask=None, memory_pool: MemoryPool | None = None, + type: types.StructType | None = None, ) -> StructArray: ... def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: ... @@ -1587,11 +1607,23 @@ class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): typ: types.BaseExtensionType, storage: _ArrayT ) -> ExtensionArray[_ArrayT]: ... +class JsonArray(ExtensionArray[_ArrayT]): ... 
+class UuidArray(ExtensionArray[_ArrayT]): ... + class FixedShapeTensorArray(ExtensionArray[_ArrayT]): def to_numpy_ndarray(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... - @staticmethod - def from_numpy_ndarray(obj: np.ndarray) -> FixedShapeTensorArray: ... + @classmethod + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: ... + +class OpaqueArray(ExtensionArray[_ArrayT]): ... + +class Bool8Array(ExtensionArray): + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: ... + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: ... def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: ... def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: ... diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index f55b1832b27..7dde6b0b1d3 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -120,6 +120,8 @@ class Int64Scalar(Scalar[types.Int64Type]): ... class HalfFloatScalar(Scalar[types.Float16Type]): ... class FloatScalar(Scalar[types.Float32Type]): ... class DoubleScalar(Scalar[types.Float64Type]): ... +class Decimal32Scalar(Scalar[types.Decimal32Type]): ... +class Decimal64Scalar(Scalar[types.Decimal64Type]): ... class Decimal128Scalar(Scalar[types.Decimal128Type]): ... class Decimal256Scalar(Scalar[types.Decimal256Type]): ... class Date32Scalar(Scalar[types.Date32Type]): ... @@ -352,6 +354,20 @@ def scalar( memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Float64Type]]: ... @overload +def scalar( + value: CollectionValue[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal32Type]]: ... +@overload +def scalar( + value: CollectionValue[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal64Type]]: ... +@overload def scalar( value: CollectionValue[Decimal], *, diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 77dcbd744a4..6a262181945 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -44,7 +44,7 @@ from pyarrow._stubs_typing import ( ) from pyarrow.compute import Expression from pyarrow.interchange.dataframe import _PyArrowDataFrame -from pyarrow.lib import Field, MemoryPool, MonthDayNano, Schema +from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema from . import scalar from .array import Array, StructArray, _CastAs, _PandasConvertible @@ -145,6 +145,8 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): def __arrow_c_stream__(self, requested_schema=None) -> Any: ... @classmethod def _import_from_c_capsule(cls, stream) -> Self: ... + @property + def is_cpu(self) -> bool: ... @overload def chunked_array( @@ -507,6 +509,7 @@ class RecordBatch(_Tabular[Array]): def device_type(self) -> DeviceAllocationType: ... @property def is_cpu(self) -> bool: ... + def copy_to(self, destination: MemoryManager | Device) -> Self: ... def table_to_blocks(options, table: Table, categories, extension_columns): ... @@ -594,6 +597,8 @@ class Table(_Tabular[ChunkedArray]): right_by: str | list[str] | None = None, ) -> Self: ... def __arrow_c_stream__(self, requested_schema=None): ... 
+ @property + def is_cpu(self) -> bool: ... def record_batch( data: dict[str, list | Array] @@ -641,6 +646,10 @@ class TableGroupBy: @property def _use_threads(self) -> bool: ... +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: ... + __all__ = [ "ChunkedArray", "chunked_array", @@ -652,4 +661,5 @@ __all__ = [ "table", "concat_tables", "TableGroupBy", + "concat_batches", ] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 049a3bce2d1..391bd5db9e0 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -111,6 +111,18 @@ class FixedSizeBinaryType(_BasicDataType[Decimal]): ... _Precision = TypeVar("_Precision") _Scale = TypeVar("_Scale") +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + @property + def scale(self) -> _Scale: ... + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + @property + def scale(self) -> _Scale: ... + class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): @property def precision(self) -> _Precision: ... @@ -185,6 +197,10 @@ class StructType(DataType): def __len__(self) -> int: ... def __iter__(self) -> Iterator[Field]: ... __getitem__ = field + @property + def names(self) -> list[str]: ... + @property + def fields(self) -> list[Field]: ... class UnionType(DataType): @property @@ -239,6 +255,16 @@ class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): @property def permutation(self) -> list[int] | None: ... +class Bool8Type(BaseExtensionType): ... +class UuidType(BaseExtensionType): ... +class JsonType(BaseExtensionType): ... + +class OpaqueType(BaseExtensionType): + @property + def type_name(self) -> str: ... + @property + def vendor_name(self) -> str: ... + class PyExtensionType(ExtensionType): def __init__(self, storage_type: DataType) -> None: ... @classmethod @@ -367,6 +393,14 @@ def float16() -> Float16Type: ... def float32() -> Float32Type: ... def float64() -> Float64Type: ... @overload +def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... +@overload +def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... +@overload +def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... +@overload +def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... +@overload def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... @overload def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... @@ -445,12 +479,16 @@ def union( def run_end_encoded( run_end_type: _RunEndType, value_type: _BasicValueT ) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... +def json_(storage_type: DataType = ...) -> JsonType: ... +def uuid() -> UuidType: ... def fixed_shape_tensor( value_type: _ValueT, shape: Sequence[int], dim_names: Sequence[str] | None = None, permutation: Sequence[int] | None = None, ) -> FixedShapeTensorType[_ValueT]: ... +def bool8() -> Bool8Type: ... +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ... @overload def type_for_alias(name: Literal["null"]) -> NullType: ... 
@overload @@ -636,6 +674,8 @@ __all__ = [ "Time64Type", "DurationType", "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", "Decimal128Type", "Decimal256Type", "ListType", @@ -686,6 +726,8 @@ __all__ = [ "float16", "float32", "float64", + "decimal32", + "decimal64", "decimal128", "decimal256", "string", @@ -707,7 +749,11 @@ __all__ = [ "dense_union", "union", "run_end_encoded", + "json_", + "uuid", "fixed_shape_tensor", + "bool8", + "opaque", "type_for_alias", "ensure_type", "schema", diff --git a/pyarrow-stubs/_cuda.pyi b/pyarrow-stubs/_cuda.pyi index 80a911b6f92..c7533b6621d 100644 --- a/pyarrow-stubs/_cuda.pyi +++ b/pyarrow-stubs/_cuda.pyi @@ -23,6 +23,10 @@ class Context(lib._Weakrefable): def bytes_allocated(self) -> int: ... def get_device_address(self, address: int) -> int: ... def new_buffer(self, nbytes: int) -> CudaBuffer: ... + @property + def memory_manager(self) -> lib.MemoryManager: ... + @property + def device(self) -> lib.Device: ... def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: ... def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... def buffer_from_data( diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index b6828593783..ebc450f4886 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -331,6 +331,7 @@ class JsonFragmentScanOptions(FragmentScanOptions): class Partitioning(lib._Weakrefable): def parse(self, path: str) -> Expression: ... + def format(self, expr: Expression) -> tuple[str, str]: ... @property def schema(self) -> lib.Schema: ... @@ -532,4 +533,6 @@ class _ScanNodeOptions(ExecNodeOptions): def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... class ScanNodeOptions(_ScanNodeOptions): - def __init__(self, dataset: Dataset, **kwargs) -> None: ... + def __init__( + self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs + ) -> None: ... diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 5e23745e70c..74b561ca3db 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -22,6 +22,7 @@ from .lib import ( RecordBatchReader, Schema, Table, + TimestampScalar, _CRecordBatchWriter, _Weakrefable, ) @@ -145,12 +146,22 @@ class Location(_Weakrefable): def for_grpc_unix(path: str | bytes) -> Location: ... class FlightEndpoint(_Weakrefable): - def __init__(self, ticket: Ticket | str | bytes, locations: list[str | Location]): ... + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): ... @property def ticket(self) -> Ticket: ... @property def locations(self) -> list[Location]: ... def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: ... + @property + def app_metadata(self) -> bytes | str: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... @@ -168,8 +179,10 @@ class FlightInfo(_Weakrefable): schema: Schema, descriptor: FlightDescriptor, endpoints: list[FlightEndpoint], - total_records: int, - total_bytes: int, + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., ) -> None: ... @property def schema(self) -> Schema: ... @@ -181,6 +194,10 @@ class FlightInfo(_Weakrefable): def total_records(self) -> int: ... @property def total_bytes(self) -> int: ... + @property + def ordered(self) -> bool: ... + @property + def app_metadata(self) -> bytes | str: ... 
def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi index 03a6574a1e0..a9187df0428 100644 --- a/pyarrow-stubs/_parquet.pyi +++ b/pyarrow-stubs/_parquet.pyi @@ -192,6 +192,8 @@ class ColumnChunkMetaData(_Weakrefable): def has_offset_index(self) -> bool: ... @property def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... class _SortingColumn(TypedDict): column_index: int diff --git a/pyarrow-stubs/_substrait.pyi b/pyarrow-stubs/_substrait.pyi index 46de8d4110b..ff226e9521b 100644 --- a/pyarrow-stubs/_substrait.pyi +++ b/pyarrow-stubs/_substrait.pyi @@ -1,4 +1,4 @@ -from typing import Callable +from typing import Any, Callable from ._compute import Expression from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable @@ -10,6 +10,15 @@ def run_query( use_threads: bool = True, ) -> RecordBatchReader: ... def _parse_json_plan(plan: bytes) -> Buffer: ... + +class SubstraitSchema: + schema: Schema + expression: Expression + def __init__(self, schema: Schema, expression: Expression) -> None: ... + def to_pysubstrait(self) -> Any: ... + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes) -> Schema: ... def serialize_expressions( exprs: list[Expression], names: list[str], @@ -23,6 +32,8 @@ class BoundExpressions(_Weakrefable): def schema(self) -> Schema: ... @property def expressions(self) -> dict[str, Expression]: ... + @classmethod + def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... def get_supported_functions() -> list[str]: ... diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 32aa2eafc03..b3256b71dab 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -117,7 +117,12 @@ IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar FloatScalar: TypeAlias = ( lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] ) -DecimalScalar: TypeAlias = lib.Scalar[lib.Decimal128Type] | lib.Scalar[lib.Decimal256Type] +DecimalScalar: TypeAlias = ( + lib.Scalar[lib.Decimal32Type] + | lib.Scalar[lib.Decimal64Type] + | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar BinaryScalar: TypeAlias = ( diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi index 453f48138f9..efbd05ac2fe 100644 --- a/pyarrow-stubs/pandas_compat.pyi +++ b/pyarrow-stubs/pandas_compat.pyi @@ -1,5 +1,6 @@ from typing import Any, TypedDict, TypeVar +import numpy as np import pandas as pd from pandas import DatetimeTZDtype @@ -10,6 +11,7 @@ _T = TypeVar("_T") def get_logical_type_map() -> dict[int, str]: ... def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... def get_logical_type_from_numpy(pandas_collection) -> str: ... def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... @@ -31,6 +33,7 @@ def construct_metadata( index_descriptors: list[dict], preserve_index: bool, types: list[DataType], + column_field_names: list[str] = ..., ) -> dict[bytes, bytes]: ... 
def dataframe_to_types( df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None diff --git a/pyarrow-stubs/substrait.pyi b/pyarrow-stubs/substrait.pyi index 860fe70b827..a56a8a5b40f 100644 --- a/pyarrow-stubs/substrait.pyi +++ b/pyarrow-stubs/substrait.pyi @@ -1,9 +1,12 @@ from pyarrow._substrait import ( BoundExpressions, + SubstraitSchema, deserialize_expressions, + deserialize_schema, get_supported_functions, run_query, serialize_expressions, + serialize_schema, ) __all__ = [ @@ -12,4 +15,7 @@ __all__ = [ "run_query", "deserialize_expressions", "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", ] diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi index 23f0a8984fe..ee89e497def 100644 --- a/pyarrow-stubs/types.pyi +++ b/pyarrow-stubs/types.pyi @@ -46,6 +46,8 @@ def is_date32(t: DataType) -> bool: ... def is_date64(t: DataType) -> bool: ... def is_map(t: DataType) -> bool: ... def is_decimal(t: DataType) -> bool: ... +def is_decimal32(t: DataType) -> bool: ... +def is_decimal64(t: DataType) -> bool: ... def is_decimal128(t: DataType) -> bool: ... def is_decimal256(t: DataType) -> bool: ... def is_dictionary(t: DataType) -> bool: ... diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi index 00d27837c04..c2ecf7d6b61 100644 --- a/pyarrow-stubs/util.pyi +++ b/pyarrow-stubs/util.pyi @@ -20,6 +20,8 @@ def get_contiguous_span( ) -> tuple[int, int]: ... def find_free_port() -> int: ... def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... def download_tzdata_on_windows() -> None: ... def _deprecate_api(old_name, new_name, api, next_version, type=...): ... def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... 
diff --git a/pyproject.toml b/pyproject.toml index 7343c2a6480..7fd9b038fce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "17.19" +version = "19.0" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" @@ -11,14 +11,14 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] -requires-python = ">=3.8,<4" -dependencies = ["pyarrow >=17"] +requires-python = ">=3.9,<4" +dependencies = ["pyarrow >=19"] [project.urls] homepage = "https://github.com/zen-xu/pyarrow-stubs" @@ -40,7 +40,7 @@ channels = ["conda-forge"] platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] [tool.pixi.dependencies] -python = "3.11" +python = "3.11.*" pip = "*" [tool.pixi.pypi-dependencies] From b71796af3fd3739f7d6b96bc828ebfa5f1afca2e Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 7 Apr 2025 11:15:21 +0800 Subject: [PATCH 168/231] fix: reexport new types (#199) --- pyarrow-stubs/__init__.pyi | 38 ++++++++++++++++++++++++++++++ pyarrow-stubs/__lib_pxi/array.pyi | 4 ++++ pyarrow-stubs/__lib_pxi/scalar.pyi | 11 +++++++++ pyarrow-stubs/__lib_pxi/types.pyi | 4 ++++ 4 files changed, 57 insertions(+) diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index 11c20c53042..f5a72f314dd 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -56,6 +56,8 @@ from pyarrow.lib import ( large_binary, large_string, large_utf8, + decimal32, + decimal64, decimal128, decimal256, list_, @@ -69,7 +71,11 @@ from pyarrow.lib import ( dense_union, dictionary, run_end_encoded, + json_, + uuid, fixed_shape_tensor, + bool8, + opaque, field, type_for_alias, DataType, @@ -97,6 +103,10 @@ from pyarrow.lib import ( ExtensionType, RunEndEncodedType, FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, PyExtensionType, UnknownExtensionType, register_extension_type, @@ -164,6 +174,10 @@ from pyarrow.lib import ( ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, scalar, NA, _NULL as NULL, @@ -181,6 +195,8 @@ from pyarrow.lib import ( HalfFloatScalar, FloatScalar, DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, Decimal128Scalar, Decimal256Scalar, ListScalar, @@ -208,6 +224,10 @@ from pyarrow.lib import ( UnionScalar, RunEndEncodedScalar, ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, ) # Buffers, allocation @@ -372,6 +392,8 @@ __all__ = [ "large_binary", "large_string", "large_utf8", + "decimal32", + "decimal64", "decimal128", "decimal256", "list_", @@ -385,7 +407,11 @@ __all__ = [ "dense_union", "dictionary", "run_end_encoded", + "json_", + "uuid", "fixed_shape_tensor", + "bool8", + "opaque", "field", "type_for_alias", "DataType", @@ -413,6 +439,10 @@ __all__ = [ "ExtensionType", "RunEndEncodedType", "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", "PyExtensionType", "UnknownExtensionType", "register_extension_type", @@ -478,6 +508,10 @@ __all__ = [ "Decimal256Array", "StructArray", "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", 
"RunEndEncodedArray", "FixedShapeTensorArray", "scalar", @@ -524,6 +558,10 @@ __all__ = [ "UnionScalar", "RunEndEncodedScalar", "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", "DeviceAllocationType", "Device", "MemoryManager", diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 08eb41bbcd7..754667008d0 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1680,6 +1680,10 @@ __all__ = [ "StructArray", "RunEndEncodedArray", "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", "FixedShapeTensorArray", "concat_arrays", "_empty_array", diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 7dde6b0b1d3..51b7860fbbb 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -256,6 +256,11 @@ class ExtensionScalar(Scalar[types.ExtensionType]): @staticmethod def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... +class Bool8Scalar(Scalar[types.Bool8Type]): ... +class UuidScalar(Scalar[types.UuidType]): ... +class JsonScalar(Scalar[types.JsonType]): ... +class OpaqueScalar(Scalar[types.OpaqueType]): ... + class FixedShapeTensorScalar(ExtensionScalar): def to_numpy(self) -> np.ndarray: ... def to_tensor(self) -> Tensor: ... @@ -442,6 +447,8 @@ __all__ = [ "HalfFloatScalar", "FloatScalar", "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", "Decimal128Scalar", "Decimal256Scalar", "Date32Scalar", @@ -470,5 +477,9 @@ __all__ = [ "UnionScalar", "ExtensionScalar", "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", "scalar", ] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 391bd5db9e0..8d417008780 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -694,6 +694,10 @@ __all__ = [ "BaseExtensionType", "ExtensionType", "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", "PyExtensionType", "UnknownExtensionType", "register_extension_type", From 6d726a20122ab624e242c6ab7b44240235ecd691 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 7 Apr 2025 11:32:33 +0800 Subject: [PATCH 169/231] feat: override new patterns for func repeat and nulls (#200) * fix: reexport decimal64 array and decimal128 array * feat: override new patterns for func `repeat` and `nulls` --- pyarrow-stubs/__init__.pyi | 6 ++++ pyarrow-stubs/__lib_pxi/array.pyi | 58 +++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi index f5a72f314dd..8a0d1e870c5 100644 --- a/pyarrow-stubs/__init__.pyi +++ b/pyarrow-stubs/__init__.pyi @@ -168,6 +168,8 @@ from pyarrow.lib import ( Time64Array, DurationArray, MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, Decimal128Array, Decimal256Array, StructArray, @@ -504,6 +506,8 @@ __all__ = [ "Time64Array", "DurationArray", "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", "Decimal128Array", "Decimal256Array", "StructArray", @@ -531,6 +535,8 @@ __all__ = [ "HalfFloatScalar", "FloatScalar", "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", "Decimal128Scalar", "Decimal256Scalar", "ListScalar", diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 754667008d0..fa237d7a270 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -833,6 +833,30 @@ 
def nulls( memory_pool: MemoryPool | None = None, ) -> FixedShapeTensorArray: ... @overload +def nulls( + size: int, + types: types.Bool8Type, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def nulls( + size: int, + types: types.UuidType, + memory_pool: MemoryPool | None = None, +) -> UuidArray: ... +@overload +def nulls( + size: int, + types: types.JsonType, + memory_pool: MemoryPool | None = None, +) -> JsonArray: ... +@overload +def nulls( + size: int, + types: types.OpaqueType, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray: ... +@overload def nulls( size: int, types: types.ExtensionType, @@ -891,6 +915,14 @@ def repeat( value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None ) -> DoubleArray: ... @overload +def repeat( + value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal32Array: ... +@overload +def repeat( + value: Decimal | scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal64Array: ... +@overload def repeat( value: Decimal | scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @@ -1037,6 +1069,30 @@ def repeat( memory_pool: MemoryPool | None = None, ) -> FixedShapeTensorArray: ... @overload +def repeat( + value: scalar.Bool8Scalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def repeat( + value: scalar.UuidScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UuidArray: ... +@overload +def repeat( + value: scalar.JsonScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> JsonArray: ... +@overload +def repeat( + value: scalar.OpaqueScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray: ... +@overload def repeat( value: scalar.ExtensionScalar, size: int, @@ -1660,6 +1716,8 @@ __all__ = [ "FloatArray", "DoubleArray", "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", "Decimal128Array", "Decimal256Array", "BaseListArray", From 02552b81161d19d4aa71d8656b028eefac84612b Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 7 Apr 2025 11:35:34 +0800 Subject: [PATCH 170/231] release: 19.1 (#201) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 91c3b168307..0aae52e6d1c 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1161,8 +1161,8 @@ packages: requires_python: '>=3.9' - pypi: . 
name: pyarrow-stubs - version: '19.0' - sha256: 11d773de16a71722518e21d5695a44c2193d5af6bed94c8788c6ccdfdd00a049 + version: '19.1' + sha256: 3d889ce8db1d2fb7079c6fa894e7582e1e08d9a6beed12b83ca84f2e1bdc9fbc requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 7fd9b038fce..72516b1e019 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "19.0" +version = "19.1" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 067c0427f33154a2f135f9764f531a36a7509f26 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:16:31 +0100 Subject: [PATCH 171/231] fix: Allow `Iterable[Table]` in `concat_tables` (#203) https://arrow.apache.org/docs/python/generated/pyarrow.concat_tables.html > tables : iterable of pyarrow.Table objects --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 6a262181945..73ca5eac06a 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -630,7 +630,7 @@ def table( nthreads: int | None = None, ) -> Table: ... def concat_tables( - tables: list[Table], + tables: Iterable[Table], memory_pool: MemoryPool | None = None, promote_options: Literal["none", "default", "permissive"] = "none", **kwargs, From a9d4e1ce14ac0074c614ee96a590e28a997ec0a0 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:18:43 +0100 Subject: [PATCH 172/231] fix: Allow `ChunkedArray[BooleanScalar]` in `pc.invert` (#204) Fixes https://github.com/narwhals-dev/narwhals/blob/caabc0efdef54f117c83888926860e3972ef69d5/narwhals/_arrow/series.py#L298-L299 --- pyarrow-stubs/compute.pyi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index b3256b71dab..dd2d4c28fd9 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -161,6 +161,7 @@ _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDu NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] _NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] _FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] @@ -1092,11 +1093,11 @@ def invert( ) -> lib.BooleanScalar: ... @overload def invert( - x: lib.BooleanArray, + x: _BooleanArrayT, /, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... +) -> _BooleanArrayT: ... 
@overload def invert( x: Expression, From d2b5891cfca7a53824ad995d12e66abfd30c9e0b Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:19:18 +0100 Subject: [PATCH 173/231] feat: Fully spec `TableGroupBy.aggregate` (#197) ## Related - https://arrow.apache.org/docs/python/compute.html#grouped-aggregations - https://arrow.apache.org/docs/python/generated/pyarrow.TableGroupBy.html#pyarrow.TableGroupBy.aggregate - https://github.com/apache/arrow/blob/34a984c842db42b409a1359e6e2cf167a2365a48/python/pyarrow/table.pxi#L6578-L6604 --- pyarrow-stubs/__lib_pxi/table.pyi | 70 ++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 73ca5eac06a..94680ee77e9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -30,7 +30,13 @@ from typing import ( import numpy as np import pandas as pd -from pyarrow._compute import CastOptions, FunctionOptions +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, +) from pyarrow._stubs_typing import ( Indices, Mask, @@ -57,6 +63,62 @@ from .types import _AsPyType, _BasicDataType, _DataType_CoT, _DataTypeT _Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @property def data(self) -> Self: ... @@ -640,7 +702,11 @@ class TableGroupBy: keys: str | list[str] def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... def aggregate( - self, aggregations: list[tuple[str, str]] | list[tuple[str, str, FunctionOptions]] + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], ) -> Table: ... def _table(self) -> Table: ... 
@property From 65a00f1d34007cc95b2053117ebb3e896437ffc2 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:19:43 +0100 Subject: [PATCH 174/231] fix: Add missing return type to `ChunkedArray.filter` (#205) --- pyarrow-stubs/__lib_pxi/table.pyi | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 94680ee77e9..0685b35a4cb 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -171,7 +171,9 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): def unique(self) -> ChunkedArray[_Scalar_CoT]: ... def value_counts(self) -> StructArray: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... - def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop"): ... + def filter( + self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: ... @overload def index( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], From b4233fe173ee084be9bb2691ef504dc319c81777 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:20:02 +0100 Subject: [PATCH 175/231] fix: Add relaxed final overload to logical functions (#206) Covers all of `pc.(and_ | and_kleene | and_not | and_not_kleene | or_ | or_kleene | xor)` Resolves: - https://github.com/narwhals-dev/narwhals/blob/caabc0efdef54f117c83888926860e3972ef69d5/narwhals/_arrow/series.py#L219-L233 - https://github.com/narwhals-dev/narwhals/blob/caabc0efdef54f117c83888926860e3972ef69d5/narwhals/_arrow/series.py#L662 --- pyarrow-stubs/compute.pyi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index dd2d4c28fd9..1d0f6436b12 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1079,6 +1079,14 @@ def and_( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +@overload +def and_( + x: ScalarOrArray[lib.BooleanScalar], + y: ScalarOrArray[lib.BooleanScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ScalarOrArray[lib.BooleanScalar]: ... 
and_kleene = _clone_signature(and_) and_not = _clone_signature(and_) From 60052558ea3ad9100abaea87f10813b5be771eca Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:22:14 +0100 Subject: [PATCH 176/231] fix: Allow `ChunkedArray` in `Table.set_column` (#211) Also being more consistent with `ArrayOrChunkedArray[Any]` everywhere Discovered in - https://github.com/vega/vega-datasets/blob/343b7101391a81190ba24e1e8d62a381d2fef3bd/scripts/species.py#L798-L799 --- pyarrow-stubs/__lib_pxi/table.pyi | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 0685b35a4cb..9838e0fa286 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -48,7 +48,7 @@ from pyarrow._stubs_typing import ( SupportArrowDeviceArray, SupportArrowStream, ) -from pyarrow.compute import Expression +from pyarrow.compute import ArrayOrChunkedArray, Expression from pyarrow.interchange.dataframe import _PyArrowDataFrame from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema @@ -452,7 +452,7 @@ def chunked_array( type: Literal["month_day_nano_interval"], ) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... -_ColumnT = TypeVar("_ColumnT", bound=Array | ChunkedArray) +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... @@ -474,7 +474,7 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): @classmethod def from_pydict( cls, - mapping: Mapping[str, ChunkedArray | Array | list | np.ndarray], + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list | np.ndarray], schema: Schema | None = None, metadata: Mapping | None = None, ) -> Self: ... @@ -507,9 +507,11 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def remove_column(self, i: int) -> Self: ... def drop_columns(self, columns: str | list[str]) -> Self: ... def add_column( - self, i: int, field_: str | Field, column: ChunkedArray | Array | list + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list ) -> Self: ... - def append_column(self, field_: str | Field, column: ChunkedArray | Array | list) -> Self: ... class RecordBatch(_Tabular[Array]): def validate(self, *, full: bool = False) -> None: ... @@ -588,7 +590,7 @@ JoinType: TypeAlias = Literal[ "full outer", ] -class Table(_Tabular[ChunkedArray]): +class Table(_Tabular[ChunkedArray[Any]]): def validate(self, *, full=False) -> None: ... def slice(self, offset=0, length=None) -> Self: ... def select(self, columns: list[str] | Indices) -> Self: ... @@ -613,7 +615,7 @@ class Table(_Tabular[ChunkedArray]): @classmethod def from_arrays( cls, - arrays: Collection[Array | ChunkedArray], + arrays: Collection[ArrayOrChunkedArray[Any]], names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, @@ -633,7 +635,9 @@ class Table(_Tabular[ChunkedArray]): def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... def get_total_buffer_size(self) -> int: ... def __sizeof__(self) -> int: ... - def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... 
+ def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... @overload def rename_columns(self, names: list[str]) -> Self: ... @overload @@ -683,7 +687,7 @@ def table( ) -> Table: ... @overload def table( - data: Collection[Array | ChunkedArray] + data: Collection[ArrayOrChunkedArray[Any]] | pd.DataFrame | SupportArrowArray | SupportArrowStream From 483ce12bfb8c04329efda62615e3ce03f1e57249 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 14 Apr 2025 03:23:02 +0100 Subject: [PATCH 177/231] chore: Ignore `fsspec` `[import-untyped]` (#210) ```py _fs.pyi:18: error: Skipping analyzing "fsspec": module is installed, but missing library stubs or py.typed marker [import-untyped] _fs.pyi:18: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports Found 1 error in 1 file (checked 64 source files) ``` - https://github.com/fsspec/filesystem_spec/issues/625 - https://github.com/fsspec/filesystem_spec/pull/1676 --- pyarrow-stubs/_fs.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index c17d987d16d..67d7a601123 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -15,7 +15,7 @@ else: from typing import Union, overload -from fsspec import AbstractFileSystem +from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable From f39e9fd4c49afae45e852d1b3e835ac5925afe3f Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 15 Apr 2025 06:24:02 +0100 Subject: [PATCH 178/231] feat: Convert `types.is_*` into `TypeIs` guards (#215) * chore: Add `types.__all__` * feat: Convert `types._is_*` into `TypeIs` guards I've been using this for a little while, but makes more sense to live in the stubs https://github.com/narwhals-dev/narwhals/blob/16427440e6d74939c403083b52ce3fb0af7d63c7/narwhals/_arrow/utils.py#L44-L67 --- pyarrow-stubs/types.pyi | 245 +++++++++++++++++++++++++++++++--------- 1 file changed, 192 insertions(+), 53 deletions(-) diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi index ee89e497def..6c85d3b0d8c 100644 --- a/pyarrow-stubs/types.pyi +++ b/pyarrow-stubs/types.pyi @@ -1,55 +1,194 @@ -from pyarrow.lib import DataType +import sys -def is_null(t: DataType) -> bool: ... -def is_boolean(t: DataType) -> bool: ... -def is_integer(t: DataType) -> bool: ... -def is_signed_integer(t: DataType) -> bool: ... -def is_unsigned_integer(t: DataType) -> bool: ... -def is_int8(t: DataType) -> bool: ... -def is_int16(t: DataType) -> bool: ... -def is_int32(t: DataType) -> bool: ... -def is_int64(t: DataType) -> bool: ... -def is_uint8(t: DataType) -> bool: ... -def is_uint16(t: DataType) -> bool: ... -def is_uint32(t: DataType) -> bool: ... -def is_uint64(t: DataType) -> bool: ... -def is_floating(t: DataType) -> bool: ... -def is_float16(t: DataType) -> bool: ... -def is_float32(t: DataType) -> bool: ... -def is_float64(t: DataType) -> bool: ... -def is_list(t: DataType) -> bool: ... -def is_large_list(t: DataType) -> bool: ... -def is_fixed_size_list(t: DataType) -> bool: ... -def is_list_view(t: DataType) -> bool: ... -def is_large_list_view(t: DataType) -> bool: ... -def is_struct(t: DataType) -> bool: ... -def is_union(t: DataType) -> bool: ... -def is_nested(t: DataType) -> bool: ... -def is_run_end_encoded(t: DataType) -> bool: ... -def is_temporal(t: DataType) -> bool: ... 
-def is_timestamp(t: DataType) -> bool: ... -def is_duration(t: DataType) -> bool: ... -def is_time(t: DataType) -> bool: ... -def is_time32(t: DataType) -> bool: ... -def is_time64(t: DataType) -> bool: ... -def is_binary(t: DataType) -> bool: ... -def is_large_binary(t: DataType) -> bool: ... -def is_unicode(t: DataType) -> bool: ... -def is_string(t: DataType) -> bool: ... -def is_large_unicode(t: DataType) -> bool: ... -def is_large_string(t: DataType) -> bool: ... -def is_fixed_size_binary(t: DataType) -> bool: ... -def is_binary_view(t: DataType) -> bool: ... -def is_string_view(t: DataType) -> bool: ... -def is_date(t: DataType) -> bool: ... -def is_date32(t: DataType) -> bool: ... -def is_date64(t: DataType) -> bool: ... -def is_map(t: DataType) -> bool: ... -def is_decimal(t: DataType) -> bool: ... -def is_decimal32(t: DataType) -> bool: ... -def is_decimal64(t: DataType) -> bool: ... -def is_decimal128(t: DataType) -> bool: ... -def is_decimal256(t: DataType) -> bool: ... -def is_dictionary(t: DataType) -> bool: ... -def is_interval(t: DataType) -> bool: ... +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + Uint8Type, + Uint16Type, + Uint32Type, + Uint64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = Uint8Type | Uint16Type | Uint32Type | Uint64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | _Union +) + +def is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[Uint8Type]: ... 
+def is_uint16(t: DataType) -> TypeIs[Uint16Type]: ... +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... +def is_uint64(t: DataType) -> TypeIs[Uint64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... def is_primitive(t: DataType) -> bool: ... 
+ +__all__ = [ + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] From 0439ff36561fd8c48afa46b8cc3aa4e3464c729c Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 15 Apr 2025 06:24:44 +0100 Subject: [PATCH 179/231] fix: Resolve `bit_wise_and` overlaps (#214) Fixes 3 errors: ```py compute.pyi:608:5 - error: Overload 1 for "bit_wise_and" overlaps overload 4 and returns an incompatible type (reportOverlappingOverload) compute.pyi:608:5 - error: Overload 1 for "bit_wise_and" overlaps overload 5 and returns an incompatible type (reportOverlappingOverload) compute.pyi:620:5 - error: Overload 3 for "bit_wise_and" will never be used because its parameters overlap overload 1 (reportOverlappingOverload) ``` --- pyarrow-stubs/compute.pyi | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 1d0f6436b12..4bed5885cf8 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -617,19 +617,35 @@ def bit_wise_and( ) -> _NumericArrayT: ... @overload def bit_wise_and( - x: NumericScalar, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> NumericScalar: ... + x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... @overload def bit_wise_and( - x: NumericArray | NumericScalar, - y: NumericArray | NumericScalar, + x: Expression, + y: NumericScalar | ArrayOrChunkedArray[NumericScalar], /, *, memory_pool: lib.MemoryPool | None = None, -) -> NumericArray: ... +) -> Expression: ... @overload def bit_wise_and( - x: Expression | Any, y: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None + x: NumericScalar | ArrayOrChunkedArray[NumericScalar], + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... 
@overload def bit_wise_not( From 608fb8db7c559171cab170bd6e627b60a04d48d0 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 15 Apr 2025 06:26:33 +0100 Subject: [PATCH 180/231] fix: Resolve `list_*` overlapping overloads (#213) * fix: Resolve `list_value_length` overlaps * fix: Resolve `list_element` overlaps * fix: Resolve `list_(flatten|slice|parent_indices)` overlaps An improvement, but still not that accurate --- pyarrow-stubs/compute.pyi | 58 +++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 4bed5885cf8..fa270b7ac3c 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -132,12 +132,10 @@ BinaryScalar: TypeAlias = ( ) StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar +_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] +_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] ListScalar: TypeAlias = ( - lib.ListScalar[_DataTypeT] - | lib.LargeListScalar[_DataTypeT] - | lib.ListViewScalar[_DataTypeT] - | lib.LargeListViewScalar[_DataTypeT] - | lib.FixedSizeListScalar[_DataTypeT, Any] + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] ) TemporalScalar: TypeAlias = ( lib.Date32Scalar @@ -178,6 +176,9 @@ _StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArr _TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] _TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] # =============================== 1. Aggregation =============================== # ========================= 1.1 functions ========================= @@ -1962,19 +1963,26 @@ def if_else( @overload def list_value_length( - lists: lib.ListArray | lib.ListViewArray | lib.FixedSizeListArray | lib.ChunkedArray, + lists: _ListArray[Any], /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int32Array: ... @overload def list_value_length( - lists: lib.LargeListArray | lib.LargeListViewArray | lib.ChunkedArray, + lists: _LargeListArray[Any], /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Array: ... @overload +def list_value_length( + lists: ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array: ... +@overload def list_value_length( lists: Expression, /, @@ -2699,12 +2707,32 @@ def sort_indices( # ========================= 3.6 Structural transforms ========================= @overload def list_element( - lists: Expression, index, /, *, memory_pool: lib.MemoryPool | None = None + lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... @overload def list_element( - lists, index, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.ListArray: ... + lists: lib.Array[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... 
+@overload +def list_element( + lists: lib.ChunkedArray[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +@overload +def list_element( + lists: ListScalar[_DataTypeT], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _DataTypeT: ... @overload def list_flatten( lists: Expression, @@ -2716,20 +2744,20 @@ def list_flatten( ) -> Expression: ... @overload def list_flatten( - lists, + lists: ArrayOrChunkedArray[ListScalar[Any]], /, recursive: bool = False, *, options: ListFlattenOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray: ... +) -> lib.ListArray[Any]: ... @overload def list_parent_indices( lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... @overload def list_parent_indices( - lists, /, *, memory_pool: lib.MemoryPool | None = None + lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... @overload def list_slice( @@ -2745,7 +2773,7 @@ def list_slice( ) -> Expression: ... @overload def list_slice( - lists, + lists: ArrayOrChunkedArray[Any], /, start: int, stop: int | None = None, @@ -2754,7 +2782,7 @@ def list_slice( *, options: ListSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray: ... +) -> lib.ListArray[Any]: ... def map_lookup( container, /, From 52a68a7b4c39a989bc19b53bb629dc3225eb5e66 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 15 Apr 2025 06:27:09 +0100 Subject: [PATCH 181/231] fix: Include `VarianceOptions` in `TableGroupBy.aggregate` (#212) - Follow-up to #197 - Noticed while writing up (https://github.com/narwhals-dev/narwhals/issues/2385) - We already use it for `std`, `var` in https://github.com/narwhals-dev/narwhals/blob/16427440e6d74939c403083b52ce3fb0af7d63c7/narwhals/_arrow/group_by.py#L81-L82 --- pyarrow-stubs/__lib_pxi/table.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 9838e0fa286..f539fb941f4 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -36,6 +36,7 @@ from pyarrow._compute import ( FunctionOptions, ScalarAggregateOptions, TDigestOptions, + VarianceOptions, ) from pyarrow._stubs_typing import ( Indices, @@ -111,7 +112,7 @@ _AggregationPrefixed: TypeAlias = Literal[ ] Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed AggregateOptions: TypeAlias = ( - ScalarAggregateOptions | CountOptions | TDigestOptions | FunctionOptions + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions ) UnarySelector: TypeAlias = str From 1d53a6c0798f8c45859f717ccf83c2fcab961e52 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 13:27:34 +0800 Subject: [PATCH 182/231] [pre-commit.ci] pre-commit autoupdate (#202) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.2 → v0.11.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.2...v0.11.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
eca0c65f212..53f8509f692 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.2 + rev: v0.11.5 hooks: - id: ruff args: [--fix] From 2603c84b2181dd0d55883d63a772140dc1b7e315 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 15 Apr 2025 06:38:38 +0100 Subject: [PATCH 183/231] fix: Resolve `Scalar.as_py` warnings for `DictionaryType` (#207) > scalar.pyi:75:20 - warning: TypeVar "_AsPyTypeK" appears only once in generic function signature > Use "object" instead (reportInvalidTypeVarUse) > scalar.pyi:85:20 - warning: TypeVar "_AsPyTypeK" appears only once in generic function signature > Use "object" instead (reportInvalidTypeVarUse) Instead just using `int`, which should be all that is possible from: https://github.com/zen-xu/pyarrow-stubs/blob/02552b81161d19d4aa71d8656b028eefac84612b/pyarrow-stubs/__lib_pxi/types.pyi#L154-L164 https://github.com/zen-xu/pyarrow-stubs/blob/02552b81161d19d4aa71d8656b028eefac84612b/pyarrow-stubs/__lib_pxi/types.pyi#L63-L70 --- pyarrow-stubs/__lib_pxi/scalar.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 51b7860fbbb..0daa144ffc5 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -72,7 +72,7 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] ] ], - ) -> list[dict[_AsPyTypeK, _AsPyTypeV]]: ... + ) -> list[dict[int, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[ @@ -82,7 +82,7 @@ class Scalar(_Weakrefable, Generic[_DataType_CoT]): @overload def as_py( self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], - ) -> list[dict[_AsPyTypeK, Any]]: ... + ) -> list[dict[int, Any]]: ... @overload def as_py( self: Scalar[types.StructType], From d79fb2671d49e595f4b8ab5c94c7fe9b77de8288 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 17 Apr 2025 06:21:39 +0100 Subject: [PATCH 184/231] fix: Add default to `pc.sort_indices` (#216) * fix: Add default to `pc.sort_indices` Fixes https://github.com/narwhals-dev/narwhals/pull/2390#discussion_r2046472574 Default is specified in https://arrow.apache.org/docs/python/generated/pyarrow.compute.sort_indices.html * refactor: Reuse some aliases --- pyarrow-stubs/compute.pyi | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index fa270b7ac3c..c1f9c125386 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -82,6 +82,8 @@ from pyarrow._compute import register_aggregate_function as register_aggregate_f from pyarrow._compute import register_scalar_function as register_scalar_function from pyarrow._compute import register_tabular_function as register_tabular_function from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement from pyarrow._stubs_typing import ArrayLike, ScalarLike from . 
import lib @@ -2617,9 +2619,9 @@ def indices_nonzero( def array_sort_indices( array: lib.Array | lib.ChunkedArray, /, - order: Literal["ascending", "descending"] = "ascending", + order: _Order = "ascending", *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: ArraySortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... @@ -2627,9 +2629,9 @@ def array_sort_indices( def array_sort_indices( array: Expression, /, - order: Literal["ascending", "descending"] = "ascending", + order: _Order = "ascending", *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: ArraySortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... @@ -2639,7 +2641,7 @@ def partition_nth_indices( /, pivot: int, *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: PartitionNthOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... @@ -2649,16 +2651,16 @@ def partition_nth_indices( /, pivot: int, *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: PartitionNthOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... def rank( input: lib.Array | lib.ChunkedArray, /, - sort_keys: Literal["ascending", "descending"] = "ascending", + sort_keys: _Order = "ascending", *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", tiebreaker: Literal["min", "max", "first", "dense"] = "first", options: RankOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -2668,7 +2670,7 @@ def select_k_unstable( input: lib.Array | lib.ChunkedArray, /, k: int, - sort_keys: list[tuple[str, Literal["ascending", "descending"]]], + sort_keys: list[tuple[str, _Order]], *, options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -2678,7 +2680,7 @@ def select_k_unstable( input: Expression, /, k: int, - sort_keys: list[tuple[str, Literal["ascending", "descending"]]], + sort_keys: list[tuple[str, _Order]], *, options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, @@ -2687,9 +2689,9 @@ def select_k_unstable( def sort_indices( input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, /, - sort_keys: Sequence[tuple[str, Literal["ascending", "descending"]]], + sort_keys: Sequence[tuple[str, _Order]] = (), *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: SortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.UInt64Array: ... @@ -2697,9 +2699,9 @@ def sort_indices( def sort_indices( input: Expression, /, - sort_keys: Sequence[tuple[str, Literal["ascending", "descending"]]], + sort_keys: Sequence[tuple[str, _Order]] = (), *, - null_placement: Literal["at_start", "at_end"] = "at_end", + null_placement: _Placement = "at_end", options: SortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... 
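
The following sketch is not part of the patch series; it only illustrates the call patterns the updated `sort_indices` stubs above are meant to accept. The table and column names are hypothetical.

```py
import pyarrow as pa
import pyarrow.compute as pc

# Hypothetical data, purely for illustration.
tbl = pa.table({"a": [3, 1, None, 2], "b": ["x", "y", "z", "w"]})

# `sort_keys` now defaults to (), so a column can be sorted directly;
# under the stubs above the result is typed as lib.UInt64Array.
ascending = pc.sort_indices(tbl.column("a"))

# Table input still takes ("name", order) pairs, where the order and
# null-placement literals are covered by the `_Order` / `_Placement` aliases.
by_a_desc = pc.sort_indices(
    tbl, sort_keys=[("a", "descending")], null_placement="at_start"
)
```
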
From f899bb35e10b36f7906a728e9f8acf3e0a1f9f64 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 20 Apr 2025 04:39:27 +0100 Subject: [PATCH 185/231] fix: Allow `list_size` with `Field` in `pa.list_` (#218) Closes #217 --- pyarrow-stubs/__lib_pxi/types.pyi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 8d417008780..69e882638fd 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -426,13 +426,13 @@ large_utf8 = large_string def binary_view() -> BinaryViewType: ... def string_view() -> StringViewType: ... @overload -def list_(value_type: Field[_DataTypeT]) -> ListType[_DataTypeT]: ... +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... +) -> ListType[_DataTypeT]: ... @overload -def list_(value_type: _DataTypeT) -> ListType[_DataTypeT]: ... -@overload -def list_(value_type: _DataTypeT, list_size: Literal[-1]) -> ListType[_DataTypeT]: ... # type: ignore[overload-overlap] -@overload -def list_(value_type: _DataTypeT, list_size: _Size) -> FixedSizeListType[_DataTypeT, _Size]: ... +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size +) -> FixedSizeListType[_DataTypeT, _Size]: ... @overload def large_list(value_type: Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... @overload From e2ad3c5c67bd7b88212ef00c0b1640d8e76e6eb3 Mon Sep 17 00:00:00 2001 From: Tom Crasset <25140344+tcrasset@users.noreply.github.com> Date: Tue, 29 Apr 2025 04:30:08 +0200 Subject: [PATCH 186/231] allow `Table` or `RecordBatch` for dataset (#222) allow source argument pyarrow.dataset.dataset() to be RecordBatch | Table --- pyarrow-stubs/dataset.pyi | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index e9da4ec22b1..98f1a38aa85 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -195,6 +195,17 @@ def dataset( exclude_invalid_files: bool | None = None, ignore_prefixes: list[str] | None = None, ) -> InMemoryDataset: ... +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... 
def write_dataset( data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], base_dir: StrPath, From 28a54cce1440cb744f36ba7e88cd5a8a72c6e66f Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 30 Apr 2025 03:19:46 +0100 Subject: [PATCH 187/231] refactor: Simplify `types` overloads (#219) * fix: `binary` overlap * fix: Simplify list constructors, `_Ordered` * refactor: Use `_Tz` default --- pyarrow-stubs/__lib_pxi/types.pyi | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 69e882638fd..b3d0ecf11fa 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -164,7 +164,7 @@ _IndexT = TypeVar( ) _BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) _ValueT = TypeVar("_ValueT", bound=DataType) -_Ordered = TypeVar("_Ordered", bound=Literal[True, False], default=Literal[False]) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): @property @@ -380,7 +380,7 @@ def uint64() -> Uint64Type: ... def tzinfo_to_string(tz: dt.tzinfo) -> str: ... def string_to_tzinfo(name: str) -> dt.tzinfo: ... @overload -def timestamp(unit: _Unit) -> TimestampType[_Unit, None]: ... +def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... @overload def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: ... @@ -413,9 +413,7 @@ def string() -> StringType: ... utf8 = string @overload -def binary() -> BinaryType: ... -@overload -def binary(length: Literal[-1]) -> BinaryType: ... # type: ignore[overload-overlap] +def binary(length: Literal[-1] = ...) -> BinaryType: ... @overload def binary(length: int) -> FixedSizeBinaryType: ... def large_binary() -> LargeBinaryType: ... @@ -433,20 +431,13 @@ def list_( def list_( value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size ) -> FixedSizeListType[_DataTypeT, _Size]: ... +def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... +def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ... +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT], +) -> LargeListViewType[_DataTypeT]: ... @overload -def large_list(value_type: Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... -@overload -def large_list(value_type: _DataTypeT) -> LargeListType[_DataTypeT]: ... -@overload -def list_view(value_type: Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ... -@overload -def list_view(value_type: _DataTypeT) -> ListViewType[_DataTypeT]: ... -@overload -def large_list_view(value_type: Field[_DataTypeT]) -> LargeListViewType[_DataTypeT]: ... -@overload -def large_list_view(value_type: _DataTypeT) -> LargeListViewType[_DataTypeT]: ... -@overload -def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, Literal[False]]: ... +def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... @overload def map_( key_type: _K, item_type: _ValueT, key_sorted: _Ordered @@ -454,7 +445,7 @@ def map_( @overload def dictionary( index_type: _IndexT, value_type: _BasicValueT -) -> DictionaryType[_IndexT, _BasicValueT, Literal[False]]: ... +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... 
@overload def dictionary( index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered From e0758efcc6fee2c2924b77fcf9d35f426534fb4a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 30 Apr 2025 14:26:49 +0800 Subject: [PATCH 188/231] fix: iter ChunkedArray should return scalar value (#224) --- pyarrow-stubs/__lib_pxi/table.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index f539fb941f4..e07961ecf7b 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -203,7 +203,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @property def chunks(self) -> list[Array[_Scalar_CoT]]: ... def iterchunks(self) -> Generator[Array[_Scalar_CoT], None, None]: ... - def __iter__(self) -> Iterator[Array[_Scalar_CoT]]: ... + def __iter__(self) -> Iterator[_Scalar_CoT]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], ) -> list[_AsPyType | None]: ... From 8fe826acfabb49a9fb84f9568d06c998a0c0455f Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 30 Apr 2025 14:33:10 +0800 Subject: [PATCH 189/231] release: 19.2 (#225) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 0aae52e6d1c..5ca76be9103 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1161,8 +1161,8 @@ packages: requires_python: '>=3.9' - pypi: . name: pyarrow-stubs - version: '19.1' - sha256: 3d889ce8db1d2fb7079c6fa894e7582e1e08d9a6beed12b83ca84f2e1bdc9fbc + version: '19.2' + sha256: 66bdbf64eaee62ff3dbf4f22f8aec74f3c5c4450bb382ca568182752046d42f4 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 72516b1e019..e4ec29da658 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "19.1" +version = "19.2" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 59bf9e0796af47f6f5b3fd1f8b1994622c04d68e Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 9 May 2025 07:16:05 +0100 Subject: [PATCH 190/231] fix: Add missing `DictionaryArray` methods/properties (#226) ## Docs - https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray.dictionary - https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray.indices - https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray.dictionary_decode - https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html#pyarrow.DictionaryArray.dictionary_encode ## Fixes - https://github.com/narwhals-dev/narwhals/blob/c23e56c56630761f0fbc58b575a1c987e57d58d5/narwhals/_arrow/series.py#L787-L798 - https://github.com/narwhals-dev/narwhals/blob/c23e56c56630761f0fbc58b575a1c987e57d58d5/narwhals/_arrow/series_cat.py#L14-L18 --- pyarrow-stubs/__lib_pxi/array.pyi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index fa237d7a270..e17a4489a9c 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1581,6 +1581,12 @@ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... 
class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... # type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... @staticmethod def from_buffers( # type: ignore[override] type: _BasicValueT, From a4aa0c8ccd6aa99cb737a37d2accbc655392ef61 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 May 2025 15:47:06 +0800 Subject: [PATCH 191/231] chore: use pyright as static type checker (#227) * use pyright as static type checker * make pyright happy --- .github/workflows/lint.yaml | 20 ++- .pre-commit-config.yaml | 8 +- pixi.lock | 166 ++++++++++++++---------- pyarrow-stubs/__lib_pxi/array.pyi | 13 +- pyarrow-stubs/__lib_pxi/pandas_shim.pyi | 1 - pyarrow-stubs/__lib_pxi/scalar.pyi | 1 - pyarrow-stubs/__lib_pxi/table.pyi | 2 - pyarrow-stubs/__lib_pxi/tensor.pyi | 4 +- pyarrow-stubs/_stubs_typing.pyi | 2 +- pyarrow-stubs/compute.pyi | 1 - pyproject.toml | 11 +- 11 files changed, 134 insertions(+), 95 deletions(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 7f6e71e825c..9953e3a2db3 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -10,8 +10,8 @@ on: - synchronize jobs: - taplo-lint: - name: taplo lint + taplo: + name: taplo runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -19,3 +19,19 @@ jobs: with: version: "0.9.3" - run: taplo fmt --check + + pyright: + name: pyright + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: "3.11" + cache: pyright + - run: | + python -m venv .venv + source .venv/bin/activate + pip install pandas numpy scipy sparse + - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH + - uses: jakebailey/pyright-action@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 53f8509f692..8e90c74f21c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,10 @@ ci: autofix_prs: false + skip: [pyright] default_language_version: python: python3.11 + node: 23.9.0 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -25,7 +27,7 @@ repos: args: [--fix] - id: ruff-format - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.400 hooks: - - id: mypy + - id: pyright diff --git a/pixi.lock b/pixi.lock index 5ca76be9103..bc40f63a900 100644 --- a/pixi.lock +++ b/pixi.lock @@ -41,11 +41,11 @@ environments: - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - pypi: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl @@ -65,6 +65,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl @@ -102,11 +103,11 @@ environments: - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl - pypi: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl @@ -126,6 +127,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl - pypi: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl @@ -163,11 +165,11 @@ environments: - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl @@ -187,6 +189,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl @@ -227,11 +230,11 @@ environments: - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl - pypi: 
https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl @@ -249,6 +252,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl @@ -829,6 +833,26 @@ packages: purls: [] size: 55476 timestamp: 1727963768015 +- pypi: https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl + name: llvmlite + version: 0.44.0 + sha256: d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955 + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: llvmlite + version: 0.44.0 + sha256: c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1 + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl + name: llvmlite + version: 0.44.0 + sha256: eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl + name: llvmlite + version: 0.44.0 + sha256: ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427 + requires_python: '>=3.10' - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl name: matplotlib-inline version: 0.1.7 @@ -836,63 +860,6 @@ packages: requires_dist: - traitlets requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/18/0a/70de7c97a86cb85535077ab5cef1cbc4e2812fd2e9cc21d78eb561a6b80f/mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl - name: mypy - version: 1.12.1 - sha256: 1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735 - requires_dist: - - typing-extensions>=4.6.0 - - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' - - psutil>=4.0 ; extra == 
'dmypy' - - pip ; extra == 'install-types' - - setuptools>=50 ; extra == 'mypyc' - - lxml ; extra == 'reports' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/48/41/1686f37d09c915dfc5b683e20cc99dabac199900b5ca6d22747b99ddcb50/mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl - name: mypy - version: 1.12.1 - sha256: a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6 - requires_dist: - - typing-extensions>=4.6.0 - - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' - - psutil>=4.0 ; extra == 'dmypy' - - pip ; extra == 'install-types' - - setuptools>=50 ; extra == 'mypyc' - - lxml ; extra == 'reports' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/54/55/710d082e91a2ccaea21214229b11f9215a9d22446f949491b5457655e82b/mypy-1.12.1-cp311-cp311-win_amd64.whl - name: mypy - version: 1.12.1 - sha256: 673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811 - requires_dist: - - typing-extensions>=4.6.0 - - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' - - psutil>=4.0 ; extra == 'dmypy' - - pip ; extra == 'install-types' - - setuptools>=50 ; extra == 'mypyc' - - lxml ; extra == 'reports' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/c0/97/9ed6d4834d7549936ab88533b302184fb568a0940c4000d2aaee6dc07112/mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl - name: mypy - version: 1.12.1 - sha256: 02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66 - requires_dist: - - typing-extensions>=4.6.0 - - mypy-extensions>=1.0.0 - - tomli>=1.1.0 ; python_full_version < '3.11' - - psutil>=4.0 ; extra == 'dmypy' - - pip ; extra == 'install-types' - - setuptools>=50 ; extra == 'mypyc' - - lxml ; extra == 'reports' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl - name: mypy-extensions - version: 1.0.0 - sha256: 4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d - requires_python: '>=3.5' - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda sha256: 6a1d5d8634c1a07913f1c525db6455918cbc589d745fac46d9d6e30340c8731a md5: 70caf8bb6cf39a0b6b7efc885f51c0fe @@ -946,6 +913,38 @@ packages: version: 20.18.0 sha256: 51c0cecb429a111351a54346909e672a57b96233a363c79cc0a2bbdbfa397304 requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl + name: numba + version: 0.61.2 + sha256: 76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1 + requires_dist: + - llvmlite>=0.44.0.dev0,<0.45 + - numpy>=1.24,<2.3 + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl + name: numba + version: 0.61.2 + sha256: efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 + requires_dist: + - llvmlite>=0.44.0.dev0,<0.45 + - numpy>=1.24,<2.3 + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl + name: numba + version: 0.61.2 + sha256: 49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b + requires_dist: + - llvmlite>=0.44.0.dev0,<0.45 + - numpy>=1.24,<2.3 + 
requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + name: numba + version: 0.61.2 + sha256: 3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60 + requires_dist: + - llvmlite>=0.44.0.dev0,<0.45 + - numpy>=1.24,<2.3 + requires_python: '>=3.10' - pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl name: numpy version: 2.1.2 @@ -1162,7 +1161,7 @@ packages: - pypi: . name: pyarrow-stubs version: '19.2' - sha256: 66bdbf64eaee62ff3dbf4f22f8aec74f3c5c4450bb382ca568182752046d42f4 + sha256: 5ad928ccb7c6a7b4338efbbd55445cb15619ac81d1d5e78ed4e313b544220466 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' @@ -1524,6 +1523,41 @@ packages: version: 1.16.0 sha256: 8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' +- pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl + name: sparse + version: 0.16.0 + sha256: 25d4463cf36315ee16a19b6951f1d6b8e9128a07dafd58f846eb6dfb4cd5b9d8 + requires_dist: + - numpy>=1.17 + - numba>=0.49 + - mkdocs-material ; extra == 'docs' + - mkdocstrings[python] ; extra == 'docs' + - mkdocs-gen-files ; extra == 'docs' + - mkdocs-literate-nav ; extra == 'docs' + - mkdocs-section-index ; extra == 'docs' + - mkdocs-jupyter ; extra == 'docs' + - sparse[extras] ; extra == 'docs' + - dask[array] ; extra == 'extras' + - sparse[finch] ; extra == 'extras' + - scipy ; extra == 'extras' + - scikit-learn ; extra == 'extras' + - networkx ; extra == 'extras' + - sparse[extras] ; extra == 'tests' + - pytest>=3.5 ; extra == 'tests' + - pytest-cov ; extra == 'tests' + - pytest-xdist ; extra == 'tests' + - pre-commit ; extra == 'tests' + - pytest-codspeed ; extra == 'tests' + - sparse[tests] ; extra == 'tox' + - tox ; extra == 'tox' + - sparse[tests] ; extra == 'notebooks' + - nbmake ; extra == 'notebooks' + - matplotlib ; extra == 'notebooks' + - sparse[docs,mlir,notebooks,tox] ; extra == 'all' + - matrepr ; extra == 'all' + - finch-tensor>=0.2.10 ; extra == 'finch' + - finch-mlir>=0.0.2 ; extra == 'mlir' + requires_python: '>=3.10' - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl name: stack-data version: 0.6.3 diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index e17a4489a9c..3adeb4efadb 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1,5 +1,3 @@ -# mypy: disable-error-code="overload-overlap,misc,type-arg" - import datetime as dt import sys @@ -63,7 +61,6 @@ from .types import ( _IndexT, _RunEndType, _Size, - _ValueT, ) _T = TypeVar("_T") @@ -1627,25 +1624,25 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal def from_arrays( run_ends: Int16Array, values: Array, - type: _ValueT | None = None, + type: DataType | None = None, ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... @overload @staticmethod def from_arrays( run_ends: Int32Array, values: Array, - type: _ValueT | None = None, + type: DataType | None = None, ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... 
@overload @staticmethod def from_arrays( run_ends: Int64Array, values: Array, - type: _ValueT | None = None, + type: DataType | None = None, ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... @staticmethod - def from_buffers( # type: ignore[override] - type: _ValueT, + def from_buffers( + type: DataType, length: int, buffers: list[Buffer], null_count: int = -1, diff --git a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi index 1a57d7ca238..1eb7cdd9687 100644 --- a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi +++ b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi @@ -1,4 +1,3 @@ -# mypy: disable-error-code="name-defined" from types import ModuleType from typing import Any, Iterable, TypeGuard diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 0daa144ffc5..c6d33e7ac3f 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -1,4 +1,3 @@ -# mypy: disable-error-code="overload-overlap,misc,type-arg" import collections.abc import datetime as dt import sys diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index e07961ecf7b..983abe74edc 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -1,5 +1,3 @@ -# mypy: disable-error-code="overload-overlap,type-arg,misc" - import datetime as dt import sys diff --git a/pyarrow-stubs/__lib_pxi/tensor.pyi b/pyarrow-stubs/__lib_pxi/tensor.pyi index 01a1ea94e98..f59a2891306 100644 --- a/pyarrow-stubs/__lib_pxi/tensor.pyi +++ b/pyarrow-stubs/__lib_pxi/tensor.pyi @@ -1,5 +1,3 @@ -# mypy: disable-error-code="import-untyped" - import sys if sys.version_info >= (3, 11): @@ -10,8 +8,8 @@ else: import numpy as np from pyarrow.lib import _Weakrefable -from pydata.sparse import COO # type: ignore[import-not-found] from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO class Tensor(_Weakrefable): @classmethod diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi index 6c25aa214ca..c259513f1ea 100644 --- a/pyarrow-stubs/_stubs_typing.pyi +++ b/pyarrow-stubs/_stubs_typing.pyi @@ -29,7 +29,7 @@ Compression: TypeAlias = Literal[ NullEncoding: TypeAlias = Literal["mask", "encode"] NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray -Indices: TypeAlias = Sequence[int] | NDArray[np.integer] | IntegerArray +Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray PyScalar: TypeAlias = ( bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta ) diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index c1f9c125386..10114e4ea7b 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -1,4 +1,3 @@ -# mypy: disable-error-code="misc,type-var,var-annotated" # ruff: noqa: I001 from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence from collections.abc import Callable diff --git a/pyproject.toml b/pyproject.toml index e4ec29da658..69eba429636 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,12 +48,12 @@ pyarrow-stubs = { path = ".", editable = true } ipython = "*" scipy = "*" pre-commit = "*" -mypy = ">=1.11" ruff = ">=0.5" types-cffi = "*" pandas-stubs = "*" hatchling = "*" fsspec = "*" +sparse = "*" pyright = { version = ">=1.1.385,<2", extras = ["nodejs"] } [tool.pixi.tasks] @@ -85,9 +85,6 @@ lines-between-types = 1 [tool.ruff.format] docstring-code-format = 
true -[tool.mypy] -explicit_package_bases = true -files = "pyarrow-stubs" -namespace_packages = true -show_error_codes = true -disable_error_code = ["overload-overlap", "import-not-found"] +[tool.pyright] +typeCheckingMode = "basic" +reportMissingImports = false From dcadf9769498bec4ce1cba56b13135a04c88b72d Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 May 2025 15:53:38 +0800 Subject: [PATCH 192/231] fix: fix pyright action (#229) fix github ci --- .github/workflows/lint.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 9953e3a2db3..7f437d4532c 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -28,7 +28,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: "3.11" - cache: pyright + cache: pip - run: | python -m venv .venv source .venv/bin/activate From e8a601482b5362a145c3cb3409c960780ac57ec0 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 9 May 2025 09:27:01 +0100 Subject: [PATCH 193/231] fix: Match runtime behavior of `(Table|RecordBatch).select` (#221) * fix: Match runtime behavior of `(Table|RecordBatch).select` ## Resolves - https://github.com/MarcoGorelli/narwhals/blob/5b02b592183b8d39e2d32e0aedd6c234bb22d405/narwhals/_arrow/dataframe.py#L305-L307 - https://github.com/MarcoGorelli/narwhals/blob/5b02b592183b8d39e2d32e0aedd6c234bb22d405/narwhals/_arrow/dataframe.py#L285-L294 ##Description Following up on what I thought was a simple stub issue, but we're both *too strict* and *too permissive* in different ways ##Examples {placeholder} ##Related - https://github.com/apache/arrow/blob/d2ddee62329eb711572b4d71d6380673d7f7edd1/python/pyarrow/table.pxi#L4367-L4374 - https://github.com/apache/arrow/blob/d2ddee62329eb711572b4d71d6380673d7f7edd1/python/pyarrow/table.pxi#L1721-L1739 * update select * update select --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/__lib_pxi/table.pyi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 983abe74edc..c6a21cd67d9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -28,6 +28,7 @@ from typing import ( import numpy as np import pandas as pd +from numpy.typing import NDArray from pyarrow._compute import ( CastOptions, CountOptions, @@ -525,7 +526,7 @@ class RecordBatch(_Tabular[Array]): def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... def equals(self, other: Self, check_metadata: bool = False) -> bool: ... - def select(self, columns: list[str] | Indices) -> Self: ... + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... def cast( self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None ) -> Self: ... @@ -592,7 +593,7 @@ JoinType: TypeAlias = Literal[ class Table(_Tabular[ChunkedArray[Any]]): def validate(self, *, full=False) -> None: ... def slice(self, offset=0, length=None) -> Self: ... - def select(self, columns: list[str] | Indices) -> Self: ... + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... def replace_schema_metadata(self, metadata: dict | None = None) -> Self: ... def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... 
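For illustration, the `select` fix in PATCH 193 above widens the accepted column specifiers on `Table` and `RecordBatch` to `Iterable[str] | Iterable[int] | NDArray[np.str_]`. The sketch below shows the call shapes that annotation is meant to admit; it is a minimal example, and the table contents are invented purely for demonstration.

```python
import numpy as np
import pyarrow as pa

# Demo data only; any table with named columns would do.
tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [0.1, 0.2, 0.3]})

tbl.select(["a", "c"])            # Iterable[str]: column names
tbl.select([0, 2])                # Iterable[int]: column indices
tbl.select(("a", "b"))            # any iterable of names, not just a list
tbl.select(np.array(["a", "b"]))  # NDArray[np.str_], the narwhals use case cited above
```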
From 1163c93e925b0b26bc402f7abec4fb45441d35aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 08:28:33 +0000 Subject: [PATCH 194/231] [pre-commit.ci] pre-commit autoupdate (#220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.5 → v0.11.8](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.5...v0.11.8) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8e90c74f21c..efe8a1ce63b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.5 + rev: v0.11.8 hooks: - id: ruff args: [--fix] From 1fa9b2b5d16205de5cad25dd826764ecafba4c16 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 May 2025 17:27:42 +0800 Subject: [PATCH 195/231] feat: narrow scalar when type is given (#230) * rename Uint -> UInt * feat: narrow scalar when type is given --- pixi.lock | 2 +- pyarrow-stubs/__lib_pxi/array.pyi | 16 +- pyarrow-stubs/__lib_pxi/scalar.pyi | 366 ++++++++++++++++++++++++++++- pyarrow-stubs/__lib_pxi/types.pyi | 36 +-- pyarrow-stubs/compute.pyi | 6 +- pyarrow-stubs/types.pyi | 14 +- 6 files changed, 396 insertions(+), 44 deletions(-) diff --git a/pixi.lock b/pixi.lock index bc40f63a900..50b75afc1cc 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1161,7 +1161,7 @@ packages: - pypi: . name: pyarrow-stubs version: '19.2' - sha256: 5ad928ccb7c6a7b4338efbbd55445cb15619ac81d1d5e78ed4e313b544220466 + sha256: d7b0d9e3b3d3068254b52d20086e839023ab3a1c992ce8fa079fd0e4fe8a2164 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 3adeb4efadb..fa3c89b38d3 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -270,7 +270,7 @@ def array( @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.Uint16Type, + type: Literal["u2", "uint16"] | types.UInt16Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, @@ -290,7 +290,7 @@ def array( @overload def array( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.Uint64Type, + type: Literal["u8", "uint64"] | types.UInt64Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, @@ -521,12 +521,12 @@ def asarray( @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.Uint8Type, + type: Literal["u1", "uint8"] | types.UInt8Type, ) -> UInt8Array: ... @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.Uint16Type, + type: Literal["u2", "uint16"] | types.UInt16Type, ) -> UInt16Array: ... @overload def asarray( @@ -536,7 +536,7 @@ def asarray( @overload def asarray( values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.Uint64Type, + type: Literal["u8", "uint64"] | types.UInt64Type, ) -> UInt64Array: ... @overload def asarray( @@ -651,11 +651,11 @@ def nulls( ) -> Int64Array: ... 
@overload def nulls( - size: int, types: types.Uint8Type, memory_pool: MemoryPool | None = None + size: int, types: types.UInt8Type, memory_pool: MemoryPool | None = None ) -> UInt8Array: ... @overload def nulls( - size: int, types: types.Uint16Type, memory_pool: MemoryPool | None = None + size: int, types: types.UInt16Type, memory_pool: MemoryPool | None = None ) -> UInt16Array: ... @overload def nulls( @@ -663,7 +663,7 @@ def nulls( ) -> UInt32Array: ... @overload def nulls( - size: int, types: types.Uint64Type, memory_pool: MemoryPool | None = None + size: int, types: types.UInt64Type, memory_pool: MemoryPool | None = None ) -> UInt64Array: ... @overload def nulls( diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index c6d33e7ac3f..eea26f90770 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -108,21 +108,21 @@ NA = _NULL class NullScalar(Scalar[types.NullType]): ... class BooleanScalar(Scalar[types.BoolType]): ... -class UInt8Scalar(Scalar[types.Uint8Type]): ... +class UInt8Scalar(Scalar[types.UInt8Type]): ... class Int8Scalar(Scalar[types.Int8Type]): ... -class UInt16Scalar(Scalar[types.Uint16Type]): ... +class UInt16Scalar(Scalar[types.UInt16Type]): ... class Int16Scalar(Scalar[types.Int16Type]): ... class UInt32Scalar(Scalar[types.Uint32Type]): ... class Int32Scalar(Scalar[types.Int32Type]): ... -class UInt64Scalar(Scalar[types.Uint64Type]): ... +class UInt64Scalar(Scalar[types.UInt64Type]): ... class Int64Scalar(Scalar[types.Int64Type]): ... class HalfFloatScalar(Scalar[types.Float16Type]): ... class FloatScalar(Scalar[types.Float32Type]): ... class DoubleScalar(Scalar[types.Float64Type]): ... -class Decimal32Scalar(Scalar[types.Decimal32Type]): ... -class Decimal64Scalar(Scalar[types.Decimal64Type]): ... -class Decimal128Scalar(Scalar[types.Decimal128Type]): ... -class Decimal256Scalar(Scalar[types.Decimal256Type]): ... +class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... +class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... class Date32Scalar(Scalar[types.Date32Type]): ... class Date64Scalar(Scalar[types.Date64Type]): @@ -421,6 +421,358 @@ def scalar( memory_pool: MemoryPool | None = None, ) -> ListScalar[Any]: ... @overload +def scalar( + value: Any, + type: types.NullType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> NullScalar: ... +@overload +def scalar( + value: Any, + type: types.BoolType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: Any, + type: types.UInt8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt8Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int16Scalar: ... 
+@overload +def scalar( + value: Any, + type: types.Uint32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int32Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt64Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: Any, + type: types.Float16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> HalfFloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Any, + type: types.Date32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Date64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date64Scalar: ... +@overload +def scalar( + value: Any, + type: types.MonthDayNanoIntervalType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Any, + type: types.StringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeStringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeStringScalar: ... +@overload +def scalar( + value: Any, + type: types.StringViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringViewScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeBinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryViewScalar: ... +@overload +def scalar( + value: Any, + type: types.TimestampType[types._Unit, types._Tz], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[types._Unit, types._Tz]: ... +@overload +def scalar( + value: Any, + type: types.Time32Type[types._Time32Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time32Scalar[types._Time32Unit]: ... +@overload +def scalar( + value: Any, + type: types.Time64Type[types._Time64Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[types._Time64Unit]: ... 
+@overload +def scalar( + value: Any, + type: types.DurationType[types._Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[types._Unit]: ... +@overload +def scalar( + value: Any, + type: types.Decimal32Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal32Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal64Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal64Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal128Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal256Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal256Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.ListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.ListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.FixedSizeListType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListScalar: ... +@overload +def scalar( + value: Any, + type: types.DictionaryType[types._IndexT, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.MapType[types._K, types._ValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MapScalar[types._K, types._ValueT]: ... +@overload +def scalar( + value: Any, + type: types.StringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: Any, + type: types.UnionType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UnionScalar: ... +@overload +def scalar( + value: Any, + type: types.RunEndEncodedType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedScalar: ... +@overload +def scalar( + value: Any, + type: types.Bool8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Bool8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UuidType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UuidScalar: ... 
+@overload +def scalar( + value: Any, + type: types.JsonType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> JsonScalar: ... +@overload +def scalar( + value: Any, + type: types.OpaqueType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> OpaqueScalar: ... +@overload def scalar( value: Any, type: _DataTypeT, diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index b3d0ecf11fa..8118cb96309 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -60,13 +60,13 @@ class DataType(_Weakrefable): class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... -class Uint8Type(_BasicDataType[int]): ... +class UInt8Type(_BasicDataType[int]): ... class Int8Type(_BasicDataType[int]): ... -class Uint16Type(_BasicDataType[int]): ... +class UInt16Type(_BasicDataType[int]): ... class Int16Type(_BasicDataType[int]): ... class Uint32Type(_BasicDataType[int]): ... class Int32Type(_BasicDataType[int]): ... -class Uint64Type(_BasicDataType[int]): ... +class UInt64Type(_BasicDataType[int]): ... class Int64Type(_BasicDataType[int]): ... class Float16Type(_BasicDataType[float]): ... class Float32Type(_BasicDataType[float]): ... @@ -153,13 +153,13 @@ class DictionaryMemo(_Weakrefable): ... _IndexT = TypeVar( "_IndexT", - Uint8Type, + UInt8Type, Int8Type, - Uint16Type, + UInt16Type, Int16Type, Uint32Type, Int32Type, - Uint64Type, + UInt64Type, Int64Type, ) _BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) @@ -369,14 +369,14 @@ def field( ) -> Field[_DataTypeT]: ... def null() -> NullType: ... def bool_() -> BoolType: ... -def uint8() -> Uint8Type: ... +def uint8() -> UInt8Type: ... def int8() -> Int8Type: ... -def uint16() -> Uint16Type: ... +def uint16() -> UInt16Type: ... def int16() -> Int16Type: ... def uint32() -> Uint32Type: ... def int32() -> Int32Type: ... def int64() -> Int64Type: ... -def uint64() -> Uint64Type: ... +def uint64() -> UInt64Type: ... def tzinfo_to_string(tz: dt.tzinfo) -> str: ... def string_to_tzinfo(name: str) -> dt.tzinfo: ... @overload @@ -493,13 +493,13 @@ def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... @overload def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... @overload -def type_for_alias(name: Literal["u1", "uint8"]) -> Uint8Type: ... +def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... @overload -def type_for_alias(name: Literal["u2", "uint16"]) -> Uint16Type: ... +def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... @overload def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... @overload -def type_for_alias(name: Literal["u8", "uint64"]) -> Uint64Type: ... +def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... @overload def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... @overload @@ -567,13 +567,13 @@ def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... @overload def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... @overload -def ensure_type(ty: Literal["u1", "uint8"]) -> Uint8Type: ... +def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... @overload -def ensure_type(ty: Literal["u2", "uint16"]) -> Uint16Type: ... +def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... @overload def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... 
@overload -def ensure_type(ty: Literal["u8", "uint64"]) -> Uint64Type: ... +def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... @overload def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... @overload @@ -640,13 +640,13 @@ __all__ = [ "_BasicDataType", "NullType", "BoolType", - "Uint8Type", + "UInt8Type", "Int8Type", - "Uint16Type", + "UInt16Type", "Int16Type", "Uint32Type", "Int32Type", - "Uint64Type", + "UInt64Type", "Int64Type", "Float16Type", "Float32Type", diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 10114e4ea7b..3408d77b9c0 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -109,10 +109,10 @@ SignedIntegerScalar: TypeAlias = ( | lib.Scalar[lib.Int64Type] ) UnsignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.Uint8Type] - | lib.Scalar[lib.Uint16Type] + lib.Scalar[lib.UInt8Type] + | lib.Scalar[lib.UInt16Type] | lib.Scalar[lib.Uint32Type] - | lib.Scalar[lib.Uint64Type] + | lib.Scalar[lib.UInt64Type] ) IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar FloatScalar: TypeAlias = ( diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi index 6c85d3b0d8c..0cb4f6171d3 100644 --- a/pyarrow-stubs/types.pyi +++ b/pyarrow-stubs/types.pyi @@ -51,14 +51,14 @@ from pyarrow.lib import ( Time32Type, Time64Type, TimestampType, - Uint8Type, - Uint16Type, + UInt8Type, + UInt16Type, Uint32Type, - Uint64Type, + UInt64Type, ) _SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type -_UnsignedInteger: TypeAlias = Uint8Type | Uint16Type | Uint32Type | Uint64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type _Integer: TypeAlias = _SignedInteger | _UnsignedInteger _Floating: TypeAlias = Float16Type | Float32Type | Float64Type _Decimal: TypeAlias = ( @@ -92,10 +92,10 @@ def is_int8(t: DataType) -> TypeIs[Int8Type]: ... def is_int16(t: DataType) -> TypeIs[Int16Type]: ... def is_int32(t: DataType) -> TypeIs[Int32Type]: ... def is_int64(t: DataType) -> TypeIs[Int64Type]: ... -def is_uint8(t: DataType) -> TypeIs[Uint8Type]: ... -def is_uint16(t: DataType) -> TypeIs[Uint16Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... -def is_uint64(t: DataType) -> TypeIs[Uint64Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... def is_floating(t: DataType) -> TypeIs[_Floating]: ... def is_float16(t: DataType) -> TypeIs[Float16Type]: ... def is_float32(t: DataType) -> TypeIs[Float32Type]: ... From 8a2c16444f2293f6df224148e26d8250a74ebd5d Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 9 May 2025 17:30:00 +0800 Subject: [PATCH 196/231] release 19.3 (#231) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 50b75afc1cc..3525e896e1f 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1160,8 +1160,8 @@ packages: requires_python: '>=3.9' - pypi: . 
name: pyarrow-stubs - version: '19.2' - sha256: d7b0d9e3b3d3068254b52d20086e839023ab3a1c992ce8fa079fd0e4fe8a2164 + version: '19.3' + sha256: 2ee8116bcd4a422fb5bb8f78c2a46dea7a5dc000f384ae0e77cdb242787d66a8 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 69eba429636..345f08935bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "19.2" +version = "19.3" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 213230fe6aed839191d75724ff34d44d133a4bc7 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 14 May 2025 10:58:19 +0800 Subject: [PATCH 197/231] chore: pyright use strict mode (#233) * fix types * update array.pyi * update scalar.pyi * update * update array * update array * optimize chunked_array * optimizer iterchunks * update * update pyproject.toml --- pixi.lock | 456 ++++++++++++++++------------ pyarrow-stubs/__lib_pxi/array.pyi | 466 ++++++++++++++++++----------- pyarrow-stubs/__lib_pxi/ipc.pyi | 4 +- pyarrow-stubs/__lib_pxi/scalar.pyi | 150 ++++++---- pyarrow-stubs/__lib_pxi/table.pyi | 340 ++++++++++++++++----- pyarrow-stubs/__lib_pxi/types.pyi | 70 +++-- pyarrow-stubs/_csv.pyi | 10 +- pyarrow-stubs/_json.pyi | 4 +- pyarrow-stubs/compute.pyi | 66 ++-- pyproject.toml | 11 +- 10 files changed, 1002 insertions(+), 575 deletions(-) diff --git a/pixi.lock b/pixi.lock index 3525e896e1f..93a2e46d8a2 100644 --- a/pixi.lock +++ b/pixi.lock @@ -12,24 +12,26 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.2-hee588c1_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/python-3.11.12-h9e4cc4f_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl @@ -79,19 +81,20 @@ environments: osx-64: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.7.0-h240833e_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.6-h281671d_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.1-hd471939_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.49.2-hdb6dae5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.5.0-hc426f3f_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.12-h9ccd52b_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl @@ -141,19 +144,20 @@ environments: osx-arm64: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.7.0-h286801f_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.6-h1da3d7d_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.1-h39f12f2_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.49.2-h3f77e49_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.5.0-h81ee809_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.12-hc22306f_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl @@ -203,21 +207,21 @@ environments: win-64: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.0-he0c23c2_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.6-h537db12_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.49.2-h67fdade_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.5.0-ha4e3fda_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 + - conda: 
https://conda.anaconda.org/conda-forge/win-64/python-3.11.12-h3f84c4b_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl @@ -649,43 +653,100 @@ packages: purls: [] size: 669616 timestamp: 1727304687962 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 - sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e - md5: d645c6d2ac96843a2bfaccd2d62b3ac3 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda + sha256: 33ab03438aee65d6aa667cf7d90c91e5e7d734c19a67aa4c7040742c0a13d505 + md5: db0bfbe7dd197b68ad5f30333bae6ce0 depends: - - libgcc-ng >=9.4.0 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - expat 2.7.0.* + license: MIT + license_family: MIT + purls: [] + size: 74427 + timestamp: 1743431794976 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.7.0-h240833e_0.conda + sha256: 976f2e23ad2bb2b8e92c99bfa2ead3ad557b17a129b170f7e2dfcf233193dd7e + md5: 026d0a1056ba2a3dbbea6d4b08188676 + depends: + - __osx >=10.13 + constrains: + - expat 2.7.0.* + license: MIT + license_family: MIT + purls: [] + size: 71894 + timestamp: 1743431912423 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.7.0-h286801f_0.conda + sha256: ee550e44765a7bbcb2a0216c063dcd53ac914a7be5386dd0554bd06e6be61840 + md5: 6934bbb74380e045741eb8637641a65b + depends: + - __osx >=11.0 + constrains: + - expat 2.7.0.* + license: MIT + license_family: MIT + purls: [] + size: 65714 + timestamp: 1743431789879 +- conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.0-he0c23c2_0.conda + sha256: 1a227c094a4e06bd54e8c2f3ec40c17ff99dcf3037d812294f842210aa66dbeb + md5: b6f5352fdb525662f4169a0431d2dd7a + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + constrains: + - expat 2.7.0.* license: MIT license_family: MIT purls: [] - size: 58292 - timestamp: 1636488182923 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 - sha256: 7a2d27a936ceee6942ea4d397f9c7d136f12549d86f7617e8b6bad51e01a941f - md5: ccb34fb14960ad8b125962d3d79b31a9 + size: 140896 + timestamp: 1743432122520 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + sha256: 764432d32db45466e87f10621db5b74363a9f847d2b8b1f9743746cd160f06ab + md5: 
ede4673863426c0883c0063d853bbd85 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 license: MIT license_family: MIT purls: [] - size: 51348 - timestamp: 1636488394370 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2 - sha256: 41b3d13efb775e340e4dba549ab5c029611ea6918703096b2eaa9c015c0750ca - md5: 086914b672be056eb70fd4285b6783b6 + size: 57433 + timestamp: 1743434498161 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.6-h281671d_1.conda + sha256: 6394b1bc67c64a21a5cc73d1736d1d4193a64515152e861785c44d2cfc49edf3 + md5: 4ca9ea59839a9ca8df84170fab4ceb41 + depends: + - __osx >=10.13 + license: MIT + license_family: MIT + purls: [] + size: 51216 + timestamp: 1743434595269 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.6-h1da3d7d_1.conda + sha256: c6a530924a9b14e193ea9adfe92843de2a806d1b7dbfd341546ece9653129e60 + md5: c215a60c2935b517dcda8cad4705734d + depends: + - __osx >=11.0 license: MIT license_family: MIT purls: [] - size: 39020 - timestamp: 1636488587153 -- conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2 - sha256: 1951ab740f80660e9bc07d2ed3aefb874d78c107264fd810f24a1a6211d4b1a5 - md5: 2c96d1b6915b408893f9472569dee135 + size: 39839 + timestamp: 1743434670405 +- conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.6-h537db12_1.conda + sha256: d3b0b8812eab553d3464bbd68204f007f1ebadf96ce30eb0cbc5159f72e353f5 + md5: 85d8fa5e55ed8f93f874b3b23ed54ec6 depends: - - vc >=14.1,<15.0a0 - - vs2015_runtime >=14.16.27012 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 license: MIT license_family: MIT purls: [] - size: 42063 - timestamp: 1636489106777 + size: 44978 + timestamp: 1743435053850 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda sha256: 53eb8a79365e58849e7b1a068d31f4f9e718dc938d6f2c03e960345739a03569 md5: 3cb76c3f10d3bc7f1105b2fc9db984df @@ -720,6 +781,57 @@ packages: purls: [] size: 460992 timestamp: 1729027639220 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda + sha256: eeff241bddc8f1b87567dd6507c9f441f7f472c27f0860a07628260c000ef27c + md5: a76fd702c93cd2dfd89eff30a5fd45a8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - xz 5.8.1.* + - xz ==5.8.1=*_1 + license: 0BSD + purls: [] + size: 112845 + timestamp: 1746531470399 +- conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.1-hd471939_1.conda + sha256: 20a4c5291f3e338548013623bb1dc8ee2fba5dbac8f77acaddd730ed2a7d29b6 + md5: f87e8821e0e38a4140a7ed4f52530053 + depends: + - __osx >=10.13 + constrains: + - xz 5.8.1.* + - xz ==5.8.1=*_1 + license: 0BSD + purls: [] + size: 104814 + timestamp: 1746531577001 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.1-h39f12f2_1.conda + sha256: 5ab62c179229640c34491a7de806ad4ab7bec47ea2b5fc2136e3b8cf5ef26a57 + md5: 4e8ef3d79c97c9021b34d682c24c2044 + depends: + - __osx >=11.0 + constrains: + - xz 5.8.1.* + - xz ==5.8.1=*_1 + license: 0BSD + purls: [] + size: 92218 + timestamp: 1746531818330 +- conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_1.conda + sha256: adbf6c7bde70536ada734a81b8b5aa23654f2b95445204404622e0cc40e921a0 + md5: 14a1042c163181e143a7522dfb8ad6ab + depends: + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + constrains: + - xz 5.8.1.* + - xz ==5.8.1=*_1 + license: 0BSD + purls: [] + size: 104699 + timestamp: 1746531718026 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 @@ -730,48 +842,48 @@ packages: purls: [] size: 33408 timestamp: 1697359010159 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.46.1-hadc24fc_0.conda - sha256: 9851c049abafed3ee329d6c7c2033407e2fc269d33a75c071110ab52300002b0 - md5: 36f79405ab16bf271edb55b213836dac +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.2-hee588c1_0.conda + sha256: 525d4a0e24843f90b3ff1ed733f0a2e408aa6dd18b9d4f15465595e078e104a2 + md5: 93048463501053a00739215ea3f36324 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=13 - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 865214 - timestamp: 1725353659783 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.46.1-h4b8f8c9_0.conda - sha256: 1d075cb823f0cad7e196871b7c57961d669cbbb6cd0e798bf50cbf520dda65fb - md5: 84de0078b58f899fc164303b0603ff0e + size: 916313 + timestamp: 1746637007836 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.49.2-hdb6dae5_0.conda + sha256: 8fd9562478b4d1dc90ab2bcad5289ee2b5a971ca8ad87e6b137ce0ca53bf801d + md5: 9377ba1ade655ea3fc831b456f4a2351 depends: - __osx >=10.13 - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 908317 - timestamp: 1725353652135 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.1-hc14010f_0.conda - sha256: 3725f962f490c5d44dae326d5f5b2e3c97f71a6322d914ccc85b5ddc2e50d120 - md5: 58050ec1724e58668d0126a1615553fa + size: 977388 + timestamp: 1746637093883 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.49.2-h3f77e49_0.conda + sha256: d89f979497cf56eccb099b6ab9558da7bba1f1ba264f50af554e0ea293d9dcf9 + md5: 85f443033cd5b3df82b5cabf79bddb09 depends: - __osx >=11.0 - libzlib >=1.3.1,<2.0a0 license: Unlicense purls: [] - size: 829500 - timestamp: 1725353720793 -- conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.1-h2466b09_0.conda - sha256: ef83f90961630bc54a95e48062b05cf9c9173a822ea01784288029613a45eea4 - md5: 8a7c1ad01f58623bfbae8d601db7cf3b + size: 899462 + timestamp: 1746637228408 +- conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.49.2-h67fdade_0.conda + sha256: 1612baa49124ec1972b085ab9d6bf1855c5f38e8f16e8d8f96c193d6e11688b2 + md5: a3900c97ba9e03332e9a911fe63f7d64 depends: - ucrt >=10.0.20348.0 - vc >=14.2,<15 - vc14_runtime >=14.29.30139 license: Unlicense purls: [] - size: 876666 - timestamp: 1725354171439 + size: 1081123 + timestamp: 1746637406471 - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 md5: 40b61aab5c7ba9ff276c41cfffe6b80b @@ -782,6 +894,15 @@ packages: purls: [] size: 33601 timestamp: 1680112270483 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c + md5: 5aa797f8787fe7a17d1b0821485b5adc + depends: + - libgcc-ng >=12 + license: LGPL-2.1-or-later + purls: [] + size: 100393 + timestamp: 1702724383534 - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 md5: edb0dca6bc32e4f4789199455a1dbeb8 @@ -965,9 +1086,9 @@ packages: version: 2.1.2 sha256: f1eb068ead09f4994dec71c24b2844f1e4e4e013b9629f812f292f04bd1510d9 requires_python: 
'>=3.10' -- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda - sha256: cee91036686419f6dd6086902acf7142b4916e1c4ba042e9ca23e151da012b6d - md5: 4d638782050ab6faa27275bed57e9b4e +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda + sha256: b4491077c494dbf0b5eaa6d87738c22f2154e9277e5293175ec187634bd808a0 + md5: de356753cfdbffcde5bb1e86e3aa6cd0 depends: - __glibc >=2.17,<3.0.a0 - ca-certificates @@ -975,33 +1096,33 @@ packages: license: Apache-2.0 license_family: Apache purls: [] - size: 2891789 - timestamp: 1725410790053 -- conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.3.2-hd23fc13_0.conda - sha256: 2b75d4b56e45992adf172b158143742daeb316c35274b36f385ccb6644e93268 - md5: 2ff47134c8e292868a4609519b1ea3b6 + size: 3117410 + timestamp: 1746223723843 +- conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.5.0-hc426f3f_1.conda + sha256: bcac94cb82a458b4e3164da8d9bced08cc8c3da2bc3bd7330711a3689c1464a5 + md5: 919faa07b9647beb99a0e7404596a465 depends: - __osx >=10.13 - ca-certificates license: Apache-2.0 license_family: Apache purls: [] - size: 2544654 - timestamp: 1725410973572 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.2-h8359307_0.conda - sha256: 940fa01c4dc6152158fe8943e05e55a1544cab639df0994e3b35937839e4f4d1 - md5: 1773ebccdc13ec603356e8ff1db9e958 + size: 2739181 + timestamp: 1746224401118 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.5.0-h81ee809_1.conda + sha256: 73d366c1597a10bcd5f3604b5f0734b31c23225536e03782c6a13f9be9d01bff + md5: 5c7aef00ef60738a14e0e612cfc5bcde depends: - __osx >=11.0 - ca-certificates license: Apache-2.0 license_family: Apache purls: [] - size: 2882450 - timestamp: 1725410638874 -- conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.2-h2466b09_0.conda - sha256: a45c42f3577294e22ac39ddb6ef5a64fd5322e8a6725afefbf4f2b4109340bf9 - md5: 1dc86753693df5e3326bb8a85b74c589 + size: 3064197 + timestamp: 1746223530698 +- conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.5.0-ha4e3fda_1.conda + sha256: 02846553d2a4c9bde850c60824d0f02803eb9c9b674d5c1a8cce25bc387e748f + md5: 72c07e46b6766bb057018a9a74861b89 depends: - ca-certificates - ucrt >=10.0.20348.0 @@ -1010,8 +1131,8 @@ packages: license: Apache-2.0 license_family: Apache purls: [] - size: 8396053 - timestamp: 1725412961673 + size: 9025176 + timestamp: 1746227349882 - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl name: packaging version: '24.1' @@ -1161,7 +1282,7 @@ packages: - pypi: . 
name: pyarrow-stubs version: '19.3' - sha256: 2ee8116bcd4a422fb5bb8f78c2a46dea7a5dc000f384ae0e77cdb242787d66a8 + sha256: 58bce9ff799d39ea412fbed515fab1f8d7d309f9705b3333d8336113cdb4dc98 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' @@ -1185,93 +1306,99 @@ packages: - twine>=3.4.1 ; extra == 'dev' - nodejs-wheel-binaries ; extra == 'nodejs' requires_python: '>=3.7' -- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.0-he550d4f_1_cpython.conda - build_number: 1 - sha256: 464f998e406b645ba34771bb53a0a7c2734e855ee78dd021aa4dedfdb65659b7 - md5: 8d14fc2aa12db370a443753c8230be1e +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.12-h9e4cc4f_0_cpython.conda + sha256: 028a03968eb101a681fa4966b2c52e93c8db1e934861f8d108224f51ba2c1bc9 + md5: b61d4fbf583b8393d9d00ec106ad3658 depends: + - __glibc >=2.17,<3.0.a0 - bzip2 >=1.0.8,<2.0a0 - ld_impl_linux-64 >=2.36.1 - - libffi >=3.4,<4.0a0 - - libgcc-ng >=12 - - libnsl >=2.0.0,<2.1.0a0 - - libsqlite >=3.40.0,<4.0a0 - - libuuid >=2.32.1,<3.0a0 - - libzlib >=1.2.13,<2.0.0a0 - - ncurses >=6.3,<7.0a0 - - openssl >=3.0.7,<4.0a0 - - readline >=8.1.2,<9.0a0 - - tk >=8.6.12,<8.7.0a0 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - libgcc >=13 + - liblzma >=5.8.1,<6.0a0 + - libnsl >=2.0.1,<2.1.0a0 + - libsqlite >=3.49.1,<4.0a0 + - libuuid >=2.38.1,<3.0a0 + - libxcrypt >=4.4.36 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.0,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 - tzdata - - xz >=5.2.6,<6.0a0 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 31476523 - timestamp: 1673700777998 -- conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.0-he7542f4_1_cpython.conda - build_number: 1 - sha256: 5c069c9908e48a4490a56d3752c0bc93c2fc93ab8d8328efc869fdc707618e9f - md5: 9ecfa530b33aefd0d22e0272336f638a + size: 30545496 + timestamp: 1744325586785 +- conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.12-h9ccd52b_0_cpython.conda + sha256: fcd4b8a9a206940321d1d6569ddac2e99f359f0d5864e48140374a85aed5c27f + md5: cfa36957cba60dca8e79a974d09b6a2c depends: + - __osx >=10.13 - bzip2 >=1.0.8,<2.0a0 - - libffi >=3.4,<4.0a0 - - libsqlite >=3.40.0,<4.0a0 - - libzlib >=1.2.13,<2.0.0a0 - - ncurses >=6.3,<7.0a0 - - openssl >=3.0.7,<4.0a0 - - readline >=8.1.2,<9.0a0 - - tk >=8.6.12,<8.7.0a0 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - liblzma >=5.8.1,<6.0a0 + - libsqlite >=3.49.1,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.0,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 - tzdata - - xz >=5.2.6,<6.0a0 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 15410083 - timestamp: 1673762717308 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.0-h3ba56d0_1_cpython.conda - build_number: 1 - sha256: 28a54d78cd2624a12bd2ceb0f1816b0cba9b4fd97df846b5843b3c1d51642ab2 - md5: 2aa7ca3702d9afd323ca34a9d98879d1 + size: 15467842 + timestamp: 1744324543915 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.12-hc22306f_0_cpython.conda + sha256: ea91eb5bc7160cbc6f8110702f9250c87e378ff1dc83ab8daa8ae7832fb5d0de + md5: 6ab5f6a9e85f1b1848b6518e7eea63ee depends: + - __osx >=11.0 - bzip2 >=1.0.8,<2.0a0 - - libffi >=3.4,<4.0a0 - - libsqlite >=3.40.0,<4.0a0 - - libzlib >=1.2.13,<2.0.0a0 - - ncurses >=6.3,<7.0a0 - - openssl >=3.0.7,<4.0a0 - - readline >=8.1.2,<9.0a0 - - tk >=8.6.12,<8.7.0a0 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - 
liblzma >=5.8.1,<6.0a0 + - libsqlite >=3.49.1,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.0,<4.0a0 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 - tzdata - - xz >=5.2.6,<6.0a0 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 14492975 - timestamp: 1673699560906 -- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2 - sha256: 20d1f1b5dc620b745c325844545fd5c0cdbfdb2385a0e27ef1507399844c8c6d - md5: 13ee3577afc291dabd2d9edc59736688 + size: 13584762 + timestamp: 1744323773319 +- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.12-h3f84c4b_0_cpython.conda + sha256: 41e1c07eecff9436b9bb27724822229b2da6073af8461ede6c81b508c0677c56 + md5: c1f91331274f591340e2f50e737dfbe9 depends: - bzip2 >=1.0.8,<2.0a0 - - libffi >=3.4.2,<3.5.0a0 - - libsqlite >=3.39.4,<4.0a0 - - libzlib >=1.2.13,<2.0.0a0 - - openssl >=3.0.5,<4.0a0 - - tk >=8.6.12,<8.7.0a0 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - liblzma >=5.8.1,<6.0a0 + - libsqlite >=3.49.1,<4.0a0 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.0,<4.0a0 + - tk >=8.6.13,<8.7.0a0 - tzdata - - vc >=14.1,<15 - - vs2015_runtime >=14.16.27033 - - xz >=5.2.6,<5.3.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 constrains: - python_abi 3.11.* *_cp311 license: Python-2.0 purls: [] - size: 19819816 - timestamp: 1666678800085 + size: 18299489 + timestamp: 1744323460367 - pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl name: pyyaml version: 6.0.2 @@ -1724,16 +1851,6 @@ packages: - setuptools>=68 ; extra == 'test' - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_22.conda - sha256: 80aa9932203d65a96f817b8be4fafc176fb2b3fe6cf6899ede678b8f0317fbff - md5: 8c6b061d44cafdfc8e8c6eb5f100caf0 - depends: - - vc14_runtime >=14.40.33810 - license: BSD-3-Clause - license_family: BSD - purls: [] - size: 17453 - timestamp: 1728400827536 - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl name: wcwidth version: 0.2.13 @@ -1751,36 +1868,3 @@ packages: - pkg:pypi/wheel?source=hash-mapping size: 58585 timestamp: 1722797131787 -- conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 - sha256: 03a6d28ded42af8a347345f82f3eebdd6807a08526d47899a42d62d319609162 - md5: 2161070d867d1b1204ea749c8eec4ef0 - depends: - - libgcc-ng >=12 - license: LGPL-2.1 and GPL-2.0 - purls: [] - size: 418368 - timestamp: 1660346797927 -- conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 - sha256: eb09823f34cc2dd663c0ec4ab13f246f45dcd52e5b8c47b9864361de5204a1c8 - md5: a72f9d4ea13d55d745ff1ed594747f10 - license: LGPL-2.1 and GPL-2.0 - purls: [] - size: 238119 - timestamp: 1660346964847 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 - sha256: 59d78af0c3e071021cfe82dc40134c19dab8cdf804324b62940f5c8cd71803ec - md5: 39c6b54e94014701dd157f4f576ed211 - license: LGPL-2.1 and GPL-2.0 - purls: [] - size: 235693 - timestamp: 1660346961024 -- conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 - sha256: 
54d9778f75a02723784dc63aff4126ff6e6749ba21d11a6d03c1f4775f269fe0 - md5: 515d77642eaa3639413c6b1bc3f94219 - depends: - - vc >=14.1,<15 - - vs2015_runtime >=14.16.27033 - license: LGPL-2.1 and GPL-2.0 - purls: [] - size: 217804 - timestamp: 1660346976440 diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index fa3c89b38d3..907468f2b2e 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -8,13 +8,8 @@ if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias from typing import ( Any, - Collection, Generic, Iterable, Iterator, @@ -48,7 +43,7 @@ from pyarrow.lib import ( from . import scalar, types from .device import DeviceAllocationType -from .scalar import Scalar +from .scalar import NullableCollection, Scalar from .types import ( DataType, Field, @@ -56,20 +51,15 @@ from .types import ( _AsPyType, _BasicDataType, _BasicValueT, - _DataType_CoT, _DataTypeT, _IndexT, _RunEndType, _Size, ) -_T = TypeVar("_T") - -NullableIterable: TypeAlias = Iterable[_T | None] - -@overload # type: ignore[overload-overlap] +@overload def array( - values: NullableIterable[bool], + values: NullableCollection[bool], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -79,7 +69,7 @@ def array( ) -> BooleanArray: ... @overload def array( - values: NullableIterable[int], + values: NullableCollection[int], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -89,7 +79,7 @@ def array( ) -> Int64Array: ... @overload def array( - values: NullableIterable[float], + values: NullableCollection[float], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -99,7 +89,7 @@ def array( ) -> DoubleArray: ... @overload def array( - values: NullableIterable[Decimal], + values: NullableCollection[Decimal], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -109,7 +99,7 @@ def array( ) -> Decimal128Array: ... @overload def array( - values: NullableIterable[dict[str, Any]], + values: NullableCollection[dict[str, Any]], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -119,7 +109,7 @@ def array( ) -> StructArray: ... @overload def array( - values: NullableIterable[dt.date], + values: NullableCollection[dt.date], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -129,27 +119,27 @@ def array( ) -> Date32Array: ... @overload def array( - values: NullableIterable[dt.time], + values: NullableCollection[dt.time], type: None = None, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Time64Array: ... +) -> Time64Array[Literal["us"]]: ... @overload def array( - values: NullableIterable[dt.timedelta], + values: NullableCollection[dt.timedelta], type: None = None, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> DurationArray: ... +) -> DurationArray[Literal["us"]]: ... @overload def array( - values: NullableIterable[MonthDayNano], + values: NullableCollection[MonthDayNano], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -159,7 +149,7 @@ def array( ) -> MonthDayNanoIntervalArray: ... 
@overload def array( - values: NullableIterable[str], + values: NullableCollection[str], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -169,7 +159,7 @@ def array( ) -> StringArray: ... @overload def array( - values: NullableIterable[bytes], + values: NullableCollection[bytes], type: None = None, mask: Mask | None = None, size: int | None = None, @@ -179,27 +169,27 @@ def array( ) -> BinaryArray: ... @overload def array( - values: NullableIterable[list], + values: NullableCollection[list[Any]], type: None = None, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> ListArray: ... +) -> ListArray[Any]: ... @overload def array( - values: NullableIterable[_Scalar_CoT], + values: NullableCollection[_ScalarT], type: None = None, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Array[_Scalar_CoT]: ... +) -> Array[_ScalarT]: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["null"] | types.NullType, mask: Mask | None = None, size: int | None = None, @@ -209,7 +199,7 @@ def array( ) -> NullArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["bool", "boolean"] | types.BoolType, mask: Mask | None = None, size: int | None = None, @@ -219,7 +209,7 @@ def array( ) -> BooleanArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i1", "int8"] | types.Int8Type, mask: Mask | None = None, size: int | None = None, @@ -229,7 +219,7 @@ def array( ) -> Int8Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i2", "int16"] | types.Int16Type, mask: Mask | None = None, size: int | None = None, @@ -239,7 +229,7 @@ def array( ) -> Int16Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i4", "int32"] | types.Int32Type, mask: Mask | None = None, size: int | None = None, @@ -249,7 +239,7 @@ def array( ) -> Int32Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i8", "int64"] | types.Int64Type, mask: Mask | None = None, size: int | None = None, @@ -259,8 +249,8 @@ def array( ) -> Int64Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.Int8Type, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, @@ -269,7 +259,7 @@ def array( ) -> UInt8Array: ... 
@overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u2", "uint16"] | types.UInt16Type, mask: Mask | None = None, size: int | None = None, @@ -279,7 +269,7 @@ def array( ) -> UInt16Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u4", "uint32"] | types.Uint32Type, mask: Mask | None = None, size: int | None = None, @@ -289,7 +279,7 @@ def array( ) -> UInt32Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u8", "uint64"] | types.UInt64Type, mask: Mask | None = None, size: int | None = None, @@ -299,7 +289,7 @@ def array( ) -> UInt64Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f2", "halffloat", "float16"] | types.Float16Type, mask: Mask | None = None, size: int | None = None, @@ -309,7 +299,7 @@ def array( ) -> HalfFloatArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f4", "float", "float32"] | types.Float32Type, mask: Mask | None = None, size: int | None = None, @@ -319,7 +309,7 @@ def array( ) -> FloatArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f8", "double", "float64"] | types.Float64Type, mask: Mask | None = None, size: int | None = None, @@ -329,7 +319,7 @@ def array( ) -> DoubleArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string", "str", "utf8"] | types.StringType, mask: Mask | None = None, size: int | None = None, @@ -339,7 +329,7 @@ def array( ) -> StringArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary"] | types.BinaryType, mask: Mask | None = None, size: int | None = None, @@ -349,7 +339,7 @@ def array( ) -> BinaryArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, mask: Mask | None = None, size: int | None = None, @@ -359,7 +349,7 @@ def array( ) -> LargeStringArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_binary"] | types.LargeBinaryType, mask: Mask | None = None, size: int | None = None, @@ -369,7 +359,7 @@ def array( ) -> LargeBinaryArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary_view"] | types.BinaryViewType, mask: Mask | None = None, size: int | None = None, @@ -379,7 +369,7 @@ def array( ) -> BinaryViewArray: ... 
@overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string_view"] | types.StringViewType, mask: Mask | None = None, size: int | None = None, @@ -389,7 +379,7 @@ def array( ) -> StringViewArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["date32", "date32[day]"] | types.Date32Type, mask: Mask | None = None, size: int | None = None, @@ -399,7 +389,7 @@ def array( ) -> Date32Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["date64", "date64[ms]"] | types.Date64Type, mask: Mask | None = None, size: int | None = None, @@ -409,48 +399,117 @@ def array( ) -> Date64Array: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]", "time32[ms]"] | types.Time32Type, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["us"]]: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Time32Array: ... +) -> DurationArray[Literal["s"]]: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]", "time64[ns]"] | types.Time64Type, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> Time64Array: ... +) -> DurationArray[Literal["ms"]]: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]"] | types.TimestampType, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> TimestampArray: ... +) -> DurationArray[Literal["us"]]: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"] - | types.DurationType, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], mask: Mask | None = None, size: int | None = None, from_pandas: bool | None = None, safe: bool = True, memory_pool: MemoryPool | None = None, -) -> DurationArray: ... +) -> DurationArray[Literal["ns"]]: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, mask: Mask | None = None, size: int | None = None, @@ -460,7 +519,7 @@ def array( ) -> MonthDayNanoIntervalArray: ... @overload def array( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: _DataTypeT, mask: Mask | None = None, size: int | None = None, @@ -469,160 +528,198 @@ def array( memory_pool: MemoryPool | None = None, ) -> Array[Scalar[_DataTypeT]]: ... @overload -def asarray(values: NullableIterable[bool]) -> BooleanArray: ... +def asarray(values: NullableCollection[bool]) -> BooleanArray: ... @overload -def asarray(values: NullableIterable[int]) -> Int64Array: ... +def asarray(values: NullableCollection[int]) -> Int64Array: ... @overload -def asarray(values: NullableIterable[float]) -> DoubleArray: ... +def asarray(values: NullableCollection[float]) -> DoubleArray: ... @overload -def asarray(values: NullableIterable[Decimal]) -> Decimal128Array: ... +def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... @overload -def asarray(values: NullableIterable[dict[str, Any]]) -> StructArray: ... +def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... @overload -def asarray(values: NullableIterable[dt.date]) -> Date32Array: ... +def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... 
@overload -def asarray(values: NullableIterable[dt.time]) -> Time64Array: ... +def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... @overload -def asarray(values: NullableIterable[dt.timedelta]) -> DurationArray: ... +def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... @overload -def asarray(values: NullableIterable[MonthDayNano]) -> MonthDayNanoIntervalArray: ... +def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... @overload -def asarray(values: NullableIterable[list]) -> ListArray: ... +def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["null"] | types.NullType, ) -> NullArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["bool", "boolean"] | types.BoolType, ) -> BooleanArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i1", "int8"] | types.Int8Type, ) -> Int8Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i2", "int16"] | types.Int16Type, ) -> Int16Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i4", "int32"] | types.Int32Type, ) -> Int32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["i8", "int64"] | types.Int64Type, ) -> Int64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u1", "uint8"] | types.UInt8Type, ) -> UInt8Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u2", "uint16"] | types.UInt16Type, ) -> UInt16Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u4", "uint32"] | types.Uint32Type, ) -> UInt32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["u8", "uint64"] | types.UInt64Type, ) -> UInt64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f2", "halffloat", "float16"] | types.Float16Type, ) -> HalfFloatArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f4", "float", "float32"] | types.Float32Type, ) -> FloatArray: ... 
@overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["f8", "double", "float64"] | types.Float64Type, ) -> DoubleArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string", "str", "utf8"] | types.StringType, ) -> StringArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary"] | types.BinaryType, ) -> BinaryArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, ) -> LargeStringArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["large_binary"] | types.LargeBinaryType, ) -> LargeBinaryArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["binary_view"] | types.BinaryViewType, ) -> BinaryViewArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["string_view"] | types.StringViewType, ) -> StringViewArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["date32", "date32[day]"] | types.Date32Type, ) -> Date32Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["date64", "date64[ms]"] | types.Date64Type, ) -> Date64Array: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]", "time32[ms]"] | types.Time32Type, -) -> Time32Array: ... + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> Time32Array[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> Time32Array[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> Time64Array[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> Time64Array[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> TimestampArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> TimestampArray[Literal["ms"]]: ... 
@overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]", "time64[ns]"] | types.Time64Type, -) -> Time64Array: ... + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> TimestampArray[Literal["us"]]: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"] - | types.TimestampType, -) -> TimestampArray: ... + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> TimestampArray[Literal["ns"]]: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]", "duration[ms]", "duration[us]", "duration[ns]"] - | types.DurationType, -) -> DurationArray: ... + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> DurationArray[Literal["s"]]: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> DurationArray[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> DurationArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> DurationArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, ) -> MonthDayNanoIntervalArray: ... @overload def asarray( - values: Iterable | SupportArrowArray | SupportArrowDeviceArray, + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: _DataTypeT, ) -> Array[Scalar[_DataTypeT]]: ... @overload @@ -703,20 +800,22 @@ def nulls( ) -> Date64Array: ... @overload def nulls( - size: int, types: types.Time32Type, memory_pool: MemoryPool | None = None -) -> Time32Array: ... + size: int, types: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... @overload def nulls( - size: int, types: types.Time64Type, memory_pool: MemoryPool | None = None -) -> Time64Array: ... + size: int, types: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None +) -> Time64Array[types._Time64Unit]: ... @overload def nulls( - size: int, types: types.TimestampType, memory_pool: MemoryPool | None = None -) -> TimestampArray: ... + size: int, + types: types.TimestampType[types._Unit, types._Tz], + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... @overload def nulls( - size: int, types: types.DurationType, memory_pool: MemoryPool | None = None -) -> DurationArray: ... + size: int, types: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None +) -> DurationArray[types._Unit]: ... 
@overload def nulls( size: int, types: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None @@ -766,33 +865,33 @@ def nulls( @overload def nulls( size: int, - types: types.ListType[_DataTypeT], + types: types.LargeListType[_DataTypeT], memory_pool: MemoryPool | None = None, -) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +) -> LargeListArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.FixedSizeListType[_DataTypeT, _Size], + types: types.ListViewType[_DataTypeT], memory_pool: MemoryPool | None = None, -) -> FixedSizeListArray[_DataTypeT, _Size]: ... +) -> ListViewArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.LargeListType[_DataTypeT], + types: types.LargeListViewType[_DataTypeT], memory_pool: MemoryPool | None = None, -) -> LargeListArray[_DataTypeT]: ... +) -> LargeListViewArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.ListViewType[_DataTypeT], + types: types.FixedSizeListType[_DataTypeT, _Size], memory_pool: MemoryPool | None = None, -) -> ListViewArray[_DataTypeT]: ... +) -> FixedSizeListArray[_DataTypeT, _Size]: ... @overload def nulls( size: int, - types: types.LargeListViewType[_DataTypeT], + types: types.ListType[_DataTypeT], memory_pool: MemoryPool | None = None, -) -> LargeListViewArray[_DataTypeT]: ... +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... @overload def nulls( size: int, @@ -826,9 +925,9 @@ def nulls( @overload def nulls( size: int, - types: types.FixedShapeTensorType, + types: types.FixedShapeTensorType[types._ValueT], memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray: ... +) -> FixedShapeTensorArray[Any]: ... @overload def nulls( size: int, @@ -840,25 +939,25 @@ def nulls( size: int, types: types.UuidType, memory_pool: MemoryPool | None = None, -) -> UuidArray: ... +) -> UuidArray[Any]: ... @overload def nulls( size: int, types: types.JsonType, memory_pool: MemoryPool | None = None, -) -> JsonArray: ... +) -> JsonArray[Any]: ... @overload def nulls( size: int, types: types.OpaqueType, memory_pool: MemoryPool | None = None, -) -> OpaqueArray: ... +) -> OpaqueArray[Any]: ... @overload def nulls( size: int, types: types.ExtensionType, memory_pool: MemoryPool | None = None, -) -> ExtensionArray: ... +) -> ExtensionArray[Any]: ... @overload def repeat( value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None @@ -917,11 +1016,11 @@ def repeat( ) -> Decimal32Array: ... @overload def repeat( - value: Decimal | scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None + value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None ) -> Decimal64Array: ... @overload def repeat( - value: Decimal | scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None + value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @overload def repeat( @@ -937,22 +1036,28 @@ def repeat( ) -> Date64Array: ... @overload def repeat( - value: scalar.Time32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Time32Array: ... + value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... @overload def repeat( - value: dt.time | scalar.Time64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Time64Array: ... + value: dt.time | scalar.Time64Scalar[types._Time64Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> Time64Array[types._Time64Unit]: ... 
@overload def repeat( - value: scalar.TimestampScalar, size: int, memory_pool: MemoryPool | None = None -) -> TimestampArray: ... + value: scalar.TimestampScalar[types._Unit, types._Tz], + size: int, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... @overload def repeat( - value: dt.timedelta | scalar.DurationScalar, size: int, memory_pool: MemoryPool | None = None -) -> DurationArray: ... + value: dt.timedelta | scalar.DurationScalar[types._Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> DurationArray[types._Unit]: ... @overload -def repeat( # type: ignore[overload-overlap] +def repeat( # pyright: ignore[reportOverlappingOverload] value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, size: int, memory_pool: MemoryPool | None = None, @@ -1001,7 +1106,7 @@ def repeat( ) -> StringViewArray: ... @overload def repeat( - value: list | tuple | scalar.ListScalar[_DataTypeT], + value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], size: int, memory_pool: MemoryPool | None = None, ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... @@ -1064,7 +1169,7 @@ def repeat( value: scalar.FixedShapeTensorScalar, size: int, memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray: ... +) -> FixedShapeTensorArray[Any]: ... @overload def repeat( value: scalar.Bool8Scalar, @@ -1076,26 +1181,26 @@ def repeat( value: scalar.UuidScalar, size: int, memory_pool: MemoryPool | None = None, -) -> UuidArray: ... +) -> UuidArray[Any]: ... @overload def repeat( value: scalar.JsonScalar, size: int, memory_pool: MemoryPool | None = None, -) -> JsonArray: ... +) -> JsonArray[Any]: ... @overload def repeat( value: scalar.OpaqueScalar, size: int, memory_pool: MemoryPool | None = None, -) -> OpaqueArray: ... +) -> OpaqueArray[Any]: ... @overload def repeat( value: scalar.ExtensionScalar, size: int, memory_pool: MemoryPool | None = None, -) -> ExtensionArray: ... -def infer_type(values: Iterable, mask: Mask, from_pandas: bool = False) -> DataType: ... +) -> ExtensionArray[Any]: ... +def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: ... _ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) @@ -1122,9 +1227,8 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): _CastAs = TypeVar("_CastAs", bound=DataType) _ScalarT = TypeVar("_ScalarT", bound=Scalar) -_Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) -class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): +class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def diff(self, other: Self) -> str: ... def cast( self, @@ -1134,9 +1238,9 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): memory_pool: MemoryPool | None = None, ) -> Array[Scalar[_CastAs]]: ... def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... - def sum(self, **kwargs) -> _Scalar_CoT: ... + def sum(self, **kwargs) -> _ScalarT: ... @property - def type(self: Array[Scalar[_DataType_CoT]]) -> _DataType_CoT: ... + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... def unique(self) -> Self: ... def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... @overload @@ -1165,7 +1269,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): buffers: list[Buffer], null_count: int = -1, offset=0, - children: Collection[Array[Scalar[_DataTypeT]]] | None = None, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, ) -> Array[Scalar[_DataTypeT]]: ... 
@property def null_count(self) -> int: ... @@ -1173,7 +1277,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): def nbytes(self) -> int: ... def get_total_buffer_size(self) -> int: ... def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_Scalar_CoT]: ... + def __iter__(self) -> Iterator[_ScalarT]: ... def to_string( self, *, @@ -1193,7 +1297,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... @overload - def __getitem__(self, key: int) -> _Scalar_CoT: ... + def __getitem__(self, key: int) -> _ScalarT: ... @overload def __getitem__(self, key: slice) -> Self: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... @@ -1262,9 +1366,9 @@ class BooleanArray(Array[scalar.BooleanScalar]): @property def true_count(self) -> int: ... -class NumericArray(Array[_Scalar_CoT]): ... -class IntegerArray(NumericArray[_Scalar_CoT]): ... -class FloatingPointArray(NumericArray[_Scalar_CoT]): ... +class NumericArray(Array[_ScalarT]): ... +class IntegerArray(NumericArray[_ScalarT]): ... +class FloatingPointArray(NumericArray[_ScalarT]): ... class Int8Array(IntegerArray[scalar.Int8Scalar]): ... class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... class Int16Array(IntegerArray[scalar.Int16Scalar]): ... @@ -1275,10 +1379,10 @@ class Int64Array(IntegerArray[scalar.Int64Scalar]): ... class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... class Date32Array(NumericArray[scalar.Date32Scalar]): ... class Date64Array(NumericArray[scalar.Date64Scalar]): ... -class TimestampArray(NumericArray[scalar.TimestampScalar]): ... -class Time32Array(NumericArray[scalar.Time32Scalar]): ... -class Time64Array(NumericArray[scalar.Time64Scalar]): ... -class DurationArray(NumericArray[scalar.DurationScalar]): ... +class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... +class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... +class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... +class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... @@ -1289,12 +1393,12 @@ class Decimal64Array(FixedSizeBinaryArray): ... class Decimal128Array(FixedSizeBinaryArray): ... class Decimal256Array(FixedSizeBinaryArray): ... -class BaseListArray(Array[_Scalar_CoT]): +class BaseListArray(Array[_ScalarT]): def flatten(self, recursive: bool = False) -> Array: ... def value_parent_indices(self) -> Int64Array: ... def value_lengths(self) -> Int32Array: ... -class ListArray(BaseListArray[_Scalar_CoT]): +class ListArray(BaseListArray[_ScalarT]): @overload @classmethod def from_arrays( @@ -1377,7 +1481,7 @@ class ListArray(BaseListArray[_Scalar_CoT]): @property def offsets(self) -> Int32Array: ... -class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @overload @classmethod def from_arrays( @@ -1405,7 +1509,7 @@ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): @property def offsets(self) -> Int64Array: ... 
-class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataType_CoT]]): +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): @overload @classmethod def from_arrays( @@ -1435,7 +1539,7 @@ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataType_CoT]]): @property def sizes(self) -> Int32Array: ... -class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @overload @classmethod def from_arrays( @@ -1465,7 +1569,7 @@ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataType_CoT]]): @property def sizes(self) -> Int64Array: ... -class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataType_CoT, _Size]]): +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): @overload @classmethod def from_arrays( @@ -1486,13 +1590,13 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataType_CoT, mask: Mask | None = None, ) -> FixedSizeListArray[_DataTypeT, _Size]: ... @property - def values(self) -> BaseListArray[scalar.ListScalar[_DataType_CoT]]: ... + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: ... _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): - @overload # type: ignore[override] + @overload @classmethod def from_arrays( cls, @@ -1506,7 +1610,7 @@ class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): ) -> MapArray[_MapKeyT, _MapItemT]: ... @overload @classmethod - def from_arrays( + def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] cls, offsets: Int64Array, values: Array, @@ -1531,14 +1635,14 @@ class UnionArray(Array[scalar.UnionScalar]): def from_dense( types: Int8Array, value_offsets: Int32Array, - children: Collection[Array], + children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, ) -> UnionArray: ... @staticmethod def from_sparse( types: Int8Array, - children: Collection[Array], + children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, ) -> UnionArray: ... @@ -1641,7 +1745,7 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal type: DataType | None = None, ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... @staticmethod - def from_buffers( + def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] type: DataType, length: int, buffers: list[Buffer], diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi index 6400ac033fa..59942db594f 100644 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -127,7 +127,7 @@ class RecordBatchReader(_Weakrefable): self, ) -> Iterator[RecordBatchWithMetadata]: ... def read_all(self) -> Table: ... - read_pandas = _ReadPandasMixin.read_pandas + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] def close(self) -> None: ... def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @@ -160,7 +160,7 @@ class _RecordBatchFileReader(_Weakrefable): get_record_batch = get_batch def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... def read_all(self) -> Table: ... 
- read_pandas = _ReadPandasMixin.read_pandas + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @property diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index eea26f90770..98a33dbd1f9 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -12,18 +12,17 @@ if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -from typing import Any, Generic, Iterator, Mapping, overload +from typing import Any, Generic, Iterator, Literal, Mapping, overload import numpy as np from pyarrow._compute import CastOptions from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable -from typing_extensions import TypeVar +from typing_extensions import Protocol, TypeVar from . import types from .types import ( _AsPyType, - _DataType_CoT, _DataTypeT, _Time32Unit, _Time64Unit, @@ -34,9 +33,9 @@ from .types import ( _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") -class Scalar(_Weakrefable, Generic[_DataType_CoT]): +class Scalar(_Weakrefable, Generic[_DataTypeT]): @property - def type(self) -> _DataType_CoT: ... + def type(self) -> _DataTypeT: ... @property def is_valid(self) -> bool: ... @overload @@ -170,39 +169,39 @@ class BinaryViewScalar(Scalar[types.BinaryViewType]): class StringViewScalar(Scalar[types.StringViewType]): def as_buffer(self) -> Buffer: ... -class ListScalar(Scalar[types.ListType[_DataType_CoT]]): +class ListScalar(Scalar[types.ListType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataType_CoT, types._Size]]): +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListScalar(Scalar[types.LargeListType[_DataType_CoT]]): +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class ListViewScalar(Scalar[types.ListViewType[_DataType_CoT]]): +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... -class LargeListViewScalar(Scalar[types.LargeListViewType[_DataType_CoT]]): +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): @property def values(self) -> Array | None: ... def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataType_CoT]: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... def __iter__(self) -> Iterator[Array]: ... 
class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): @@ -266,50 +265,83 @@ class FixedShapeTensorScalar(ExtensionScalar): _V = TypeVar("_V") -CollectionValue: TypeAlias = list[_V | None] | tuple[_V | None, ...] | set[_V | None] +class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... @overload def scalar( - value: str, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: str, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> StringScalar: ... @overload def scalar( - value: bytes, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: bytes, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> BinaryScalar: ... @overload -def scalar( - value: bool, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None +def scalar( # pyright: ignore[reportOverlappingOverload] + value: bool, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> BooleanScalar: ... @overload def scalar( - value: int, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: int, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> Int64Scalar: ... @overload def scalar( - value: float, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: float, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> DoubleScalar: ... @overload def scalar( - value: Decimal, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: Decimal, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> Decimal128Scalar: ... @overload -def scalar( - value: dt.datetime, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None -) -> TimestampScalar: ... +def scalar( # pyright: ignore[reportOverlappingOverload] + value: dt.datetime, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[Literal["us"]]: ... @overload def scalar( - value: dt.date, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None + value: dt.date, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, ) -> Date32Scalar: ... @overload def scalar( - value: dt.time, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None -) -> Time64Scalar: ... + value: dt.time, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[Literal["us"]]: ... @overload def scalar( - value: dt.timedelta, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None -) -> DurationScalar: ... + value: dt.timedelta, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[Literal["us"]]: ... @overload -def scalar( +def scalar( # pyright: ignore[reportOverlappingOverload] value: MonthDayNano, *, from_pandas: bool | None = None, @@ -324,98 +356,84 @@ def scalar( ) -> StructScalar: ... @overload def scalar( - value: CollectionValue[str], + value: NullableCollection[str], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.StringType]]: ... 
@overload def scalar( - value: CollectionValue[bytes], + value: NullableCollection[bytes], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.BinaryType]]: ... @overload def scalar( - value: CollectionValue[bool], + value: NullableCollection[bool], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.BoolType]]: ... @overload def scalar( - value: CollectionValue[int], + value: NullableCollection[int], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Int64Type]]: ... @overload def scalar( - value: CollectionValue[float], + value: NullableCollection[float], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Float64Type]]: ... @overload def scalar( - value: CollectionValue[Decimal], + value: NullableCollection[Decimal], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Decimal32Type]]: ... @overload def scalar( - value: CollectionValue[Decimal], + value: NullableCollection[dt.datetime], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Decimal64Type]]: ... +) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... @overload def scalar( - value: CollectionValue[Decimal], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Decimal128Type]]: ... -@overload -def scalar( - value: CollectionValue[dt.datetime], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.TimestampType]]: ... -@overload -def scalar( - value: CollectionValue[dt.date], + value: NullableCollection[dt.date], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.Date32Type]]: ... @overload def scalar( - value: CollectionValue[dt.time], + value: NullableCollection[dt.time], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Time32Type]]: ... +) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... @overload def scalar( - value: CollectionValue[dt.timedelta], + value: NullableCollection[dt.timedelta], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.DurationType]]: ... +) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... @overload def scalar( - value: CollectionValue[MonthDayNano], + value: NullableCollection[MonthDayNano], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... @overload def scalar( - value: CollectionValue, + value: NullableCollection[Any], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -695,11 +713,11 @@ def scalar( @overload def scalar( value: Any, - type: types.FixedSizeListType, + type: types.FixedSizeListType[_DataTypeT, types._Size], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> FixedSizeListScalar: ... +) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... 
@overload def scalar( value: Any, @@ -719,7 +737,7 @@ def scalar( @overload def scalar( value: Any, - type: types.StringType, + type: types.StructType, *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, @@ -735,11 +753,11 @@ def scalar( @overload def scalar( value: Any, - type: types.RunEndEncodedType, + type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], *, from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, -) -> RunEndEncodedScalar: ... +) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... @overload def scalar( value: Any, diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index c6a21cd67d9..a80241f44b9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -43,7 +43,6 @@ from pyarrow._stubs_typing import ( NullEncoding, NullSelectionBehavior, Order, - PyScalar, SupportArrowArray, SupportArrowDeviceArray, SupportArrowStream, @@ -52,16 +51,16 @@ from pyarrow.compute import ArrayOrChunkedArray, Expression from pyarrow.interchange.dataframe import _PyArrowDataFrame from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema -from . import scalar -from .array import Array, StructArray, _CastAs, _PandasConvertible +from . import array, scalar, types +from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible from .device import DeviceAllocationType from .io import Buffer from .ipc import RecordBatchReader from .scalar import Int64Scalar, Scalar from .tensor import Tensor -from .types import _AsPyType, _BasicDataType, _DataType_CoT, _DataTypeT +from .types import _AsPyType, _BasicDataType, _DataTypeT -_Scalar_CoT = TypeVar("_Scalar_CoT", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) _Aggregation: TypeAlias = Literal[ "all", @@ -119,11 +118,11 @@ NullarySelector: TypeAlias = tuple[()] NarySelector: TypeAlias = list[str] | tuple[str, ...] ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector -class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): @property def data(self) -> Self: ... @property - def type(self: ChunkedArray[Scalar[_DataType_CoT]]) -> _DataType_CoT: ... + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... def length(self) -> int: ... __len__ = length def to_string( @@ -145,7 +144,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @overload def __getitem__(self, key: slice) -> Self: ... @overload - def __getitem__(self, key: int) -> _Scalar_CoT: ... + def __getitem__(self, key: int) -> _ScalarT: ... def getitem(self, i: int) -> Scalar: ... def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: ... def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: ... @@ -167,8 +166,8 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): ) -> ChunkedArray[Scalar[_CastAs]]: ... def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_CoT]: ... - def unique(self) -> ChunkedArray[_Scalar_CoT]: ... + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_ScalarT]: ... + def unique(self) -> ChunkedArray[_ScalarT]: ... 
def value_counts(self) -> StructArray: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: ... def filter( @@ -198,11 +197,182 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... @property def num_chunks(self) -> int: ... - def chunk(self, i: int) -> ChunkedArray[_Scalar_CoT]: ... + def chunk(self, i: int) -> ChunkedArray[_ScalarT]: ... @property - def chunks(self) -> list[Array[_Scalar_CoT]]: ... - def iterchunks(self) -> Generator[Array[_Scalar_CoT], None, None]: ... - def __iter__(self) -> Iterator[_Scalar_CoT]: ... + def chunks(self) -> list[Array[_ScalarT]]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.NullScalar], + ) -> Generator[array.NullArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BooleanScalar], + ) -> Generator[array.BooleanArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt8Scalar], + ) -> Generator[array.UInt8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int8Scalar], + ) -> Generator[array.Int8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt16Scalar], + ) -> Generator[array.UInt16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int16Scalar], + ) -> Generator[array.Int16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt32Scalar], + ) -> Generator[array.UInt32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int32Scalar], + ) -> Generator[array.Int32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt64Scalar], + ) -> Generator[array.UInt64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int64Scalar], + ) -> Generator[array.Int64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.HalfFloatScalar], + ) -> Generator[array.HalfFloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FloatScalar], + ) -> Generator[array.FloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DoubleScalar], + ) -> Generator[array.DoubleArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal32Scalar], + ) -> Generator[array.Decimal32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal64Scalar], + ) -> Generator[array.Decimal64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal128Scalar], + ) -> Generator[array.Decimal128Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal256Scalar], + ) -> Generator[array.Decimal256Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date32Scalar], + ) -> Generator[array.Date32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date64Scalar], + ) -> Generator[array.Date64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], + ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], + ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... 
+ @overload + def iterchunks( + self: ChunkedArray[scalar.DurationScalar[types._Unit]], + ) -> Generator[array.DurationArray[types._Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], + ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryScalar], + ) -> Generator[array.BinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeBinaryScalar], + ) -> Generator[array.LargeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeBinaryScalar], + ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringScalar], + ) -> Generator[array.StringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeStringScalar], + ) -> Generator[array.LargeStringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryViewScalar], + ) -> Generator[array.BinaryViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringViewScalar], + ) -> Generator[array.StringViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.ListScalar[_DataTypeT]], + ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], + ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], + ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], + ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StructScalar], + ) -> Generator[array.StructArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], + ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], + ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.RunEndEncodedScalar], + ) -> Generator[array.RunEndEncodedArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UnionScalar], + ) -> Generator[array.UnionArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Bool8Scalar], + ) -> Generator[array.Bool8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UuidScalar], + ) -> Generator[array.UuidArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.JsonScalar], + ) -> Generator[array.JsonArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.OpaqueScalar], + ) -> Generator[array.OpaqueArray, None, None]: ... + def __iter__(self) -> Iterator[_ScalarT]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], ) -> list[_AsPyType | None]: ... 
@@ -214,241 +384,247 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_CoT]): @overload def chunked_array( - values: Iterable[bool] | Iterable[int] | Iterable[float], + values: Iterable[NullableCollection[bool]], + type: None = None, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[int]], + type: None = None, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[float]], type: None = None, -) -> ( - ChunkedArray[scalar.BooleanScalar] - | ChunkedArray[scalar.Int64Scalar] - | ChunkedArray[scalar.DoubleScalar] -): ... +) -> ChunkedArray[scalar.DoubleScalar]: ... @overload def chunked_array( - values: Iterable[Decimal], + values: Iterable[NullableCollection[Decimal]], type: None = None, ) -> ChunkedArray[scalar.Decimal128Scalar]: ... @overload def chunked_array( - values: Iterable[dict[str, Any]], + values: Iterable[NullableCollection[dict[str, Any]]], type: None = None, ) -> ChunkedArray[scalar.StructScalar]: ... @overload def chunked_array( - values: Iterable[dt.datetime] | Iterable[dt.date], + values: Iterable[NullableCollection[dt.datetime]], type: None = None, -) -> ChunkedArray[scalar.TimestampScalar] | ChunkedArray[scalar.Date32Scalar]: ... +) -> ChunkedArray[scalar.TimestampScalar]: ... @overload def chunked_array( - values: Iterable[dt.time], + values: Iterable[NullableCollection[dt.date]], type: None = None, -) -> ChunkedArray[scalar.Time64Scalar]: ... +) -> ChunkedArray[scalar.Date32Scalar]: ... @overload def chunked_array( - values: Iterable[dt.timedelta], + values: Iterable[NullableCollection[dt.time]], type: None = None, -) -> ChunkedArray[scalar.DurationScalar]: ... +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... @overload def chunked_array( - values: Iterable[MonthDayNano], + values: Iterable[NullableCollection[dt.timedelta]], type: None = None, -) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... @overload def chunked_array( - values: Iterable[str], + values: Iterable[NullableCollection[MonthDayNano]], type: None = None, -) -> ChunkedArray[scalar.StringScalar]: ... +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... @overload def chunked_array( - values: Iterable[bytearray], + values: Iterable[NullableCollection[str]], type: None = None, -) -> ChunkedArray[scalar.BinaryScalar]: ... +) -> ChunkedArray[scalar.StringScalar]: ... @overload def chunked_array( - values: Iterable[list], + values: Iterable[NullableCollection[bytes]], type: None = None, -) -> ChunkedArray[scalar.ListScalar]: ... +) -> ChunkedArray[scalar.BinaryScalar]: ... @overload def chunked_array( - values: Iterable[_Scalar_CoT] | Iterable[Array[_Scalar_CoT]], + values: Iterable[NullableCollection[list[Any]]], type: None = None, -) -> ChunkedArray[_Scalar_CoT]: ... +) -> ChunkedArray[scalar.ListScalar[Any]]: ... @overload def chunked_array( - values: Iterable[PyScalar | None], + values: Iterable[Array[_ScalarT]], type: None = None, -) -> ChunkedArray[Any]: ... +) -> ChunkedArray[_ScalarT]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: _DataTypeT, ) -> ChunkedArray[Scalar[_DataTypeT]]: ... 
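The chunked_array overloads up to this point infer the scalar parameter from plain Python values or from already typed Arrays; the overloads that continue below map Arrow type-name strings ("int32", "string", ...) onto the same scalar types. A short sketch of both paths, assuming the stubs are active (the expected types are noted as comments only):

    import pyarrow as pa

    by_value = pa.chunked_array([["a", "b"], ["c"]])         # expected: ChunkedArray[StringScalar]
    by_name = pa.chunked_array([[1, 2], [3]], type="int32")  # expected: ChunkedArray[Int32Scalar]
    print(by_value.type, by_name.type)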
@overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["null"], ) -> ChunkedArray[scalar.NullScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["bool", "boolean"], ) -> ChunkedArray[scalar.BooleanScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["i1", "int8"], ) -> ChunkedArray[scalar.Int8Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["i2", "int16"], ) -> ChunkedArray[scalar.Int16Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["i4", "int32"], ) -> ChunkedArray[scalar.Int32Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["i8", "int64"], ) -> ChunkedArray[scalar.Int64Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["u1", "uint8"], ) -> ChunkedArray[scalar.UInt8Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["u2", "uint16"], ) -> ChunkedArray[scalar.UInt16Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["u4", "uint32"], ) -> ChunkedArray[scalar.UInt32Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["u8", "uint64"], ) -> ChunkedArray[scalar.UInt64Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["f2", "halffloat", "float16"], ) -> ChunkedArray[scalar.HalfFloatScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["f4", "float", "float32"], ) -> ChunkedArray[scalar.FloatScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["f8", "double", "float64"], ) -> ChunkedArray[scalar.DoubleScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["string", "str", "utf8"], ) -> ChunkedArray[scalar.StringScalar]: ... 
@overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["binary"], ) -> ChunkedArray[scalar.BinaryScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["large_string", "large_str", "large_utf8"], ) -> ChunkedArray[scalar.LargeStringScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["large_binary"], ) -> ChunkedArray[scalar.LargeBinaryScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["binary_view"], ) -> ChunkedArray[scalar.BinaryViewScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["string_view"], ) -> ChunkedArray[scalar.StringViewScalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["date32", "date32[day]"], ) -> ChunkedArray[scalar.Date32Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["date64", "date64[ms]"], ) -> ChunkedArray[scalar.Date64Scalar]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["time32[s]"], ) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["time32[ms]"], ) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["time64[us]"], ) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["time64[ns]"], ) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["timestamp[s]"], ) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["timestamp[ms]"], ) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["timestamp[us]"], ) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... 
@overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["timestamp[ns]"], ) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["duration[s]"], ) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["duration[ms]"], ) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["duration[us]"], ) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], type: Literal["duration[ns]"], ) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... @overload def chunked_array( - values: Iterable | SupportArrowStream | SupportArrowArray, + values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, type: Literal["month_day_nano_interval"], ) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... @@ -669,20 +845,20 @@ class Table(_Tabular[ChunkedArray[Any]]): def is_cpu(self) -> bool: ... def record_batch( - data: dict[str, list | Array] - | Collection[Array] + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] | pd.DataFrame | SupportArrowArray | SupportArrowDeviceArray, names: list[str] | None = None, schema: Schema | None = None, - metadata: Mapping | None = None, + metadata: Mapping[Any, Any] | None = None, ) -> RecordBatch: ... @overload def table( - data: dict[str, list | Array], + data: dict[str, list[Any] | Array[Any]], schema: Schema | None = None, - metadata: Mapping | None = None, + metadata: Mapping[Any, Any] | None = None, nthreads: int | None = None, ) -> Table: ... @overload @@ -694,14 +870,14 @@ def table( | SupportArrowDeviceArray, names: list[str] | None = None, schema: Schema | None = None, - metadata: Mapping | None = None, + metadata: Mapping[Any, Any] | None = None, nthreads: int | None = None, ) -> Table: ... def concat_tables( tables: Iterable[Table], memory_pool: MemoryPool | None = None, promote_options: Literal["none", "default", "permissive"] = "none", - **kwargs, + **kwargs: Any, ) -> Table: ... class TableGroupBy: diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 8118cb96309..998dca59f63 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -30,7 +30,6 @@ from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") _DataTypeT = TypeVar("_DataTypeT", bound=DataType) -_DataType_CoT = TypeVar("_DataType_CoT", bound=DataType, covariant=True) class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -81,7 +80,7 @@ class BinaryType(_BasicDataType[bytes]): ... class LargeBinaryType(_BasicDataType[bytes]): ... class BinaryViewType(_BasicDataType[bytes]): ... 
-_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"]) +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) _Tz = TypeVar("_Tz", str, None, default=None) class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): @@ -108,8 +107,8 @@ class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): class FixedSizeBinaryType(_BasicDataType[Decimal]): ... -_Precision = TypeVar("_Precision") -_Scale = TypeVar("_Scale") +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): @property @@ -135,17 +134,35 @@ class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): @property def scale(self) -> _Scale: ... -class ListType(DataType, Generic[_DataType_CoT]): +class ListType(DataType, Generic[_DataTypeT]): @property - def value_field(self) -> Field[_DataType_CoT]: ... + def value_field(self) -> Field[_DataTypeT]: ... @property - def value_type(self) -> _DataType_CoT: ... + def value_type(self) -> _DataTypeT: ... -class LargeListType(ListType[_DataType_CoT]): ... -class ListViewType(ListType[_DataType_CoT]): ... -class LargeListViewType(ListType[_DataType_CoT]): ... +class LargeListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + +class ListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... -class FixedSizeListType(ListType[_DataType_CoT], Generic[_DataType_CoT, _Size]): +class LargeListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... @property def list_size(self) -> _Size: ... @@ -196,7 +213,7 @@ class StructType(DataType): def get_all_field_indices(self, name: str) -> list[int]: ... def __len__(self) -> int: ... def __iter__(self) -> Iterator[Field]: ... - __getitem__ = field + __getitem__ = field # pyright: ignore[reportUnknownVariableType] @property def names(self) -> list[str]: ... @property @@ -210,7 +227,7 @@ class UnionType(DataType): def __len__(self) -> int: ... def __iter__(self) -> Iterator[Field]: ... def field(self, i: int) -> Field: ... - __getitem__ = field + __getitem__ = field # pyright: ignore[reportUnknownVariableType] class SparseUnionType(UnionType): @property @@ -290,7 +307,7 @@ def ensure_metadata( meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False ) -> KeyValueMetadata | None: ... -class Field(_Weakrefable, Generic[_DataType_CoT]): +class Field(_Weakrefable, Generic[_DataTypeT]): def equals(self, other: Field, check_metadata: bool = False) -> bool: ... def __hash__(self) -> int: ... @property @@ -300,12 +317,12 @@ class Field(_Weakrefable, Generic[_DataType_CoT]): @property def metadata(self) -> dict[bytes, bytes] | None: ... @property - def type(self) -> _DataType_CoT: ... + def type(self) -> _DataTypeT: ... def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ... def remove_metadata(self) -> None: ... def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... def with_name(self, name: str) -> Self: ... 
- def with_nullable(self, nullable: bool) -> Field[_DataType_CoT]: ... + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ... def flatten(self) -> list[Field]: ... def _export_to_c(self, out_ptr: int) -> None: ... @classmethod @@ -317,7 +334,7 @@ class Field(_Weakrefable, Generic[_DataType_CoT]): class Schema(_Weakrefable): def __len__(self) -> int: ... def __getitem__(self, key: str) -> Field: ... - _field = __getitem__ + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] def __iter__(self) -> Iterator[Field]: ... def __hash__(self) -> int: ... def __sizeof__(self) -> int: ... @@ -362,10 +379,10 @@ def unify_schemas( schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" ) -> Schema: ... @overload -def field(name: SupportArrowSchema) -> Field: ... +def field(name: SupportArrowSchema) -> Field[Any]: ... @overload def field( - name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict | None = None + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None ) -> Field[_DataTypeT]: ... def null() -> NullType: ... def bool_() -> BoolType: ... @@ -451,21 +468,22 @@ def dictionary( index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered ) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... def struct( - fields: Iterable[Field | tuple[str, Field] | tuple[str, DataType]] | Mapping[str, Field], + fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] + | Mapping[str, Field[Any]], ) -> StructType: ... def sparse_union( - child_fields: list[Field], type_codes: list[int] | None = None + child_fields: list[Field[Any]], type_codes: list[int] | None = None ) -> SparseUnionType: ... def dense_union( - child_fields: list[Field], type_codes: list[int] | None = None + child_fields: list[Field[Any]], type_codes: list[int] | None = None ) -> DenseUnionType: ... @overload def union( - child_fields: list[Field], mode: Literal["sparse"], type_codes: list[int] | None = None + child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None ) -> SparseUnionType: ... @overload def union( - child_fields: list[Field], mode: Literal["dense"], type_codes: list[int] | None = None + child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None ) -> DenseUnionType: ... def run_end_encoded( run_end_type: _RunEndType, value_type: _BasicValueT @@ -625,10 +643,10 @@ def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... @overload def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... def schema( - fields: Iterable[Field] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], + fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], metadata: dict[bytes | str, bytes | str] | None = None, ) -> Schema: ... -def from_numpy_dtype(dtype: np.dtype) -> DataType: ... +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: ... def is_boolean_value(obj: Any) -> bool: ... def is_integer_value(obj: Any) -> bool: ... def is_float_value(obj: Any) -> bool: ... 
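The Field, field(), struct()/union() and schema() signatures above are written so the concrete DataType stays visible in the inferred Field type. A minimal sketch of the intended usage, assuming these stubs (the noted inference is an expectation, not checker output):

    import pyarrow as pa

    f = pa.field("ts", pa.timestamp("ms"))
    # expected: Field[TimestampType[Literal["ms"], None]] under these stubs,
    # so f.type keeps its unit and timezone parameters
    print(f.type)

    s = pa.schema([("id", pa.int64()), ("ts", pa.timestamp("ms"))])
    print(s)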
diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index c2f71110134..45c8dcf5485 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import IO, Callable, Literal +from typing import IO, Any, Callable, Literal from _typeshed import StrPath @@ -67,7 +67,7 @@ class CSVWriter(lib._CRecordBatchWriter): def __init__( self, # TODO: OutputStream - sink: StrPath | IO, + sink: StrPath | IO[Any], schema: lib.Schema, write_options: WriteOptions | None = None, *, @@ -79,14 +79,14 @@ class CSVStreamingReader(lib.RecordBatchReader): ... ISO8601: lib._Weakrefable def open_csv( - input_file: StrPath | IO, + input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> CSVStreamingReader: ... def read_csv( - input_file: StrPath | IO, + input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, @@ -94,7 +94,7 @@ def read_csv( ) -> lib.Table: ... def write_csv( data: lib.RecordBatch | lib.Table, - output_file: StrPath | lib.NativeFile | IO, + output_file: StrPath | lib.NativeFile | IO[Any], write_options: WriteOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> None: ... diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index 30329416731..88d5a59d232 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -1,4 +1,4 @@ -from typing import IO, Literal +from typing import IO, Any, Literal from _typeshed import StrPath @@ -23,7 +23,7 @@ class ParseOptions(_Weakrefable): def equals(self, other: ParseOptions) -> bool: ... def read_json( - input_file: StrPath | IO, + input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 3408d77b9c0..db149f3b56f 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -141,10 +141,10 @@ ListScalar: TypeAlias = ( TemporalScalar: TypeAlias = ( lib.Date32Scalar | lib.Date64Scalar - | lib.Time32Scalar - | lib.Time64Scalar - | lib.TimestampScalar - | lib.DurationScalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] | lib.MonthDayNanoIntervalScalar ) NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar @@ -231,7 +231,7 @@ def first( memory_pool: lib.MemoryPool | None = None, ) -> _ScalarT: ... def first_last( - array: lib.Array | lib.ChunkedArray, + array: lib.Array[Any] | lib.ChunkedArray[Any], /, *, skip_nulls: bool = True, @@ -240,7 +240,7 @@ def first_last( memory_pool: lib.MemoryPool | None = None, ) -> lib.StructScalar: ... def index( - data: lib.Array | lib.ChunkedArray, + data: lib.Array[Any] | lib.ChunkedArray[Any], value, start: int | None = None, end: int | None = None, @@ -920,8 +920,16 @@ def logb( ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... @overload def logb( - x: FloatScalar | FloatArray, - b: FloatScalar | FloatArray, + x: FloatScalar, + b: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... 
+@overload +def logb( + x: FloatArray, + b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None, @@ -956,15 +964,27 @@ def atan2( ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... @overload def atan2( - y: FloatScalar | FloatArray, - x: FloatScalar | FloatArray, + y: FloatArray, + x: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatScalar, + x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... @overload def atan2( - y: Expression | Any, x: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None + y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def atan2( + y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... # ========================= 2.5 Comparisons functions ========================= @@ -2232,19 +2252,19 @@ day_of_year = _clone_signature(day) @overload def hour( - values: lib.TimestampScalar | lib.Time32Scalar | lib.Time64Scalar, + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], /, *, memory_pool: lib.MemoryPool | None = None, ) -> lib.Int64Scalar: ... @overload def hour( - values: lib.TimestampArray - | lib.Time32Array - | lib.Time64Array - | lib.ChunkedArray[lib.TimestampScalar] - | lib.ChunkedArray[lib.Time32Scalar] - | lib.ChunkedArray[lib.Time64Scalar], + values: lib.TimestampArray[Any] + | lib.Time32Array[Any] + | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]], /, *, memory_pool: lib.MemoryPool | None = None, @@ -2258,11 +2278,11 @@ def hour( ) -> Expression: ... @overload def is_dst( - values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.BooleanScalar: ... @overload def is_dst( - values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], /, *, memory_pool: lib.MemoryPool | None = None, @@ -2271,11 +2291,11 @@ def is_dst( def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... @overload def iso_week( - values: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Scalar: ... 
@overload def iso_week( - values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], /, *, memory_pool: lib.MemoryPool | None = None, @@ -2289,7 +2309,7 @@ iso_year = _clone_signature(iso_week) @overload def is_leap_year( - values: lib.TimestampScalar | lib.Date32Scalar | lib.Date64Scalar, + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, /, *, memory_pool: lib.MemoryPool | None = None, diff --git a/pyproject.toml b/pyproject.toml index 345f08935bd..a43dd3515d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,5 +86,12 @@ lines-between-types = 1 docstring-code-format = true [tool.pyright] -typeCheckingMode = "basic" -reportMissingImports = false +typeCheckingMode = "strict" +reportMissingImports = false +reportPrivateUsage = false +reportUnknownParameterType = false +reportMissingTypeArgument = false +reportMissingParameterType = false +reportMissingTypeStubs = false +reportUnknownVariableType = false +reportUnknownArgumentType = false From dfa7f01a562096d35f611b3e5b8c52482d6a4290 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 14 May 2025 11:23:29 +0800 Subject: [PATCH 198/231] fix: pa.nulls accept type rather than types (#234) --- pyarrow-stubs/__lib_pxi/array.pyi | 90 +++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 907468f2b2e..3f1b22188e2 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -736,226 +736,226 @@ def nulls( def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... @overload def nulls( - size: int, types: types.Int16Type, memory_pool: MemoryPool | None = None + size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None ) -> Int16Array: ... @overload def nulls( - size: int, types: types.Int32Type, memory_pool: MemoryPool | None = None + size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None ) -> Int32Array: ... @overload def nulls( - size: int, types: types.Int64Type, memory_pool: MemoryPool | None = None + size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None ) -> Int64Array: ... @overload def nulls( - size: int, types: types.UInt8Type, memory_pool: MemoryPool | None = None + size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None ) -> UInt8Array: ... @overload def nulls( - size: int, types: types.UInt16Type, memory_pool: MemoryPool | None = None + size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None ) -> UInt16Array: ... @overload def nulls( - size: int, types: types.Uint32Type, memory_pool: MemoryPool | None = None + size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None ) -> UInt32Array: ... @overload def nulls( - size: int, types: types.UInt64Type, memory_pool: MemoryPool | None = None + size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None ) -> UInt64Array: ... @overload def nulls( - size: int, types: types.Float16Type, memory_pool: MemoryPool | None = None + size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None ) -> HalfFloatArray: ... @overload def nulls( - size: int, types: types.Float32Type, memory_pool: MemoryPool | None = None + size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None ) -> FloatArray: ... 
@overload def nulls( - size: int, types: types.Float64Type, memory_pool: MemoryPool | None = None + size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None ) -> DoubleArray: ... @overload def nulls( - size: int, types: types.Decimal32Type, memory_pool: MemoryPool | None = None + size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @overload def nulls( - size: int, types: types.Decimal64Type, memory_pool: MemoryPool | None = None + size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @overload def nulls( - size: int, types: types.Decimal128Type, memory_pool: MemoryPool | None = None + size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None ) -> Decimal128Array: ... @overload def nulls( - size: int, types: types.Decimal256Type, memory_pool: MemoryPool | None = None + size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None ) -> Decimal256Array: ... @overload def nulls( - size: int, types: types.Date32Type, memory_pool: MemoryPool | None = None + size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None ) -> Date32Array: ... @overload def nulls( - size: int, types: types.Date64Type, memory_pool: MemoryPool | None = None + size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None ) -> Date64Array: ... @overload def nulls( - size: int, types: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None + size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None ) -> Time32Array[types._Time32Unit]: ... @overload def nulls( - size: int, types: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None + size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None ) -> Time64Array[types._Time64Unit]: ... @overload def nulls( size: int, - types: types.TimestampType[types._Unit, types._Tz], + type: types.TimestampType[types._Unit, types._Tz], memory_pool: MemoryPool | None = None, ) -> TimestampArray[types._Unit, types._Tz]: ... @overload def nulls( - size: int, types: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None + size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None ) -> DurationArray[types._Unit]: ... @overload def nulls( - size: int, types: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None + size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None ) -> MonthDayNanoIntervalArray: ... @overload def nulls( size: int, - types: types.BinaryType, + type: types.BinaryType, memory_pool: MemoryPool | None = None, ) -> BinaryArray: ... @overload def nulls( size: int, - types: types.LargeBinaryType, + type: types.LargeBinaryType, memory_pool: MemoryPool | None = None, ) -> LargeBinaryArray: ... @overload def nulls( size: int, - types: types.FixedSizeBinaryType, + type: types.FixedSizeBinaryType, memory_pool: MemoryPool | None = None, ) -> FixedSizeBinaryArray: ... @overload def nulls( size: int, - types: types.StringType, + type: types.StringType, memory_pool: MemoryPool | None = None, ) -> StringArray: ... @overload def nulls( size: int, - types: types.LargeStringType, + type: types.LargeStringType, memory_pool: MemoryPool | None = None, ) -> LargeStringArray: ... @overload def nulls( size: int, - types: types.BinaryViewType, + type: types.BinaryViewType, memory_pool: MemoryPool | None = None, ) -> BinaryViewArray: ... 
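The keyword rename from `types` to `type` in these nulls overloads (continued below) lines the stubs up with the runtime signature, so keyword calls check cleanly. A one-line usage sketch, assuming the stubs are installed:

    import pyarrow as pa

    a = pa.nulls(3, type=pa.int64())  # expected: Int64Array holding three nulls under these stubs
    print(a)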
@overload def nulls( size: int, - types: types.StringViewType, + type: types.StringViewType, memory_pool: MemoryPool | None = None, ) -> StringViewArray: ... @overload def nulls( size: int, - types: types.LargeListType[_DataTypeT], + type: types.LargeListType[_DataTypeT], memory_pool: MemoryPool | None = None, ) -> LargeListArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.ListViewType[_DataTypeT], + type: types.ListViewType[_DataTypeT], memory_pool: MemoryPool | None = None, ) -> ListViewArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.LargeListViewType[_DataTypeT], + type: types.LargeListViewType[_DataTypeT], memory_pool: MemoryPool | None = None, ) -> LargeListViewArray[_DataTypeT]: ... @overload def nulls( size: int, - types: types.FixedSizeListType[_DataTypeT, _Size], + type: types.FixedSizeListType[_DataTypeT, _Size], memory_pool: MemoryPool | None = None, ) -> FixedSizeListArray[_DataTypeT, _Size]: ... @overload def nulls( size: int, - types: types.ListType[_DataTypeT], + type: types.ListType[_DataTypeT], memory_pool: MemoryPool | None = None, ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... @overload def nulls( size: int, - types: types.StructType, + type: types.StructType, memory_pool: MemoryPool | None = None, ) -> StructArray: ... @overload def nulls( size: int, - types: types.MapType[_MapKeyT, _MapItemT], + type: types.MapType[_MapKeyT, _MapItemT], memory_pool: MemoryPool | None = None, ) -> MapArray[_MapKeyT, _MapItemT]: ... @overload def nulls( size: int, - types: types.DictionaryType[_IndexT, _BasicValueT], + type: types.DictionaryType[_IndexT, _BasicValueT], memory_pool: MemoryPool | None = None, ) -> DictionaryArray[_IndexT, _BasicValueT]: ... @overload def nulls( size: int, - types: types.RunEndEncodedType[_RunEndType, _BasicValueT], + type: types.RunEndEncodedType[_RunEndType, _BasicValueT], memory_pool: MemoryPool | None = None, ) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... @overload def nulls( size: int, - types: types.UnionType, + type: types.UnionType, memory_pool: MemoryPool | None = None, ) -> UnionArray: ... @overload def nulls( size: int, - types: types.FixedShapeTensorType[types._ValueT], + type: types.FixedShapeTensorType[types._ValueT], memory_pool: MemoryPool | None = None, ) -> FixedShapeTensorArray[Any]: ... @overload def nulls( size: int, - types: types.Bool8Type, + type: types.Bool8Type, memory_pool: MemoryPool | None = None, ) -> Bool8Array: ... @overload def nulls( size: int, - types: types.UuidType, + type: types.UuidType, memory_pool: MemoryPool | None = None, ) -> UuidArray[Any]: ... @overload def nulls( size: int, - types: types.JsonType, + type: types.JsonType, memory_pool: MemoryPool | None = None, ) -> JsonArray[Any]: ... @overload def nulls( size: int, - types: types.OpaqueType, + type: types.OpaqueType, memory_pool: MemoryPool | None = None, ) -> OpaqueArray[Any]: ... @overload def nulls( size: int, - types: types.ExtensionType, + type: types.ExtensionType, memory_pool: MemoryPool | None = None, ) -> ExtensionArray[Any]: ... @overload @@ -1633,7 +1633,7 @@ class UnionArray(Array[scalar.UnionScalar]): def offsets(self) -> Int32Array: ... 
@staticmethod def from_dense( - types: Int8Array, + type: Int8Array, value_offsets: Int32Array, children: NullableCollection[Array], field_names: list[str] | None = None, From ec06c82e936d7c34bcd8f0719906f111cd7e665c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 May 2025 11:42:40 +0800 Subject: [PATCH 199/231] [pre-commit.ci] pre-commit autoupdate (#232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.8 → v0.11.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.8...v0.11.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efe8a1ce63b..3ff731e6bd2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.8 + rev: v0.11.9 hooks: - id: ruff args: [--fix] From 2f5b7ac8b88f20c2d4a1d15c0e684fca8697463a Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 14 May 2025 11:47:37 +0800 Subject: [PATCH 200/231] release 19.4 (#235) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 93a2e46d8a2..ed53fa4dc61 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1281,8 +1281,8 @@ packages: requires_python: '>=3.9' - pypi: . name: pyarrow-stubs - version: '19.3' - sha256: 58bce9ff799d39ea412fbed515fab1f8d7d309f9705b3333d8336113cdb4dc98 + version: '19.4' + sha256: 338b3b60807fd5604cb6a1a9bf356415c1512eae1bc909708ef82d22bffa7b3a requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index a43dd3515d8..3dc1e444036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "19.3" +version = "19.4" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 9f7a6e69e67ffcfdd2b203ad74efbe545356fb58 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 16 Jun 2025 16:54:50 +0800 Subject: [PATCH 201/231] lint(pyright): disable reportUnknownMemberType (#239) --- pixi.lock | 20 ++++++++++++++------ pyproject.toml | 1 + 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pixi.lock b/pixi.lock index ed53fa4dc61..626e992f322 100644 --- a/pixi.lock +++ b/pixi.lock @@ -77,7 +77,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: . 
+ - pypi: ./ osx-64: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda @@ -140,7 +140,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: . + - pypi: ./ osx-arm64: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda @@ -203,7 +203,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: . + - pypi: ./ win-64: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda @@ -266,7 +266,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: . + - pypi: ./ packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 @@ -1279,10 +1279,18 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.9' -- pypi: . 
+- pypi: ./ name: pyarrow-stubs version: '19.4' - sha256: 338b3b60807fd5604cb6a1a9bf356415c1512eae1bc909708ef82d22bffa7b3a + sha256: 43d52f4a9c4c975ed69bd0328043b4aca400da21acdf572ef52060d2dcd02c6b + requires_dist: + - pyarrow>=19 + requires_python: '>=3.9,<4' + editable: true +- pypi: ./ + name: pyarrow-stubs + version: '19.4' + sha256: 493abbc3b9396572b19f85fa4fe562690bcba5cd5326cf86cd84b9f70cdb58d3 requires_dist: - pyarrow>=19 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 3dc1e444036..bc4792cdfef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,3 +95,4 @@ reportMissingParameterType = false reportMissingTypeStubs = false reportUnknownVariableType = false reportUnknownArgumentType = false +reportUnknownMemberType = false From 816d663e5ca92b1b47f6f73633bf1a3dfbf55051 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:23:55 +0800 Subject: [PATCH 202/231] [pre-commit.ci] pre-commit autoupdate (#236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.9 → v0.11.13](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.9...v0.11.13) - [github.com/RobertCraigie/pyright-python: v1.1.400 → v1.1.401](https://github.com/RobertCraigie/pyright-python/compare/v1.1.400...v1.1.401) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3ff731e6bd2..27de3b9376e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,13 +21,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.9 + rev: v0.11.13 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.400 + rev: v1.1.401 hooks: - id: pyright From b2bd7bd977344e52e121abb55d6e0d5882cc7c83 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Mon, 16 Jun 2025 18:29:04 +0800 Subject: [PATCH 203/231] feat: support pyarrow 20.0 (#240) --- pixi.lock | 46 ++++++++++--------------- pyarrow-stubs/__lib_pxi/array.pyi | 18 ++++++++++ pyarrow-stubs/__lib_pxi/memory.pyi | 3 ++ pyarrow-stubs/__lib_pxi/pandas_shim.pyi | 1 + pyarrow-stubs/__lib_pxi/scalar.pyi | 28 +++++++++++++-- pyarrow-stubs/__lib_pxi/table.pyi | 10 ++++-- pyarrow-stubs/_azurefs.pyi | 1 + pyarrow-stubs/_compute.pyi | 27 +++++++++++++++ pyarrow-stubs/_dataset.pyi | 16 +++++++++ pyarrow-stubs/_json.pyi | 10 +++++- pyarrow-stubs/compute.pyi | 5 +++ pyarrow-stubs/json.pyi | 4 +-- pyarrow-stubs/parquet/core.pyi | 3 -- pyproject.toml | 4 +-- 14 files changed, 137 insertions(+), 39 deletions(-) diff --git a/pixi.lock b/pixi.lock index 626e992f322..0b0727d2698 100644 --- a/pixi.lock +++ b/pixi.lock @@ -60,7 +60,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl @@ -123,7 +123,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl @@ -186,7 +186,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl @@ -249,7 +249,7 @@ environments: - pypi: 
https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl @@ -1235,10 +1235,10 @@ packages: sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 requires_dist: - pytest ; extra == 'tests' -- pypi: https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl +- pypi: https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl name: pyarrow - version: 19.0.1 - sha256: 7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00 + version: 20.0.0 + sha256: a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62 requires_dist: - pytest ; extra == 'test' - hypothesis ; extra == 'test' @@ -1246,10 +1246,10 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl +- pypi: https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl name: pyarrow - version: 19.0.1 - sha256: cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90 + version: 20.0.0 + sha256: 24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0 requires_dist: - pytest ; extra == 'test' - hypothesis ; extra == 'test' @@ -1257,10 +1257,10 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl +- pypi: https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl name: pyarrow - version: 19.0.1 - sha256: 49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6 + version: 20.0.0 + sha256: 3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc requires_dist: - pytest ; extra == 'test' - hypothesis ; extra == 'test' @@ -1268,10 +1268,10 @@ packages: - pytz ; extra == 'test' - pandas ; extra == 'test' requires_python: '>=3.9' -- pypi: 
https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl +- pypi: https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl name: pyarrow - version: 19.0.1 - sha256: 008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466 + version: 20.0.0 + sha256: 95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb requires_dist: - pytest ; extra == 'test' - hypothesis ; extra == 'test' @@ -1281,18 +1281,10 @@ packages: requires_python: '>=3.9' - pypi: ./ name: pyarrow-stubs - version: '19.4' - sha256: 43d52f4a9c4c975ed69bd0328043b4aca400da21acdf572ef52060d2dcd02c6b + version: 20.0.0b0 + sha256: 2a604d3e2c38a73f0276bef7204d9fa8d07fc66f4c1fbcf2185b47b48feed1cb requires_dist: - - pyarrow>=19 - requires_python: '>=3.9,<4' - editable: true -- pypi: ./ - name: pyarrow-stubs - version: '19.4' - sha256: 493abbc3b9396572b19f85fa4fe562690bcba5cd5326cf86cd84b9f70cdb58d3 - requires_dist: - - pyarrow>=19 + - pyarrow>=20 requires_python: '>=3.9,<4' editable: true - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 3f1b22188e2..0da18a5b4d6 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1202,6 +1202,20 @@ def repeat( ) -> ExtensionArray[Any]: ... def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: ... +class ArrayStatistics(_Weakrefable): + @property + def null_count(self) -> int: ... + @property + def distinct_count(self) -> int: ... + @property + def min(self) -> Any: ... + @property + def is_min_exact(self) -> bool: ... + @property + def max(self) -> Any: ... + @property + def is_max_exact(self) -> bool: ... + _ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): @@ -1332,6 +1346,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: ... def to_pylist( self: Array[Scalar[_BasicDataType[_AsPyType]]], + *, + map_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[_AsPyType | None]: ... tolist = to_pylist def validate(self, *, full: bool = False) -> None: ... @@ -1357,6 +1373,8 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): def device_type(self) -> DeviceAllocationType: ... @property def is_cpu(self) -> bool: ... + @property + def statistics(self) -> ArrayStatistics | None: ... class NullArray(Array[scalar.NullScalar]): ... diff --git a/pyarrow-stubs/__lib_pxi/memory.pyi b/pyarrow-stubs/__lib_pxi/memory.pyi index cf98e88c9ae..755e689b5f0 100644 --- a/pyarrow-stubs/__lib_pxi/memory.pyi +++ b/pyarrow-stubs/__lib_pxi/memory.pyi @@ -3,7 +3,10 @@ from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): def release_unused(self) -> None: ... def bytes_allocated(self) -> int: ... + def total_bytes_allocated(self) -> int: ... def max_memory(self) -> int | None: ... + def num_allocations(self) -> int: ... + def print_stats(self) -> None: ... @property def backend_name(self) -> str: ... 
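The additions above mirror introspection APIs from recent pyarrow: Array.statistics typed as ArrayStatistics | None, plus the extra MemoryPool counters. A minimal sketch, assuming a pyarrow build that ships these methods (the 20.x line this patch targets):

    import pyarrow as pa

    pool = pa.default_memory_pool()
    arr = pa.array([1, 2, 3])
    # cumulative counters added alongside bytes_allocated()
    print(pool.bytes_allocated(), pool.total_bytes_allocated(), pool.num_allocations())
    # statistics is Optional: plain in-memory arrays may simply report None
    print(arr.statistics)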
diff --git a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi index 1eb7cdd9687..0e80fae4ebf 100644 --- a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi +++ b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi @@ -25,6 +25,7 @@ class _PandasAPIShim: def version(self) -> str: ... def is_v1(self) -> bool: ... def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... def is_ge_v3(self) -> bool: ... @property def categorical_type(self) -> type[pd.Categorical]: ... diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 98a33dbd1f9..e9f4ca02e27 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -58,10 +58,16 @@ class Scalar(_Weakrefable, Generic[_DataTypeT]): def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... @overload - def as_py(self: Scalar[types._BasicDataType[_AsPyType]]) -> _AsPyType: ... + def as_py( + self: Scalar[types._BasicDataType[_AsPyType]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> _AsPyType: ... @overload def as_py( self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[_AsPyType]: ... @overload def as_py( @@ -70,37 +76,55 @@ class Scalar(_Weakrefable, Generic[_DataTypeT]): types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] ] ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[dict[int, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[ types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[dict[Any, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[dict[int, Any]]: ... @overload def as_py( self: Scalar[types.StructType], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[dict[str, Any]]: ... @overload def as_py( self: Scalar[ types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[tuple[Any, _AsPyTypeV]]: ... @overload def as_py( self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[tuple[_AsPyTypeK, Any]]: ... @overload - def as_py(self: Scalar[Any]) -> Any: ... + def as_py( + self: Scalar[Any], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> Any: ... _NULL: TypeAlias = None NA = _NULL diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index a80241f44b9..67e19286add 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -375,6 +375,8 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def __iter__(self) -> Iterator[_ScalarT]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> list[_AsPyType | None]: ... def __arrow_c_stream__(self, requested_schema=None) -> Any: ... 
@classmethod @@ -677,8 +679,12 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def filter( self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" ) -> Self: ... - def to_pydict(self) -> dict[str, list]: ... - def to_pylist(self) -> list[dict[str, Any]]: ... + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list]: ... + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: ... def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... def remove_column(self, i: int) -> Self: ... def drop_columns(self, columns: str | list[str]) -> Self: ... diff --git a/pyarrow-stubs/_azurefs.pyi b/pyarrow-stubs/_azurefs.pyi index acce68a29d1..ee4529e8f01 100644 --- a/pyarrow-stubs/_azurefs.pyi +++ b/pyarrow-stubs/_azurefs.pyi @@ -11,4 +11,5 @@ class AzureFileSystem(FileSystem): dfs_storage_authority: str | None = None, blob_storage_schema: Literal["http", "https"] = "https", dfs_storage_schema: Literal["http", "https"] = "https", + sas_token: str | None = None, ) -> None: ... diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index 73f6a1af27b..dce71343ef4 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -120,6 +120,9 @@ class ElementWiseAggregateOptions(FunctionOptions): class ExtractRegexOptions(FunctionOptions): def __init__(self, pattern: str) -> None: ... +class ExtractRegexSpanOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + class FilterOptions(FunctionOptions): def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... @@ -179,6 +182,9 @@ class PairwiseOptions(FunctionOptions): class PartitionNthOptions(FunctionOptions): def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... +class WinsorizeOptions(FunctionOptions): + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + class QuantileOptions(FunctionOptions): def __init__( self, @@ -201,6 +207,22 @@ class RankOptions(FunctionOptions): tiebreaker: Literal["min", "max", "first", "dense"] = "first", ) -> None: ... +class RankQuantileOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + +class PivotWiderOptions(FunctionOptions): + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + class ReplaceSliceOptions(FunctionOptions): def __init__(self, start: int, stop: int, replacement: str) -> None: ... @@ -324,6 +346,11 @@ class Utf8NormalizeOptions(FunctionOptions): class VarianceOptions(FunctionOptions): def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... +class SkewOptions(FunctionOptions): + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... 
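The option classes added above correspond to compute kernels introduced around Arrow 20. A rough sketch of passing them through pyarrow.compute.call_function; the kernel names ("winsorize", "skew", "rank_quantile") are assumptions about the upstream function registry, not something these stubs guarantee:

    import pyarrow as pa
    import pyarrow.compute as pc

    x = pa.array([1.0, 2.0, 3.0, 4.0, 100.0])

    # Clamp values outside the given quantile limits (assumed kernel name "winsorize").
    pc.call_function("winsorize", [x], pc.WinsorizeOptions(0.1, 0.9))

    # Sample skewness with the new SkewOptions (assumed kernel name "skew").
    pc.call_function("skew", [x], pc.SkewOptions(skip_nulls=True, biased=True))

    # Quantile ranks with RankQuantileOptions (assumed kernel name "rank_quantile").
    pc.call_function("rank_quantile", [x], pc.RankQuantileOptions(sort_keys="ascending"))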
+ class WeekOptions(FunctionOptions): def __init__( self, diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index ebc450f4886..593c0abc64b 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -39,6 +39,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... def to_batches( @@ -50,6 +51,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Iterator[lib.RecordBatch]: ... def to_table( @@ -61,6 +63,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def take( @@ -73,6 +76,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def head( @@ -85,6 +89,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def count_rows( @@ -96,6 +101,7 @@ class Dataset(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> int: ... @property @@ -196,6 +202,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... def to_batches( @@ -207,6 +214,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Iterator[lib.RecordBatch]: ... def to_table( @@ -218,6 +226,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def take( @@ -230,6 +239,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def head( @@ -242,6 +252,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> lib.Table: ... def count_rows( @@ -253,6 +264,7 @@ class Fragment(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> int: ... 
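A minimal sketch of where the new cache_metadata flag (threaded through every Dataset and Fragment scan entry point above) is passed; the dataset path is hypothetical, and the flag, which defaults to True, controls whether file metadata may be cached and reused between scans:

    import pyarrow.dataset as ds

    dataset = ds.dataset("data/", format="parquet")  # hypothetical path

    # Disable metadata caching for a one-shot scan; keep the default for repeated scans.
    table = dataset.to_table(cache_metadata=False)
    n_rows = dataset.count_rows(cache_metadata=False)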
@@ -450,6 +462,7 @@ class Scanner(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... @staticmethod @@ -464,6 +477,7 @@ class Scanner(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... @overload @@ -479,6 +493,7 @@ class Scanner(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... @overload @@ -493,6 +508,7 @@ class Scanner(lib._Weakrefable): fragment_readahead: int = 4, fragment_scan_options: FragmentScanOptions | None = None, use_threads: bool = True, + cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... @property diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index 88d5a59d232..ce5b3a103dc 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -2,7 +2,7 @@ from typing import IO, Any, Literal from _typeshed import StrPath -from .lib import MemoryPool, Schema, Table, _Weakrefable +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable class ReadOptions(_Weakrefable): use_threads: bool @@ -22,9 +22,17 @@ class ParseOptions(_Weakrefable): ): ... def equals(self, other: ParseOptions) -> bool: ... +class JSONStreamingReader(RecordBatchReader): ... + def read_json( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, ) -> Table: ... +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: ... 
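open_json mirrors pyarrow.csv.open_csv: unlike read_json, which materialises a whole Table, it returns a streaming reader (a RecordBatchReader subclass). A small sketch with a hypothetical file name:

    from pyarrow import json as pj

    # Stream newline-delimited JSON as record batches instead of loading it eagerly.
    with pj.open_json("events.jsonl") as reader:  # hypothetical file
        for batch in reader:
            print(batch.num_rows)

    # read_json remains the eager counterpart returning a Table.
    table = pj.read_json("events.jsonl")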
diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index db149f3b56f..3eb0aec9a2d 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -16,6 +16,7 @@ from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregate # Expressions from pyarrow._compute import Expression as Expression from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions from pyarrow._compute import FilterOptions as FilterOptions from pyarrow._compute import Function as Function from pyarrow._compute import FunctionOptions as FunctionOptions @@ -35,9 +36,11 @@ from pyarrow._compute import NullOptions as NullOptions from pyarrow._compute import PadOptions as PadOptions from pyarrow._compute import PairwiseOptions as PairwiseOptions from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions from pyarrow._compute import QuantileOptions as QuantileOptions from pyarrow._compute import RandomOptions as RandomOptions from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions @@ -52,6 +55,7 @@ from pyarrow._compute import ScalarFunction as ScalarFunction from pyarrow._compute import ScalarKernel as ScalarKernel from pyarrow._compute import SelectKOptions as SelectKOptions from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions from pyarrow._compute import SliceOptions as SliceOptions from pyarrow._compute import SortOptions as SortOptions from pyarrow._compute import SplitOptions as SplitOptions @@ -68,6 +72,7 @@ from pyarrow._compute import VarianceOptions as VarianceOptions from pyarrow._compute import VectorFunction as VectorFunction from pyarrow._compute import VectorKernel as VectorKernel from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions # Functions from pyarrow._compute import call_function as call_function diff --git a/pyarrow-stubs/json.pyi b/pyarrow-stubs/json.pyi index 0a1957e18af..db1d35e0b8b 100644 --- a/pyarrow-stubs/json.pyi +++ b/pyarrow-stubs/json.pyi @@ -1,3 +1,3 @@ -from pyarrow._json import ParseOptions, ReadOptions, read_json +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json -__all__ = ["ParseOptions", "ReadOptions", "read_json"] +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index d3ac8f9f976..8d665416066 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -198,7 +198,6 @@ class ParquetDataset: thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, page_checksum_verification: bool = False, - use_legacy_dataset: bool | None = None, ): ... def equals(self, other: ParquetDataset) -> bool: ... 
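With the re-exports above, the new names resolve from their usual public modules; a quick import check (assuming pyarrow >= 20 with pyarrow-stubs 20.0.0b0):

    from pyarrow.compute import (
        ExtractRegexSpanOptions,
        PivotWiderOptions,
        RankQuantileOptions,
        SkewOptions,
        WinsorizeOptions,
    )
    from pyarrow.json import open_json, read_json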
@property @@ -232,7 +231,6 @@ def read_table( partitioning: str | list[str] | Partitioning | None = "hive", filesystem: SupportedFileSystem | None = None, filters: Expression | FilterTuple | list[FilterTuple] | None = None, - use_legacy_dataset: bool | None = None, ignore_prefixes: list[str] | None = None, pre_buffer: bool = True, coerce_int96_timestamp_unit: str | None = None, @@ -278,7 +276,6 @@ def write_to_dataset( root_path: str | Path, partition_cols: list[str] | None = None, filesystem: SupportedFileSystem | None = None, - use_legacy_dataset: bool | None = None, schema: Schema | None = None, partitioning: Partitioning | list[str] | None = None, basename_template: str | None = None, diff --git a/pyproject.toml b/pyproject.toml index bc4792cdfef..e88896cffcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "19.4" +version = "20.0.0b0" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" @@ -18,7 +18,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] requires-python = ">=3.9,<4" -dependencies = ["pyarrow >=19"] +dependencies = ["pyarrow >=20"] [project.urls] homepage = "https://github.com/zen-xu/pyarrow-stubs" From 8aed7c8cfa5c2f129d95e1f44d485d3e051bf07c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:09:22 +0800 Subject: [PATCH 204/231] [pre-commit.ci] pre-commit autoupdate (#241) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/RobertCraigie/pyright-python: v1.1.401 → v1.1.402](https://github.com/RobertCraigie/pyright-python/compare/v1.1.401...v1.1.402) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 27de3b9376e..f0b1130198d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,6 +28,6 @@ repos: - id: ruff-format - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.401 + rev: v1.1.402 hooks: - id: pyright From b95cfafe77e6082e61b71c530766e8798dfcffe0 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 18 Jun 2025 10:13:03 +0800 Subject: [PATCH 205/231] support docstring (#242) * doc: complete tensor doc * doc: complete table doc * doc: complete scalar doc * doc: complete orc doc * doc: complete memory doc * doc: complete lib doc * doc: complete json doc * doc: complete hdfs doc * doc: complete gcsfs doc * doc: complete fs doc * doc: complete flight doc * doc: complete dataset doc * doc: complete dataset parquet doc * doc: complete dataset parquet encryption doc * doc: complete cuda doc * doc: complete csv doc * doc: complete azurefs doc * doc: complete core doc * doc: complete interchange doc * doc: complete array doc * doc: complete builder doc * doc: complete device doc * doc: complete io doc * doc: complete ipc doc * doc: complete types doc * mark deprecated apis * doc: complete _compute doc * doc: complete compute doc * doc: update compute doc * lint code --- pixi.lock | 2 +- pyarrow-stubs/__lib_pxi/array.pyi | 2584 ++++++++- pyarrow-stubs/__lib_pxi/builder.pyi | 76 +- pyarrow-stubs/__lib_pxi/device.pyi | 63 +- pyarrow-stubs/__lib_pxi/io.pyi | 1201 +++- pyarrow-stubs/__lib_pxi/ipc.pyi | 573 +- pyarrow-stubs/__lib_pxi/memory.pyi | 167 +- 
pyarrow-stubs/__lib_pxi/scalar.pyi | 150 +- pyarrow-stubs/__lib_pxi/table.pyi | 5010 ++++++++++++++++- pyarrow-stubs/__lib_pxi/tensor.pyi | 620 +- pyarrow-stubs/__lib_pxi/types.pyi | 4028 ++++++++++++- pyarrow-stubs/_azurefs.pyi | 59 + pyarrow-stubs/_compute.pyi | 1322 ++++- pyarrow-stubs/_csv.pyi | 547 +- pyarrow-stubs/_cuda.pyi | 532 +- pyarrow-stubs/_dataset.pyi | 1855 +++++- pyarrow-stubs/_dataset_parquet.pyi | 197 +- pyarrow-stubs/_dataset_parquet_encryption.pyi | 52 + pyarrow-stubs/_flight.pyi | 1132 +++- pyarrow-stubs/_fs.pyi | 930 ++- pyarrow-stubs/_gcsfs.pyi | 63 +- pyarrow-stubs/_hdfs.pyi | 58 +- pyarrow-stubs/_json.pyi | 141 +- pyarrow-stubs/compute.pyi | 4991 +++++++++++++++- pyarrow-stubs/interchange/buffer.pyi | 44 +- pyarrow-stubs/interchange/column.pyi | 212 +- pyarrow-stubs/interchange/dataframe.pyi | 96 +- pyarrow-stubs/interchange/from_dataframe.pyi | 215 +- pyarrow-stubs/lib.pyi | 42 +- pyarrow-stubs/orc.pyi | 226 +- pyarrow-stubs/parquet/core.pyi | 1813 +++++- pyproject.toml | 1 + 32 files changed, 27964 insertions(+), 1038 deletions(-) diff --git a/pixi.lock b/pixi.lock index 0b0727d2698..8413cfd18f8 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1282,7 +1282,7 @@ packages: - pypi: ./ name: pyarrow-stubs version: 20.0.0b0 - sha256: 2a604d3e2c38a73f0276bef7204d9fa8d07fc66f4c1fbcf2185b47b48feed1cb + sha256: 4ae4d2484afd306b8d131ce4bed1faa48493d8ace8b43b731b811b4e4e6bd2e2 requires_dist: - pyarrow>=20 requires_python: '>=3.9,<4' diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index 0da18a5b4d6..a6152610241 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -40,6 +40,7 @@ from pyarrow.lib import ( Tensor, _Weakrefable, ) +from typing_extensions import deprecated from . import scalar, types from .device import DeviceAllocationType @@ -527,6 +528,110 @@ def array( safe: bool = True, memory_pool: MemoryPool | None = None, ) -> Array[Scalar[_DataTypeT]]: ... +def array(*args, **kawrgs): + """ + Create pyarrow.Array instance from a Python object. + + Parameters + ---------- + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask tasks precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type. + + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + @overload def asarray(values: NullableCollection[bool]) -> BooleanArray: ... @overload @@ -722,6 +827,25 @@ def asarray( values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, type: _DataTypeT, ) -> Array[Scalar[_DataTypeT]]: ... +def asarray(*args, **kwargs): + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + @overload def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... @overload @@ -958,6 +1082,40 @@ def nulls( type: types.ExtensionType, memory_pool: MemoryPool | None = None, ) -> ExtensionArray[Any]: ... +def nulls(*args, **kwargs): + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. 
+ + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + @overload def repeat( value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None @@ -1200,21 +1358,131 @@ def repeat( size: int, memory_pool: MemoryPool | None = None, ) -> ExtensionArray[Any]: ... -def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: ... +def repeat(*args, **kwargs): + """ + Create an Array instance whose slots are the given scalar. + + Parameters + ---------- + value : Scalar-like object + Either a pyarrow.Scalar or any python object coercible to a Scalar. + size : int + Number of times to repeat the scalar in the output Array. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.repeat(10, 3) + + [ + 10, + 10, + 10 + ] + + >>> pa.repeat([1, 2], 2) + + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + + >>> pa.repeat("string", 3) + + [ + "string", + "string", + "string" + ] + + >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 1 + ] + -- child 1 type: list + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + """ + +def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: + """ + Attempt to infer Arrow data type that can hold the passed Python + sequence type in an Array object + + Parameters + ---------- + values : array-like + Sequence to infer type from. + mask : ndarray (bool type), optional + Optional exclusion mask where True marks null, False non-null. + from_pandas : bool, default False + Use pandas's NA/null sentinel values for type inference. + + Returns + ------- + type : DataType + """ class ArrayStatistics(_Weakrefable): + """ + The class for statistics of an array. + """ @property - def null_count(self) -> int: ... + def null_count(self) -> int: + """ + The number of nulls. + """ @property - def distinct_count(self) -> int: ... + def distinct_count(self) -> int: + """ + The number of distinct values. + """ @property - def min(self) -> Any: ... + def min(self) -> Any: + """ + The minimum value. + """ @property - def is_min_exact(self) -> bool: ... + def is_min_exact(self) -> bool: + """ + Whether the minimum value is an exact value or not. + """ @property - def max(self) -> Any: ... + def max(self) -> Any: + """ + The maximum value. + """ + @property - def is_max_exact(self) -> bool: ... + def is_max_exact(self) -> bool: + """ + Whether the maximum value is an exact value or not. + """ _ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) @@ -1237,26 +1505,288 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, coerce_temporal_nanoseconds: bool = False, - ) -> _ConvertAs: ... + ) -> _ConvertAs: + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + categories : list, default empty + List of fields that should be returned as pandas.Categorical. Only + applies to table-like data structures. 
+ strings_to_categorical : bool, default False + Encode string (UTF8) and binary types to pandas.Categorical. + zero_copy_only : bool, default False + Raise an ArrowException if this function call would require copying + the underlying data. + integer_object_nulls : bool, default False + Cast integers with nulls to objects + date_as_object : bool, default True + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. + timestamp_as_object : bool, default False + Cast non-nanosecond timestamps (np.datetime64) to objects. This is + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. + use_threads : bool, default True + Whether to parallelize the conversion using multiple threads. + deduplicate_objects : bool, default True + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata : bool, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + safe : bool, default True + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. + split_blocks : bool, default False + If True, generate one internal "block" for each column when + creating a pandas.DataFrame from a RecordBatch or Table. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct : bool, default False + EXPERIMENTAL: If True, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling to_pandas with this option it will crash your + program. + + Note that you may not see always memory usage improvements. For + example, if multiple columns share an underlying allocation, + memory can't be freed until all columns are converted. + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. + types_mapper : function, default None + A function mapping a pyarrow DataType to a pandas ExtensionDtype. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of pandas_metadata in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas ExtensionDtype or ``None`` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass ``dict.get`` as function. 
+ coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). + + Returns + ------- + pandas.Series or pandas.DataFrame depending on type of object + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + + Convert a Table to pandas DataFrame: + + >>> table = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> table.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(table.to_pandas(), pd.DataFrame) + True + + Convert a RecordBatch to pandas DataFrame: + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(batch.to_pandas(), pd.DataFrame) + True + + Convert a Chunked Array to pandas Series: + + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_pandas() + 0 2 + 1 2 + 2 4 + 3 4 + 4 5 + 5 100 + dtype: int64 + >>> isinstance(n_legs.to_pandas(), pd.Series) + True + """ _CastAs = TypeVar("_CastAs", bound=DataType) _ScalarT = TypeVar("_ScalarT", bound=Scalar) class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): - def diff(self, other: Self) -> str: ... + """ + The base class for all Arrow arrays. + """ + + def diff(self, other: Self) -> str: + """ + Compare contents of this array against another one. + + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. + + Examples + -------- + >>> import pyarrow as pa + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) # doctest: +SKIP + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + """ def cast( self, target_type: _CastAs, safe: bool = True, options: CastOptions | None = None, memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_CastAs]]: ... - def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... - def sum(self, **kwargs) -> _ScalarT: ... + ) -> Array[Scalar[_CastAs]]: + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. 
+ + Returns + ------- + cast : Array + """ + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: + """ + Return zero-copy "view" of array as another data type. + + The data types must have compatible columnar buffer layouts + + Parameters + ---------- + target_type : DataType + Type to construct view as. + + Returns + ------- + view : Array + """ + def sum(self, **kwargs) -> _ScalarT: + """ + Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. + """ @property def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... - def unique(self) -> Self: ... - def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + def unique(self) -> Self: + """ + Compute distinct elements in array. + + Returns + ------- + unique : Array + An array of the same data type, with deduplicated elements. + """ + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : DictionaryArray + A dictionary-encoded version of this array. + """ + def value_count(self) -> StructArray: + """ + Compute counts of unique elements in array. + + Returns + ------- + StructArray + An array of structs + """ @overload @staticmethod def from_pandas( @@ -1277,6 +1807,40 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): memory_pool: MemoryPool | None = None, ) -> Array[Scalar]: ... @staticmethod + def from_pandas(*args, **kwargs): + """ + Convert pandas.Series to an Arrow Array. + + This method uses Pandas semantics about what values indicate + nulls. See pyarrow.array for more general conversion from arrays or + sequences to Arrow arrays. + + Parameters + ---------- + obj : ndarray, pandas.Series, array-like + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred + from the data. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + ChunkedArray is returned if object data overflows binary buffer. + """ + @staticmethod def from_buffers( type: _DataTypeT, length: int, @@ -1284,12 +1848,63 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): null_count: int = -1, offset=0, children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, - ) -> Array[Scalar[_DataTypeT]]: ... + ) -> Array[Scalar[_DataTypeT]]: + """ + Construct an Array from a sequence of buffers. + + The concrete type returned depends on the datatype. + + Parameters + ---------- + type : DataType + The value type of the array. + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing this array. 
+ null_count : int, default -1 + The number of null entries in the array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array], default None + Nested type children with length matching type.num_fields. + + Returns + ------- + array : Array + """ @property def null_count(self) -> int: ... @property - def nbytes(self) -> int: ... - def get_total_buffer_size(self) -> int: ... + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the array. + + In other words, the sum of bytes from all buffer + ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + """ def __sizeof__(self) -> int: ... def __iter__(self) -> Iterator[_ScalarT]: ... def to_string( @@ -1300,29 +1915,156 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): window: int = 10, container_window: int = 2, skip_new_lines: bool = False, - ) -> str: ... + ) -> str: + """ + Render a "pretty-printed" string representation of the Array. + + Note: for data on a non-CPU device, the full array is copied to CPU + memory. + + Parameters + ---------- + indent : int, default 2 + How much to indent the internal items in the string to + the right, by default ``2``. + top_level_indent : int, default 0 + How much to indent right the entire content of the array, + by default ``0``. + window : int + How many primitive items to preview at the begin and end + of the array when the array is bigger than the window. + The other items will be ellipsed. + container_window : int + How many container items (such as a list in a list array) + to preview at the begin and end of the array when the array + is bigger than the window. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + """ format = to_string def equals(self, other: Self) -> bool: ... def __len__(self) -> int: ... - def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... - def is_nan(self) -> BooleanArray: ... - def is_valid(self) -> BooleanArray: ... + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: + """ + Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array + """ + def is_nan(self) -> BooleanArray: + """ + Return BooleanArray indicating the NaN values. + + Returns + ------- + array : boolean Array + """ + def is_valid(self) -> BooleanArray: + """ + Return BooleanArray indicating the non-null values. + """ def fill_null( self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType - ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... 
+ ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: + """ + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. + """ @overload def __getitem__(self, key: int) -> _ScalarT: ... @overload def __getitem__(self, key: slice) -> Self: ... - def slice(self, offset: int = 0, length: int | None = None) -> Self: ... - def take(self, indices: Indices) -> Self: ... - def drop_null(self) -> Self: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or Array (slice) + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this array. + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice. + length : int, default None + Length of slice (default is until end of Array starting from + offset). + + Returns + ------- + sliced : Array + An array with the same datatype, containing the sliced values. + """ + def take(self, indices: Indices) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. + """ + def drop_null(self) -> Self: + """ + Remove missing values from an array. + """ def filter( self, mask: Mask, *, null_selection_behavior: Literal["drop", "emit_null"] = "drop", - ) -> Self: ... + ) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. + """ @overload def index( self: Array[_ScalarT], @@ -1341,40 +2083,311 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): *, memory_pool: MemoryPool | None = None, ) -> scalar.Int64Scalar: ... - def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the Array + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. 
+ As allowed by :class:`SortOptions` + + Returns + ------- + result : Array + """ def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: + """ + Return a NumPy view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for primitive arrays with the same memory layout as NumPy + (i.e. integers, floating point, ..) and without any nulls. + + For the extension arrays, this method simply delegates to the + underlying storage array. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ def to_pylist( self: Array[Scalar[_BasicDataType[_AsPyType]]], *, map_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: ... + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + lst : list + """ tolist = to_pylist - def validate(self, *, full: bool = False) -> None: ... + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ @property - def offset(self) -> int: ... - def buffers(self) -> list[Buffer | None]: ... - def copy_to(self, destination: MemoryManager | Device) -> Self: ... - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + def offset(self) -> int: + """ + A relative position into another array's data. + + The purpose is to enable zero-copy slicing. This value defaults to zero + but must be applied on all operations with the physical storage + buffers. + """ + def buffers(self) -> list[Buffer | None]: + """ + Return a list of Buffer objects pointing to this array's physical + storage. + + To correctly interpret these buffers, you need to also apply the offset + multiplied with the size of the stored data type. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Construct a copy of the array with all buffers on destination + device. 
+ + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + Array + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ @classmethod - def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... - def __arrow_c_array__(self, requested_schema=None) -> Any: ... + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: + """ + Import Array from a C ArrowArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ @classmethod def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ @classmethod - def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... - def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: + """ + Import Array from a C ArrowDeviceArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. 
+ """ + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ @classmethod def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def __dlpack__(self, stream: int | None = None) -> Any: ... - def __dlpack_device__(self) -> tuple[int, int]: ... + def __dlpack__(self, stream: int | None = None) -> Any: + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + def __dlpack_device__(self) -> tuple[int, int]: + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ @property - def device_type(self) -> DeviceAllocationType: ... + def device_type(self) -> DeviceAllocationType: + """ + The device type where the array resides. + + Returns + ------- + DeviceAllocationType + """ + @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether the array is CPU-accessible. + """ @property - def statistics(self) -> ArrayStatistics | None: ... + def statistics(self) -> ArrayStatistics | None: + """ + Statistics of the array. + """ class NullArray(Array[scalar.NullScalar]): ... @@ -1494,10 +2507,153 @@ class ListArray(BaseListArray[_ScalarT]): pool: MemoryPool | None = None, mask: Mask | None = None, ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListArray from arrays of int32 offsets and values. + + Parameters + ---------- + offsets : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : ListArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ @property - def values(self) -> Array: ... 
+ def values(self) -> Array: + """ + Return the underlying array of values which backs the ListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + ListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + """ @property - def offsets(self) -> Int32Array: ... + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, 5]]) + >>> array.offsets + + [ + 0, + 2, + 2, + 5 + ] + """ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @overload @@ -1522,10 +2678,109 @@ class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): pool: MemoryPool | None = None, mask: Mask | None = None, ) -> LargeListArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : LargeListArray + """ @property - def values(self) -> Array: ... + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from the sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, 4, None, 6]], + ... type=pa.large_list(pa.int32()), + ... ) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ @property - def offsets(self) -> Int64Array: ... 
+ def offsets(self) -> Int64Array: + """ + Return the list offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + """ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): @overload @@ -1550,12 +2805,188 @@ class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): pool: MemoryPool | None = None, mask: Mask | None = None, ) -> ListViewArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ @property - def values(self) -> Array: ... + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ @property - def offsets(self) -> Int32Array: ... + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. 
+ + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ @property - def sizes(self) -> Int32Array: ... + def sizes(self) -> Int32Array: + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): @overload @@ -1580,12 +3011,195 @@ class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): pool: MemoryPool | None = None, mask: Mask | None = None, ) -> LargeListViewArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ @property - def values(self) -> Array: ... + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... 
+ + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ @property - def offsets(self) -> Int64Array: ... + def offsets(self) -> Int64Array: + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ @property - def sizes(self) -> Int64Array: ... + def sizes(self) -> Int64Array: + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): @overload @@ -1607,8 +3221,100 @@ class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _S type: None = None, mask: Mask | None = None, ) -> FixedSizeListArray[_DataTypeT, _Size]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct FixedSizeListArray from array of values and a list length. + + Parameters + ---------- + values : Array (any type) + list_size : int + The fixed length of the lists. + type : DataType, optional + If not specified, a default ListType with the values' type and + `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + + Returns + ------- + FixedSizeListArray + + Examples + -------- + + Create from a values array and a list size: + + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + + Or create from a values array, list size and matching type: + + >>> typ = pa.list_(pa.field("values", pa.int64()), 2) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ @property - def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: ... + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: + """ + Return the underlying array of values which backs the + FixedSizeListArray. + + Note even null elements are included. + + Compare with :meth:`flatten`, which returns only the non-null + sub-list values. + + Returns + ------- + values : Array + + See Also + -------- + FixedSizeListArray.flatten : ... 
+ + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) + >>> array.values + + [ + 1, + 2, + null, + null, + 3, + null + ] + + """ _MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) _MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) @@ -1637,18 +3343,153 @@ class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): pool: MemoryPool | None = None, mask: Mask | None = None, ) -> MapArray[_MapKeyT, _MapItemT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """ + Construct MapArray from arrays of int32 offsets and key, item arrays. + + Parameters + ---------- + offsets : array-like or sequence (int32 type) + keys : array-like or sequence (any type) + items : array-like or sequence (any type) + type : DataType, optional + If not specified, a default MapArray with the keys' and items' type is used. + pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + map_array : MapArray + + Examples + -------- + First, let's understand the structure of our dataset when viewed in a rectangular data model. + The total of 5 respondents answered the question "How much did you like the movie x?". + The value -1 in the integer array means that the value is missing. The boolean array + represents the null bitmask corresponding to the missing values in the integer array. + + >>> import pyarrow as pa + >>> movies_rectangular = np.ma.masked_array( + ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ], + ... ) + + To represent the same data with the MapArray and from_arrays, the data is + formed like this: + + >>> offsets = [ + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end + ... ] + >>> movies = [ + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", + ... "Meet the Parents", + ... "Superman", # -- row 2 + ... "Meet the Parents", + ... "Superman", # ----------------- row 3 + ... ] + >>> likings = [ + ... 10, # -------- row 1 + ... 8, + ... 4, + ... 5, # --- row 2 + ... 10, + ... 3, # ------ row 3 + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 [] + 4 [] + dtype: object + + If the data in the empty rows needs to be marked as missing, it's possible + to do so by modifying the offsets argument, so that we specify `None` as + the starting positions of the rows we want marked as missing. The end row + offset still has to refer to the existing value from keys (and values): + + >>> offsets = [ + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 None + 4 None + dtype: object + """ @property - def keys(self) -> Array: ... 
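A short usage sketch for MapArray: building one from Python key/value pairs with an explicit map type and reading the flattened keys and items properties documented just below:

    import pyarrow as pa

    arr = pa.array(
        [[("a", 1), ("b", 2)], [("c", 3)]],
        type=pa.map_(pa.string(), pa.int64()),
    )
    arr.keys    # flattened keys across all maps: ["a", "b", "c"]
    arr.items   # flattened items across all maps: [1, 2, 3]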
+ def keys(self) -> Array: + """Flattened array of keys across all maps in array""" @property - def items(self) -> Array: ... + def items(self) -> Array: + """Flattened array of items across all maps in array""" class UnionArray(Array[scalar.UnionScalar]): - def child(self, pos: int) -> Field: ... - def field(self, pos: int) -> Array: ... + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: + """ + DEPRECATED, use field() instead. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : pyarrow.Field + The given child field. + """ + def field(self, pos: int) -> Array: + """ + Return the given child field as an individual array. + + For sparse unions, the returned array has its offset, length, + and null count adjusted. + + For dense unions, the returned array is unchanged. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : Array + The given child field. + """ @property - def type_codes(self) -> Int8Array: ... + def type_codes(self) -> Int8Array: + """Get the type codes array.""" @property - def offsets(self) -> Int32Array: ... + def offsets(self) -> Int32Array: + """ + Get the value offsets array (dense arrays only). + + Does not account for any slice offset. + """ @staticmethod def from_dense( type: Int8Array, @@ -1656,14 +3497,45 @@ class UnionArray(Array[scalar.UnionScalar]): children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, - ) -> UnionArray: ... + ) -> UnionArray: + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ @staticmethod def from_sparse( types: Int8Array, children: NullableCollection[Array], field_names: list[str] | None = None, type_codes: Int8Array | None = None, - ) -> UnionArray: ... + ) -> UnionArray: + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ class StringArray(Array[scalar.StringScalar]): @staticmethod @@ -1674,7 +3546,25 @@ class StringArray(Array[scalar.StringScalar]): null_bitmap: Buffer | None = None, null_count: int | None = -1, offset: int | None = 0, - ) -> StringArray: ... + ) -> StringArray: + """ + Construct a StringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ class LargeStringArray(Array[scalar.LargeStringScalar]): @staticmethod @@ -1685,23 +3575,52 @@ class LargeStringArray(Array[scalar.LargeStringScalar]): null_bitmap: Buffer | None = None, null_count: int | None = -1, offset: int | None = 0, - ) -> StringArray: ... + ) -> StringArray: + """ + Construct a LargeStringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. 
+ + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ class StringViewArray(Array[scalar.StringViewScalar]): ... class BinaryArray(Array[scalar.BinaryScalar]): @property - def total_values_length(self) -> int: ... + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this BinaryArray. + """ class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): @property - def total_values_length(self) -> int: ... + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this LargeBinaryArray. + """ class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): def dictionary_encode(self) -> Self: ... # type: ignore[override] - def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: + """ + Decodes the DictionaryArray to an Array. + """ @property def indices(self) -> Array[Scalar[_IndexT]]: ... @property @@ -1714,7 +3633,30 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): dictionary: Array | np.ndarray | pd.Series, null_count: int = -1, offset: int = 0, - ) -> DictionaryArray[Any, _BasicValueT]: ... + ) -> DictionaryArray[Any, _BasicValueT]: + """ + Construct a DictionaryArray from buffers. + + Parameters + ---------- + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing the indices array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the indices array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + + Returns + ------- + dict_array : DictionaryArray + """ @staticmethod def from_arrays( indices: Indices, @@ -1724,11 +3666,61 @@ class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): from_pandas: bool = False, safe: bool = True, memory_pool: MemoryPool | None = None, - ) -> DictionaryArray: ... + ) -> DictionaryArray: + """ + Construct a DictionaryArray from indices and values. + + Parameters + ---------- + indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type + Non-negative integers referencing the dictionary values by zero + based index. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + mask : ndarray or pandas.Series, bool type + True values indicate that indices are actually null. + ordered : bool, default False + Set to True if the category values are ordered. + from_pandas : bool, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1). + safe : bool, default True + If True, check that the dictionary indices are in range. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray + """ class StructArray(Array[scalar.StructScalar]): - def field(self, index: int | str) -> Array: ... 
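A minimal sketch of DictionaryArray.from_arrays as documented above, with dictionary_decode to recover the plain array:

    import pyarrow as pa

    indices = pa.array([0, 1, 0, 2])
    dictionary = pa.array(["foo", "bar", "baz"])
    dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)
    dict_arr.dictionary_decode()   # StringArray: ["foo", "bar", "foo", "baz"]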
- def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + def field(self, index: int | str) -> Array: + """ + Retrieves the child array belonging to field. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Array + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: + """ + Return one individual array for each field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : List[Array] + """ @staticmethod def from_arrays( arrays: Iterable[Array], @@ -1737,8 +3729,51 @@ class StructArray(Array[scalar.StructScalar]): mask=None, memory_pool: MemoryPool | None = None, type: types.StructType | None = None, - ) -> StructArray: ... - def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: ... + ) -> StructArray: + """ + Construct StructArray from collection of arrays representing + each field in the struct. + + Either field names, field instances or a struct type must be passed. + + Parameters + ---------- + arrays : sequence of Array + names : List[str] (optional) + Field names for each struct child. + fields : List[Field] (optional) + Field instances for each struct child. + mask : pyarrow.Array[bool] (optional) + Indicate which values are null (True) or not null (False). + memory_pool : MemoryPool (optional) + For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. + + Returns + ------- + result : StructArray + """ + def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: + """ + Sort the StructArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + by : str or None, default None + If to sort the array by one of its fields + or by the whole array. + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : StructArray + """ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): @overload @@ -1763,6 +3798,24 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal type: DataType | None = None, ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... @staticmethod + def from_arrays(*args, **kwargs): + """ + Construct RunEndEncodedArray from run_ends and values arrays. + + Parameters + ---------- + run_ends : Array (int16, int32, or int64 type) + The run_ends array. + values : Array (any type) + The values array. + type : pyarrow.DataType, optional + The run_end_encoded(run_end_type, value_type) array type. + + Returns + ------- + RunEndEncodedArray + """ + @staticmethod def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] type: DataType, length: int, @@ -1770,13 +3823,69 @@ class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicVal null_count: int = -1, offset=0, children: tuple[Array, Array] | None = None, - ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + ) -> RunEndEncodedArray[Any, _BasicValueT]: + """ + Construct a RunEndEncodedArray from all the parameters that make up an + Array. + + RunEndEncodedArrays do not have buffers, only children arrays, but this + implementation is needed to satisfy the Array interface. 
+ + Parameters + ---------- + type : DataType + The run_end_encoded(run_end_type, value_type) type. + length : int + The logical length of the run-end encoded array. Expected to match + the last value of the run_ends array (children[0]) minus the offset. + buffers : List[Buffer] + Empty List or [None]. + null_count : int, default -1 + The number of null entries in the array. Run-end encoded arrays + are specified to not have valid bits and null_count always equals 0. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array] + Nested type children containing the run_ends and values arrays. + + Returns + ------- + RunEndEncodedArray + """ @property - def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: ... + def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: + """ + An array holding the logical indexes of each run-end. + + The physical offset to the array is applied. + """ @property - def values(self) -> Array[scalar.Scalar[_BasicValueT]]: ... - def find_physical_offset(self) -> int: ... - def find_physical_length(self) -> int: ... + def values(self) -> Array[scalar.Scalar[_BasicValueT]]: + """ + An array holding the values of each run. + + The physical offset to the array is applied. + """ + def find_physical_offset(self) -> int: + """ + Find the physical offset of this REE array. + + This is the offset of the run that contains the value of the first + logical element of this array considering its offset. + + This function uses binary-search, so it has a O(log N) cost. + """ + def find_physical_length(self) -> int: + """ + Find the physical length of this REE array. + + The physical length of an REE is the number of physical values (and + run-ends) necessary to represent the logical range of values from offset + to length. + + This function uses binary-search, so it has a O(log N) cost. + """ _ArrayT = TypeVar("_ArrayT", bound=Array) @@ -1784,30 +3893,321 @@ class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): @property def storage(self) -> Any: ... @staticmethod - def from_storage( - typ: types.BaseExtensionType, storage: _ArrayT - ) -> ExtensionArray[_ArrayT]: ... + def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: + """ + Construct ExtensionArray from type and storage array. + + Parameters + ---------- + typ : DataType + The extension type for the result array. + storage : Array + The underlying storage for the result array. + + Returns + ------- + ext_array : ExtensionArray + """ + +class JsonArray(ExtensionArray[_ArrayT]): + """ + Concrete class for Arrow arrays of JSON data type. + + This does not guarantee that the JSON data actually + is valid JSON. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ -class JsonArray(ExtensionArray[_ArrayT]): ... class UuidArray(ExtensionArray[_ArrayT]): ... class FixedShapeTensorArray(ExtensionArray[_ArrayT]): - def to_numpy_ndarray(self) -> np.ndarray: ... - def to_tensor(self) -> Tensor: ... + """ + Concrete class for fixed shape tensor extension arrays. 
+ + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + + def to_numpy_ndarray(self) -> np.ndarray: + """ + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + @classmethod - def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: ... + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + If input array data is not contiguous a copy will be made. + + Parameters + ---------- + obj : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] + """ + +class OpaqueArray(ExtensionArray[_ArrayT]): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... ) + + Create an extension array -class OpaqueArray(ExtensionArray[_ArrayT]): ... + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ class Bool8Array(ExtensionArray): - def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: ... + """ + Concrete class for bool8 extension arrays. + + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. 
This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ @classmethod - def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ @classmethod - def from_numpy(cls, obj: np.ndarray) -> Self: ... + def from_numpy(cls, obj: np.ndarray) -> Self: + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. + + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + +def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: + """ + Concatenate the given arrays. + + The contents of the input arrays are copied into the returned array. + + Raises + ------ + ArrowInvalid + If not all of the arrays have the same type. + + Parameters + ---------- + arrays : iterable of pyarrow.Array + Arrays to concatenate, must be identically typed. + memory_pool : MemoryPool, default None + For memory allocations. If None, the default pool is used. + + Examples + -------- + >>> import pyarrow as pa + >>> arr1 = pa.array([2, 4, 5, 100]) + >>> arr2 = pa.array([2, 4]) + >>> pa.concat_arrays([arr1, arr2]) + + [ + 2, + 4, + 5, + 100, + 2, + 4 + ] + + """ -def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: ... -def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: ... +def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: + """ + Create empty array of the given type. + """ __all__ = [ "array", diff --git a/pyarrow-stubs/__lib_pxi/builder.pyi b/pyarrow-stubs/__lib_pxi/builder.pyi index 27a0a954dcc..4a0e9ca4708 100644 --- a/pyarrow-stubs/__lib_pxi/builder.pyi +++ b/pyarrow-stubs/__lib_pxi/builder.pyi @@ -5,19 +5,83 @@ from pyarrow.lib import MemoryPool, _Weakrefable from .array import StringArray, StringViewArray class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): ... - def append_values(self, values: Iterable[str | bytes | None]): ... - def finish(self) -> StringArray: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). 
+ + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ @property def null_count(self) -> int: ... def __len__(self) -> int: ... class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): ... - def append_values(self, values: Iterable[str | bytes | None]): ... - def finish(self) -> StringViewArray: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringViewArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ @property def null_count(self) -> int: ... def __len__(self) -> int: ... diff --git a/pyarrow-stubs/__lib_pxi/device.pyi b/pyarrow-stubs/__lib_pxi/device.pyi index ed999541a49..d1b9f39eedd 100644 --- a/pyarrow-stubs/__lib_pxi/device.pyi +++ b/pyarrow-stubs/__lib_pxi/device.pyi @@ -19,21 +19,70 @@ class DeviceAllocationType(enum.Flag): HEXAGON = enum.auto() class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. CPU-accessible memory, or embedded memory on some particular GPU). + """ + @property - def type_name(self) -> str: ... + def type_name(self) -> str: + """ + A shorthand for this device's type. + """ @property - def device_id(self) -> int: ... + def device_id(self) -> int: + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ @property - def device_type(self) -> DeviceAllocationType: ... + def device_type(self) -> DeviceAllocationType: + """ + Return the DeviceAllocationType of this device. + """ class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. + + A MemoryManager is always tied to a particular Device instance. 
+ It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ @property - def device(self) -> Device: ... + def device(self) -> Device: + """ + The device this MemoryManager is tied to. + """ @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + +def default_cpu_memory_manager() -> MemoryManager: + """ + Return the default CPU MemoryManager instance. -def default_cpu_memory_manager() -> MemoryManager: ... + The returned singleton instance uses the default MemoryPool. + """ __all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi index d14eaa3937b..d882fd79d57 100644 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ b/pyarrow-stubs/__lib_pxi/io.pyi @@ -22,19 +22,77 @@ from pyarrow.lib import MemoryPool, _Weakrefable from .device import Device, DeviceAllocationType, MemoryManager from .types import KeyValueMetadata -def have_libhdfs() -> bool: ... -def io_thread_count() -> int: ... -def set_io_thread_count(count: int) -> None: ... +def have_libhdfs() -> bool: + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ + +def io_thread_count() -> int: + """ + Return the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. The number of threads is set to a fixed value at + startup. It can be modified at runtime by calling + :func:`set_io_thread_count()`. + + See Also + -------- + set_io_thread_count : Modify the size of this pool. + cpu_count : The analogous function for the CPU thread pool. + """ + +def set_io_thread_count(count: int) -> None: + """ + Set the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. + + Parameters + ---------- + count : int + The max number of threads that may be used for I/O. + Must be positive. + + See Also + -------- + io_thread_count : Get the size of this pool. + set_cpu_count : The analogous function for the CPU thread pool. + """ Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] class NativeFile(_Weakrefable): + """ + The base class for all Arrow streams. + + Streams are either readable, writable, or both. + They optionally support seeking. + + While this class exposes methods to read or write data from Python, the + primary intent of using a Arrow stream is to pass it to other Arrow + facilities that will make use of it, such as Arrow IPC routines. + + Be aware that there are subtle differences with regular Python files, + e.g. destroying a writable Arrow stream without closing it explicitly + will not flush any pending data. + """ + _default_chunk_size: int def __enter__(self) -> Self: ... def __exit__(self, *args) -> None: ... @property - def mode(self) -> Mode: ... + def mode(self) -> Mode: + """ + The file mode. Currently instances of NativeFile may support: + + * rb: binary read + * wb: binary write + * rb+: binary read and write + * ab: binary append + """ def readable(self) -> bool: ... def seekable(self) -> bool: ... def isatty(self) -> bool: ... @@ -42,50 +100,376 @@ class NativeFile(_Weakrefable): @property def closed(self) -> bool: ... def close(self) -> None: ... - def size(self) -> int: ... - def metadata(self) -> KeyValueMetadata: ... 
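A short sketch of the I/O thread-pool helpers documented above (io_thread_count and set_io_thread_count), both available at the pyarrow top level:

    import pyarrow as pa

    pa.io_thread_count()        # size of the global I/O pool, fixed at startup
    pa.set_io_thread_count(8)   # must be positive; affects e.g. dataset scans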
- def tell(self) -> int: ... - def seek(self, position: int, whence: int = 0) -> int: ... - def flush(self) -> None: ... - def write(self, data: bytes | SupportPyBuffer) -> int: ... - def read(self, nbytes: int | None = None) -> bytes: ... - def get_stream(self, file_offset: int, nbytes: int) -> Self: ... - def read_at(self) -> bytes: ... - def read1(self) -> bytes: ... + def size(self) -> int: + """ + Return file size + """ + def metadata(self) -> KeyValueMetadata: + """ + Return file metadata + """ + def tell(self) -> int: + """ + Return current stream position + """ + def seek(self, position: int, whence: int = 0) -> int: + """ + Change current file stream position + + Parameters + ---------- + position : int + Byte offset, interpreted relative to value of whence argument + whence : int, default 0 + Point of reference for seek offset + + Notes + ----- + Values of whence: + * 0 -- start of stream (the default); offset should be zero or positive + * 1 -- current stream position; offset may be negative + * 2 -- end of stream; offset is usually negative + + Returns + ------- + int + The new absolute stream position. + """ + def flush(self) -> None: + """ + Flush the stream, if applicable. + + An error is raised if stream is not writable. + """ + def write(self, data: bytes | SupportPyBuffer) -> int: + """ + Write data to the file. + + Parameters + ---------- + data : bytes-like object or exporter of buffer protocol + + Returns + ------- + int + nbytes: number of bytes written + """ + def read(self, nbytes: int | None = None) -> bytes: + """ + Read and return up to n bytes. + + If *nbytes* is None, then the entire remaining file contents are read. + + Parameters + ---------- + nbytes : int, default None + + Returns + ------- + data : bytes + """ + def get_stream(self, file_offset: int, nbytes: int) -> Self: + """ + Return an input stream that reads a file segment independent of the + state of the file. + + Allows reading portions of a random access file as an input stream + without interfering with each other. + + Parameters + ---------- + file_offset : int + nbytes : int + + Returns + ------- + stream : NativeFile + """ + def read_at(self) -> bytes: + """ + Read indicated number of bytes at offset from the file + + Parameters + ---------- + nbytes : int + offset : int + + Returns + ------- + data : bytes + """ + def read1(self) -> bytes: + """Read and return up to n bytes. + + Unlike read(), if *nbytes* is None then a chunk is read, not the + entire file. + + Parameters + ---------- + nbytes : int, default None + The maximum number of bytes to read. + + Returns + ------- + data : bytes + """ def readall(self) -> bytes: ... - def readinto(self, b: SupportPyBuffer) -> int: ... - def readline(self, size: int | None = None) -> bytes: ... - def readlines(self, hint: int | None = None) -> list[bytes]: ... + def readinto(self, b: SupportPyBuffer) -> int: + """ + Read into the supplied buffer + + Parameters + ---------- + b : buffer-like object + A writable buffer object (such as a bytearray). + + Returns + ------- + written : int + number of bytes written + """ + + def readline(self, size: int | None = None) -> bytes: + """Read and return a line of bytes from the file. + + If size is specified, read at most size bytes. + + Line terminator is always b"\\n". 
+ + Parameters + ---------- + size : int + maximum number of bytes read + """ + def readlines(self, hint: int | None = None) -> list[bytes]: + """Read lines of the file + + Parameters + ---------- + hint : int + maximum number of bytes read until we stop + """ def __iter__(self) -> Self: ... def __next__(self) -> bytes: ... - def read_buffer(self, nbytes: int | None = None) -> Buffer: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: + """ + Read from buffer. + + Parameters + ---------- + nbytes : int, optional + maximum number of bytes read + """ def truncate(self) -> None: ... - def writelines(self, lines: list[bytes]): ... - def download( - self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None - ) -> None: ... - def upload(self, stream: IOBase, buffer_size: int | None) -> None: ... + def writelines(self, lines: list[bytes]): + """ + Write lines to the file. + + Parameters + ---------- + lines : iterable + Iterable of bytes-like objects or exporters of buffer protocol + """ + def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: + """ + Read this file completely to a local path or destination stream. + + This method first seeks to the beginning of the file. + + Parameters + ---------- + stream_or_path : str or file-like object + If a string, a local file path to write to; otherwise, + should be a writable stream. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + def upload(self, stream: IOBase, buffer_size: int | None) -> None: + """ + Write from a source stream to this file. + + Parameters + ---------- + stream : file-like object + Source stream to pipe to this file. + buffer_size : int, optional + The buffer size to use for data transfers. + """ # ---------------------------------------------------------------------- # Python file-like objects class PythonFile(NativeFile): + """ + A stream backed by a Python file object. + + This class allows using Python file objects with arbitrary Arrow + functions, including functions written in another language than Python. + + As a downside, there is a non-zero redirection cost in translating + Arrow stream calls to Python method calls. Furthermore, Python's + Global Interpreter Lock may limit parallelism in some situations. + + Examples + -------- + >>> import io + >>> import pyarrow as pa + >>> pa.PythonFile(io.BytesIO()) + + + Create a stream for writing: + + >>> buf = io.BytesIO() + >>> f = pa.PythonFile(buf, mode="w") + >>> f.writable() + True + >>> f.write(b"PythonFile") + 10 + >>> buf.getvalue() + b'PythonFile' + >>> f.close() + >>> f + + + Create a stream for reading: + + >>> buf = io.BytesIO(b"PythonFile") + >>> f = pa.PythonFile(buf, mode="r") + >>> f.mode + 'rb' + >>> f.read() + b'PythonFile' + >>> f + + >>> f.close() + >>> f + + """ def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... - def truncate(self, pos: int | None = None) -> None: ... + def truncate(self, pos: int | None = None) -> None: + """ + Parameters + ---------- + pos : int, optional + """ class MemoryMappedFile(NativeFile): + """ + A stream that represents a memory-mapped file. + + Supports 'r', 'r+', 'w' modes. + + Examples + -------- + Create a new file with memory map: + + >>> import pyarrow as pa + >>> mmap = pa.create_memory_map("example_mmap.dat", 10) + >>> mmap + + >>> mmap.close() + + Open an existing file with memory map: + + >>> with pa.memory_map("example_mmap.dat") as mmap: + ... 
mmap + + """ @classmethod - def create(cls, path: str, size: int) -> Self: ... + def create(cls, path: str, size: int) -> Self: + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... - def resize(self, new_size: int) -> None: ... + def resize(self, new_size: int) -> None: + """ + Resize the map and underlying file. + + Parameters + ---------- + new_size : new size in bytes + """ def memory_map( path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" -) -> MemoryMappedFile: ... +) -> MemoryMappedFile: + """ + Open memory map at file path. Size of the memory map cannot change. + + Parameters + ---------- + path : str + mode : {'r', 'r+', 'w'}, default 'r' + Whether the file is opened for reading ('r'), writing ('w') + or both ('r+'). + + Returns + ------- + mmap : MemoryMappedFile + + Examples + -------- + Reading from a memory map without any memory allocation or copying: + + >>> import pyarrow as pa + >>> with pa.output_stream("example_mmap.txt") as stream: + ... stream.write(b"Constructing a buffer referencing the mapped memory") + 51 + >>> with pa.memory_map("example_mmap.txt") as mmap: + ... mmap.read_at(6, 45) + b'memory' + """ create_memory_map = MemoryMappedFile.create class OSFile(NativeFile): + """ + A stream backed by a regular file descriptor. + + Examples + -------- + Create a new file to write to: + + >>> import pyarrow as pa + >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: + ... f.writable() + ... f.write(b"OSFile") + ... f.seekable() + True + 6 + False + + Open the file to read: + + >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: + ... f.mode + ... f.read() + 'rb' + b'OSFile' + + Open the file to append: + + >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: + ... f.mode + ... f.write(b" is super!") + 'ab' + 10 + >>> with pa.OSFile("example_osfile.arrow") as f: + ... f.read() + b'OSFile is super!' + + Inspect created OSFile: + + >>> pa.OSFile("example_osfile.arrow") + + """ def __init__( self, path: str, @@ -94,6 +478,28 @@ class OSFile(NativeFile): ) -> None: ... class FixedSizeBufferWriter(NativeFile): + """ + A stream writing to a Arrow buffer. + + Examples + -------- + Create a stream to write to ``pyarrow.Buffer``: + + >>> import pyarrow as pa + >>> buf = pa.allocate_buffer(5) + >>> with pa.output_stream(buf) as stream: + ... stream.write(b"abcde") + ... stream + 5 + + + Inspect the buffer: + + >>> buf.to_pybytes() + b'abcde' + >>> buf + + """ def __init__(self, buffer: Buffer) -> None: ... def set_memcopy_threads(self, num_threads: int) -> None: ... def set_memcopy_blocksize(self, blocksize: int) -> None: ... @@ -103,37 +509,135 @@ class FixedSizeBufferWriter(NativeFile): # Arrow buffers class Buffer(_Weakrefable): + """ + The base class for all Arrow buffers. + + A buffer represents a contiguous memory area. Many buffers will own + their memory, though not all of them do. + """ def __len__(self) -> int: ... def _assert_cpu(self) -> None: ... @property - def size(self) -> int: ... + def size(self) -> int: + """ + The buffer size in bytes. + """ @property - def address(self) -> int: ... - def hex(self) -> bytes: ... + def address(self) -> int: + """ + The buffer's address, as an integer. + + The returned address may point to CPU or device memory. + Use `is_cpu()` to disambiguate. 
+ """ + def hex(self) -> bytes: + """ + Compute hexadecimal representation of the buffer. + + Returns + ------- + : bytes + """ @property - def is_mutable(self) -> bool: ... + def is_mutable(self) -> bool: + """ + Whether the buffer is mutable. + """ @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether the buffer is CPU-accessible. + """ @property - def device(self) -> Device: ... + def device(self) -> Device: + """ + The device where the buffer resides. + + Returns + ------- + Device + """ @property - def memory_manager(self) -> MemoryManager: ... + def memory_manager(self) -> MemoryManager: + """ + The memory manager associated with the buffer. + + Returns + ------- + MemoryManager + """ @property - def device_type(self) -> DeviceAllocationType: ... + def device_type(self) -> DeviceAllocationType: + """ + The device type where the buffer resides. + + Returns + ------- + DeviceAllocationType + """ @property def parent(self) -> Buffer | None: ... @overload def __getitem__(self, key: slice) -> Self: ... @overload def __getitem__(self, key: int) -> int: ... - def slice(self, offset: int = 0, length: int | None = None) -> Self: ... - def equals(self, other: Self) -> bool: ... + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Slice this buffer. Memory is not copied. + + You can also use the Python slice notation ``buffer[start:stop]``. + + Parameters + ---------- + offset : int, default 0 + Offset from start of buffer to slice. + length : int, default None + Length of slice (default is until end of Buffer starting from + offset). + + Returns + ------- + sliced : Buffer + A logical view over this buffer. + """ + def equals(self, other: Self) -> bool: + """ + Determine if two buffers contain exactly the same data. + + Parameters + ---------- + other : Buffer + + Returns + ------- + are_equal : bool + True if buffer contents and size are equal + """ def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... - def to_pybytes(self) -> bytes: ... + def to_pybytes(self) -> bytes: + """ + Return this buffer as a Python bytes object. Memory is copied. + """ def __buffer__(self, flags: int, /) -> memoryview: ... class ResizableBuffer(Buffer): - def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... + """ + A base class for buffers that can be resized. + """ + + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: + """ + Resize buffer to indicated size. + + Parameters + ---------- + new_size : int + New size of buffer (padding may be added internally). + shrink_to_fit : bool, default False + If this is true, the buffer is shrunk when new_size is less + than the current size. + If this is false, the buffer is never shrunk. + """ @overload def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... @@ -145,19 +649,127 @@ def allocate_buffer( def allocate_buffer( size: int, memory_pool: MemoryPool | None, resizable: Literal[True] ) -> ResizableBuffer: ... +def allocate_buffer(*args, **kwargs): + """ + Allocate a mutable buffer. + + Parameters + ---------- + size : int + Number of bytes to allocate (plus internal padding) + memory_pool : MemoryPool, optional + The pool to allocate memory from. + If not given, the default memory pool is used. + resizable : bool, default False + If true, the returned buffer is resizable. 
+ + Returns + ------- + buffer : Buffer or ResizableBuffer + """ # ---------------------------------------------------------------------- # Arrow Stream class BufferOutputStream(NativeFile): + """ + An output stream that writes to a resizable buffer. + + The buffer is produced as a result when ``getvalue()`` is called. + + Examples + -------- + Create an output stream, write data to it and finalize it with + ``getvalue()``: + + >>> import pyarrow as pa + >>> f = pa.BufferOutputStream() + >>> f.write(b"pyarrow.Buffer") + 14 + >>> f.closed + False + >>> f.getvalue() + + >>> f.closed + True + """ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def getvalue(self) -> Buffer: ... + def getvalue(self) -> Buffer: + """ + Finalize output stream and return result as pyarrow.Buffer. + + Returns + ------- + value : Buffer + """ class MockOutputStream(NativeFile): ... class BufferReader(NativeFile): + """ + Zero-copy reader from objects convertible to Arrow buffer. + + Parameters + ---------- + obj : Python bytes or pyarrow.Buffer + + Examples + -------- + Create an Arrow input stream and inspect it: + + >>> import pyarrow as pa + >>> data = b"reader data" + >>> buf = memoryview(data) + >>> with pa.input_stream(buf) as stream: + ... stream.size() + ... stream.read(6) + ... stream.seek(7) + ... stream.read(15) + 11 + b'reader' + 7 + b'data' + """ def __init__(self, obj) -> None: ... class CompressedInputStream(NativeFile): + """ + An input stream wrapper which decompresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + + Create an input stream with decompression referencing the + buffer with compressed data: + + >>> cdata = raw.getvalue() + >>> with pa.input_stream(cdata, compression="gzip") as compressed: + ... compressed.read() + b'Compressed stream' + + which actually translates to the use of ``BufferReader``and + ``CompressedInputStream``: + + >>> raw = pa.BufferReader(cdata) + >>> with pa.CompressedInputStream(raw, "gzip") as compressed: + ... compressed.read() + b'Compressed stream' + """ + def __init__( self, stream: StrPath | NativeFile | IOBase, @@ -165,6 +777,27 @@ class CompressedInputStream(NativeFile): ) -> None: ... class CompressedOutputStream(NativeFile): + """ + An output stream wrapper which compresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + """ def __init__( self, stream: StrPath | NativeFile | IOBase, @@ -172,18 +805,74 @@ class CompressedOutputStream(NativeFile): ) -> None: ... 
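A combined sketch of the buffer APIs covered in this module: allocating a resizable buffer, wrapping existing Python bytes with py_buffer, and taking a zero-copy slice:

    import pyarrow as pa

    rbuf = pa.allocate_buffer(64, resizable=True)   # ResizableBuffer
    rbuf.resize(128)                                # grows; padding may be added internally

    buf = pa.py_buffer(b"hello world")              # zero-copy wrap of a bytes object
    view = buf.slice(6, 5)                          # logical view, no copy
    view.to_pybytes()                               # b'world'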
class BufferedInputStream(NativeFile): + """ + An input stream that performs buffered reads from + an unbuffered input stream, which can mitigate the overhead + of many small reads in some cases. + + Parameters + ---------- + stream : NativeFile + The input stream to wrap with the buffer + buffer_size : int + Size of the temporary read buffer. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None ) -> None: ... - def detach(self) -> NativeFile: ... + def detach(self) -> NativeFile: + """ + Release the raw InputStream. + Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw input stream + """ class BufferedOutputStream(NativeFile): + """ + An output stream that performs buffered reads from + an unbuffered output stream, which can mitigate the overhead + of many small writes in some cases. + + Parameters + ---------- + stream : NativeFile + The writable output stream to wrap with the buffer + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__( self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None ) -> None: ... - def detach(self) -> NativeFile: ... + def detach(self) -> NativeFile: + """ + Flush any buffered writes and release the raw OutputStream. + Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw output stream. + """ class TransformInputStream(NativeFile): + """ + Transform an input stream. + + Parameters + ---------- + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... class Transcoder: @@ -192,14 +881,83 @@ class Transcoder: def transcoding_input_stream( stream: NativeFile, src_encoding: str, dest_encoding: str -) -> TransformInputStream: ... -def py_buffer(obj: SupportPyBuffer) -> Buffer: ... -def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... +) -> TransformInputStream: + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + then re-encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data. + dest_encoding : str + The codec to use for emitted data. + """ + +def py_buffer(obj: SupportPyBuffer) -> Buffer: + """ + Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + the object from which the buffer should be constructed. + """ + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: + """ + Construct an Arrow buffer with the given *address* and *size*. + + The buffer will be optionally backed by the Python *base* object, if given. + The *base* object will be kept alive as long as this buffer is alive, + including across language boundaries (for example if the buffer is + referenced by C++ code). + + Parameters + ---------- + address : int + The starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. 
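# --- Illustrative sketch (not part of the patch): zero-copy wrapping and
# on-the-fly transcoding with the helpers annotated above. py_buffer,
# BufferReader and transcoding_input_stream are the public pyarrow functions
# these stubs describe; the sample text is arbitrary.
import pyarrow as pa

# py_buffer wraps a bytes-like object without copying.
buf = pa.py_buffer("café".encode("utf-8"))
print(buf.size, buf.to_pybytes())

# transcoding_input_stream re-encodes data as it is read:
# UTF-8 input is emitted as Latin-1 here.
stream = pa.transcoding_input_stream(pa.BufferReader(buf), "utf-8", "latin-1")
print(stream.read())  # expected b'caf\xe9'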
+ size : int + The size of device buffer in bytes. + base : {None, object} + Object that owns the referenced memory. + """ + def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... # --------------------------------------------------------------------- class CacheOptions(_Weakrefable): + """ + Cache options for a pre-buffered fragment scan. + + Parameters + ---------- + hole_size_limit : int, default 8KiB + The maximum distance in bytes between two consecutive ranges; beyond + this value, ranges are not combined. + range_size_limit : int, default 32MiB + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, + they are not combined + lazy : bool, default True + lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + range that is currently being read. + prefetch_limit : int, default 0 + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target + range. + """ + hole_size_limit: int range_size_limit: int lazy: bool @@ -219,26 +977,182 @@ class CacheOptions(_Weakrefable): transfer_bandwidth_mib_per_sec: int, ideal_bandwidth_utilization_frac: float = 0.9, max_ideal_request_size_mib: int = 64, - ) -> Self: ... + ) -> Self: + """ + Create suitable CacheOptions based on provided network metrics. + + Typically this will be used with object storage solutions like Amazon S3, + Google Cloud Storage and Azure Blob Storage. + + Parameters + ---------- + time_to_first_byte_millis : int + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. + transfer_bandwidth_mib_per_sec : int + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + integer. + ideal_bandwidth_utilization_frac : int, default 0.9 + Transfer bandwidth utilization fraction (per connection) to maximize the net + data load. The value is a positive float less than 1. + max_ideal_request_size_mib : int, default 64 + The maximum single data request size (in MiB) to maximize the net data load. + + Returns + ------- + CacheOptions + """ class Codec(_Weakrefable): + """ + Compression codec. + + Parameters + ---------- + compression : str + Type of compression codec to initialize, valid values are: 'gzip', + 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and + 'snappy'. + compression_level : int, None + Optional parameter specifying how aggressively to compress. The + possible ranges and effect of this parameter depend on the specific + codec chosen. Higher values compress more but typically use more + resources (CPU/RAM). Some codecs support negative values. + + gzip + The compression_level maps to the memlevel parameter of + deflateInit2. Higher levels use more RAM but are faster + and should have higher compression ratios. + + bz2 + The compression level maps to the blockSize100k parameter of + the BZ2_bzCompressInit function. Higher levels use more RAM + but are faster and should have higher compression ratios. + + brotli + The compression level maps to the BROTLI_PARAM_QUALITY + parameter. Higher values are slower and should have higher + compression ratios. 
+ + lz4/lz4_frame/lz4_raw + The compression level parameter is not supported and must + be None + + zstd + The compression level maps to the compressionLevel parameter + of ZSTD_initCStream. Negative values are supported. Higher + values are slower and should have higher compression ratios. + + snappy + The compression level parameter is not supported and must + be None + + + Raises + ------ + ValueError + If invalid compression value is passed. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.Codec.is_available("gzip") + True + >>> codec = pa.Codec("gzip") + >>> codec.name + 'gzip' + >>> codec.compression_level + 9 + """ def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... @classmethod - def detect(cls, path: StrPath) -> Self: ... + def detect(cls, path: StrPath) -> Self: + """ + Detect and instantiate compression codec based on file extension. + + Parameters + ---------- + path : str, path-like + File-path to detect compression from. + + Raises + ------ + TypeError + If the passed value is not path-like. + ValueError + If the compression can't be detected from the path. + + Returns + ------- + Codec + """ @staticmethod - def is_available(compression: Compression) -> bool: ... + def is_available(compression: Compression) -> bool: + """ + Returns whether the compression support has been built and enabled. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + + Returns + ------- + bool + """ @staticmethod - def supports_compression_level(compression: Compression) -> int: ... + def supports_compression_level(compression: Compression) -> int: + """ + Returns true if the compression level parameter is supported + for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ @staticmethod - def default_compression_level(compression: Compression) -> int: ... + def default_compression_level(compression: Compression) -> int: + """ + Returns the compression level that Arrow will use for the codec if + None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ @staticmethod - def minimum_compression_level(compression: Compression) -> int: ... + def minimum_compression_level(compression: Compression) -> int: + """ + Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ @staticmethod - def maximum_compression_level(compression: Compression) -> int: ... + def maximum_compression_level(compression: Compression) -> int: + """ + Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ @property - def name(self) -> Compression: ... + def name(self) -> Compression: + """Returns the name of the codec""" @property - def compression_level(self) -> int: ... + def compression_level(self) -> int: + """Returns the compression level parameter of the codec""" @overload def compress( self, @@ -262,6 +1176,22 @@ class Codec(_Weakrefable): asbytes: Literal[True], memory_pool: MemoryPool | None = None, ) -> bytes: ... + def compress(self, *args, **kwargs): + """ + Compress data from buffer-like object. 
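# --- Illustrative sketch (not part of the patch): probing codec support with
# the Codec class and static methods annotated above. All calls are public
# pyarrow API; availability of individual codecs depends on the build.
import pyarrow as pa

for name in ("gzip", "zstd", "snappy"):
    if not pa.Codec.is_available(name):
        continue
    if pa.Codec.supports_compression_level(name):
        print(name, pa.Codec.default_compression_level(name))
    else:
        print(name, "no compression level")  # e.g. snappy

# Detect a codec from a file extension.
print(pa.Codec.detect("data.csv.gz").name)  # 'gzip'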
+ + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ @overload def decompress( self, @@ -288,6 +1218,24 @@ class Codec(_Weakrefable): asbytes: Literal[True], memory_pool: MemoryPool | None = None, ) -> bytes: ... + def decompress(self, *args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + decompressed_size : int, default None + Size of the decompressed result + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ @overload def compress( @@ -312,6 +1260,26 @@ def compress( asbytes: Literal[True], memory_pool: MemoryPool | None = None, ) -> bytes: ... +def compress(*args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + @overload def decompress( buf: Buffer | bytes | SupportPyBuffer, @@ -338,16 +1306,137 @@ def decompress( asbytes: Literal[True], memory_pool: MemoryPool | None = None, ) -> bytes: ... +def decompress(*args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + Input object to decompress data from. + decompressed_size : int, default None + Size of the decompressed result + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + def input_stream( source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, -) -> BufferReader: ... +) -> BufferReader: + """ + Create an Arrow input stream. + + Parameters + ---------- + source : str, Path, buffer, or file-like object + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
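# --- Illustrative sketch (not part of the patch): one-shot compression with
# the module-level compress()/decompress() helpers documented above. Note that
# decompress() needs the decompressed size for codecs that do not store it.
import pyarrow as pa

data = b"some data to shrink" * 100

compressed = pa.compress(data, codec="zstd", asbytes=True)
restored = pa.decompress(
    compressed, decompressed_size=len(data), codec="zstd", asbytes=True
)
assert restored == data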
+ + Examples + -------- + Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: + + >>> import pyarrow as pa + >>> buf = memoryview(b"some data") + >>> with pa.input_stream(buf) as stream: + ... stream.read(4) + b'some' + + Create a readable OSFile (NativeFile) from a string or file path: + + >>> import gzip + >>> with gzip.open("example.gz", "wb") as f: + ... f.write(b"some data") + 9 + >>> with pa.input_stream("example.gz") as stream: + ... stream.read() + b'some data' + + Create a readable PythonFile (NativeFile) from a a Python file object: + + >>> with open("example.txt", mode="w") as f: + ... f.write("some text") + 9 + >>> with pa.input_stream("example.txt") as stream: + ... stream.read(6) + b'some t' + """ + def output_stream( source: StrPath | Buffer | IOBase, compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", buffer_size: int | None = None, -) -> NativeFile: ... +) -> NativeFile: + """ + Create an Arrow output stream. + + Parameters + ---------- + source : str, Path, buffer, file-like object + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Examples + -------- + Create a writable NativeFile from a pyarrow Buffer: + + >>> import pyarrow as pa + >>> data = b"buffer data" + >>> empty_obj = bytearray(11) + >>> buf = pa.py_buffer(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read(6) + b'buffer' + + or from a memoryview object: + + >>> buf = memoryview(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read() + b'buffer data' + + Create a writable NativeFile from a string or file path: + + >>> with pa.output_stream("example_second.txt") as stream: + ... stream.write(b"Write some data") + 15 + >>> with pa.input_stream("example_second.txt") as stream: + ... stream.read() + b'Write some data' + """ __all__ = [ "have_libhdfs", diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi index 59942db594f..3d72892061e 100644 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ b/pyarrow-stubs/__lib_pxi/ipc.pyi @@ -25,6 +25,22 @@ class MetadataVersion(enum.IntEnum): V5 = enum.auto() class WriteStats(NamedTuple): + """IPC write statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + num_messages: int num_record_batches: int num_dictionary_batches: int @@ -32,6 +48,22 @@ class WriteStats(NamedTuple): num_replaced_dictionaries: int class ReadStats(NamedTuple): + """IPC read statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. 
+ num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + num_messages: int num_record_batches: int num_dictionary_batches: int @@ -39,6 +71,22 @@ class ReadStats(NamedTuple): num_replaced_dictionaries: int class IpcReadOptions(_Weakrefable): + """ + Serialization options for reading IPC format. + + Parameters + ---------- + ensure_native_endian : bool, default True + Whether to convert incoming data to platform-native endianness. + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like decompression + included_fields : list + If empty (the default), return all deserialized fields. + If non-empty, the values are the indices of fields to read on + the top-level schema + """ + ensure_native_endian: bool use_threads: bool included_fields: list[int] @@ -51,6 +99,40 @@ class IpcReadOptions(_Weakrefable): ) -> None: ... class IpcWriteOptions(_Weakrefable): + """ + Serialization options for the IPC format. + + Parameters + ---------- + metadata_version : MetadataVersion, default MetadataVersion.V5 + The metadata version to write. V5 is the current and latest, + V4 is the pre-1.0 metadata version (with incompatible Union layout). + allow_64bit : bool, default False + If true, allow field lengths that don't fit in a signed 32-bit int. + use_legacy_format : bool, default False + Whether to use the pre-Arrow 0.15 IPC format. + compression : str, Codec, or None + compression codec to use for record batch buffers. + If None then batch buffers will be uncompressed. + Must be "lz4", "zstd" or None. + To specify a compression_level use `pyarrow.Codec` + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like compression. + emit_dictionary_deltas : bool + Whether to emit dictionary deltas. Default is false for maximum + stream compatibility. + unify_dictionaries : bool + If true then calls to write_table will attempt to unify dictionaries + across all batches in the table. This can help avoid the need for + replacement dictionaries (which the file format does not support) + but requires computing the unified dictionary and then remapping + the indices arrays. + + This parameter is ignored when writing to the IPC stream format as + the IPC stream format can support replacement dictionaries. + """ + metadata_version: MetadataVersion allow_64bit: bool use_legacy_format: bool @@ -71,6 +153,10 @@ class IpcWriteOptions(_Weakrefable): ) -> None: ... class Message(_Weakrefable): + """ + Container for an Arrow IPC message with metadata and optional body + """ + @property def type(self) -> str: ... @property @@ -82,84 +168,409 @@ class Message(_Weakrefable): def equals(self, other: Message) -> bool: ... def serialize_to( self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None - ): ... - def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: ... 
+ ): + """ + Write message to generic OutputStream + + Parameters + ---------- + sink : NativeFile + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + """ + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write message as encapsulated IPC message + + Parameters + ---------- + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + """ class MessageReader(_Weakrefable): + """ + Interface for reading Message objects from some source (like an + InputStream) + """ @classmethod - def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: ... + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: + """ + Open stream from source, if you want to use memory map use + MemoryMappedFile as source. + + Parameters + ---------- + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object + A readable source, like an InputStream + """ def __iter__(self) -> Self: ... - def read_next_message(self) -> Message: ... + def read_next_message(self) -> Message: + """ + Read next Message from the stream. + + Raises + ------ + StopIteration + At end of stream + """ __next__ = read_next_message # ---------------------------------------------------------------------- # File and stream readers and writers class _CRecordBatchWriter(_Weakrefable): - def write(self, table_or_batch: Table | RecordBatch): ... + """The base RecordBatchWriter wrapper. + + Provides common implementations of convenience methods. Should not + be instantiated directly by user code. + """ + def write(self, table_or_batch: Table | RecordBatch): + """ + Write RecordBatch or Table to stream. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + """ def write_batch( self, batch: RecordBatch, custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, - ): ... - def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... - def close(self) -> None: ... + ): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + custom_metadata : mapping or KeyValueMetadata + Keys and values must be string-like / coercible to bytes + """ + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @property - def stats(self) -> WriteStats: ... + def stats(self) -> WriteStats: + """ + Current IPC write statistics. + """ class _RecordBatchStreamWriter(_CRecordBatchWriter): def __dealloc__(self) -> None: ... def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... class _ReadPandasMixin: - def read_pandas(self, **options) -> pd.DataFrame: ... + def read_pandas(self, **options) -> pd.DataFrame: + """ + Read contents of stream to a pandas.DataFrame. 
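# --- Illustrative sketch (not part of the patch): the write_table()/stats
# workflow of the record batch writer annotated above. pa.table and the
# pa.ipc.new_stream / pa.ipc.open_stream entry points are standard pyarrow
# APIs that are not defined in this hunk.
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
sink = pa.BufferOutputStream()

with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)
print(writer.stats)  # WriteStats(num_messages=..., num_record_batches=..., ...)

# Read the stream back into a Table.
reader = pa.ipc.open_stream(sink.getvalue())
assert reader.read_all() == table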
+ + Read all record batches as a pyarrow.Table then convert it to a + pandas.DataFrame using Table.to_pandas. + + Parameters + ---------- + **options + Arguments to forward to :meth:`Table.to_pandas`. + + Returns + ------- + df : pandas.DataFrame + """ class RecordBatchReader(_Weakrefable): + """Base class for reading stream of record batches. + + Record batch readers function as iterators of record batches that also + provide the schema (without the need to get any batches). + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatchReader.from_*`` functions instead. + + Notes + ----- + To import and export using the Arrow C stream interface, use the + ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this + interface is intended for expert users. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([("x", pa.int64())]) + >>> def iter_record_batches(): + ... for i in range(2): + ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) + >>> print(reader.schema) + x: int64 + >>> for batch in reader: + ... print(batch) + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + """ + def __iter__(self) -> Self: ... - def read_next_batch(self) -> RecordBatch: ... + def read_next_batch(self) -> RecordBatch: + """ + Read next RecordBatch from the stream. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + RecordBatch + """ __next__ = read_next_batch @property - def schema(self) -> Schema: ... - def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + def schema(self) -> Schema: + """ + Shared schema of the record batches in the stream. + + Returns + ------- + Schema + """ + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: + """ + Read next RecordBatch from the stream along with its custom metadata. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ def iter_batches_with_custom_metadata( self, - ) -> Iterator[RecordBatchWithMetadata]: ... - def read_all(self) -> Table: ... + ) -> Iterator[RecordBatchWithMetadata]: + """ + Iterate over record batches from the stream along with their custom + metadata. + + Yields + ------ + RecordBatchWithMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table. + + Returns + ------- + Table + """ read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def close(self) -> None: ... + def close(self) -> None: + """ + Release any resources associated with the reader. + """ def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... - def cast(self, target_schema: Schema) -> Self: ... - def _export_to_c(self, out_ptr: int) -> None: ... + def cast(self, target_schema: Schema) -> Self: + """ + Wrap this reader with one that casts each batch lazily as it is pulled. + Currently only a safe cast to target_schema is implemented. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + + Returns + ------- + RecordBatchReader + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowArrayStream struct, given its pointer. 
+ + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + Be careful: if you don't pass the ArrowArrayStream struct to a + consumer, array memory will leak. This is a low-level function + intended for expert users. + """ @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: ... - def __arrow_c_stream__(self, requested_schema=None): ... + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream struct, + given its pointer. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ @classmethod - def _import_from_c_capsule(cls, stream) -> Self: ... + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + RecordBatchReader + """ @classmethod - def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: ... + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: + """ + Create RecordBatchReader from a Arrow-compatible stream object. + + This accepts objects implementing the Arrow PyCapsule Protocol for + streams, i.e. objects that have a ``__arrow_c_stream__`` method. + + Parameters + ---------- + data : Arrow-compatible stream object + Any object that implements the Arrow PyCapsule Protocol for + streams. + schema : Schema, default None + The schema to which the stream should be casted, if supported + by the stream object. + + Returns + ------- + RecordBatchReader + """ @classmethod - def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: ... + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: + """ + Create RecordBatchReader from an iterable of batches. + + Parameters + ---------- + schema : Schema + The shared schema of the record batches + batches : Iterable[RecordBatch] + The batches that this reader will return. + + Returns + ------- + reader : RecordBatchReader + """ class _RecordBatchStreamReader(RecordBatchReader): @property - def stats(self) -> ReadStats: ... + def stats(self) -> ReadStats: + """ + Current IPC read statistics. + """ class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... class RecordBatchWithMetadata(NamedTuple): + """RecordBatch with its custom metadata + + Parameters + ---------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + batch: RecordBatch custom_metadata: KeyValueMetadata class _RecordBatchFileReader(_Weakrefable): @property - def num_record_batches(self) -> int: ... - def get_batch(self, i: int) -> RecordBatch: ... + def num_record_batches(self) -> int: + """ + The number of record batches in the IPC file. + """ + def get_batch(self, i: int) -> RecordBatch: + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. 
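# --- Illustrative sketch (not part of the patch): random access to batches
# through the IPC file reader annotated above (_RecordBatchFileReader).
# pa.RecordBatch.from_pydict, pa.ipc.new_file and pa.ipc.open_file are
# standard pyarrow APIs that are not defined in this hunk.
import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_file(sink, batch.schema) as writer:
    writer.write_batch(batch)

reader = pa.ipc.open_file(sink.getvalue())
print(reader.num_record_batches)     # 1
print(reader.get_batch(0).num_rows)  # 3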
+ + Returns + ------- + batch : RecordBatch + """ get_record_batch = get_batch - def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... - def read_all(self) -> Table: ... + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: + """ + Read the record batch with the given index along with + its custom metadata + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table + """ read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_val, exc_tb): ... @@ -168,17 +579,105 @@ class _RecordBatchFileReader(_Weakrefable): @property def stats(self) -> ReadStats: ... -def get_tensor_size(tensor: Tensor) -> int: ... -def get_record_batch_size(batch: RecordBatch) -> int: ... -def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... -def read_tensor(source: NativeFile) -> Tensor: ... -def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... -def read_schema( - obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None -) -> Schema: ... +def get_tensor_size(tensor: Tensor) -> int: + """ + Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. + """ + +def get_record_batch_size(batch: RecordBatch) -> int: + """ + Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. + """ + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: + """ + Write pyarrow.Tensor to pyarrow.NativeFile object its current position. + + Parameters + ---------- + tensor : pyarrow.Tensor + dest : pyarrow.NativeFile + + Returns + ------- + bytes_written : int + Total number of bytes written to the file + """ + +def read_tensor(source: NativeFile) -> Tensor: + """Read pyarrow.Tensor from pyarrow.NativeFile object from current + position. If the file source supports zero copy (e.g. a memory map), then + this operation does not allocate any memory. This function not assume that + the stream is aligned + + Parameters + ---------- + source : pyarrow.NativeFile + + Returns + ------- + tensor : Tensor + + """ + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: + """ + Read length-prefixed message from file or buffer-like object + + Parameters + ---------- + source : pyarrow.NativeFile, file-like object, or buffer-like object + + Returns + ------- + message : Message + """ + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: + """ + Read Schema from message or buffer + + Parameters + ---------- + obj : buffer or Message + dictionary_memo : DictionaryMemo, optional + Needed to be able to reconstruct dictionary-encoded fields + with read_record_batch + + Returns + ------- + schema : Schema + """ + def read_record_batch( obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None -) -> RecordBatch: ... +) -> RecordBatch: + """ + Read RecordBatch from message, given a known schema. 
If reading data from a + complete IPC stream, use ipc.open_stream instead + + Parameters + ---------- + obj : Message or Buffer-like + schema : Schema + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + + Returns + ------- + batch : RecordBatch + """ __all__ = [ "MetadataVersion", diff --git a/pyarrow-stubs/__lib_pxi/memory.pyi b/pyarrow-stubs/__lib_pxi/memory.pyi index 755e689b5f0..57a3bb4f1b3 100644 --- a/pyarrow-stubs/__lib_pxi/memory.pyi +++ b/pyarrow-stubs/__lib_pxi/memory.pyi @@ -1,29 +1,160 @@ from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): - def release_unused(self) -> None: ... - def bytes_allocated(self) -> int: ... - def total_bytes_allocated(self) -> int: ... - def max_memory(self) -> int | None: ... - def num_allocations(self) -> int: ... - def print_stats(self) -> None: ... + """ + Base class for memory allocation. + + Besides tracking its number of allocated bytes, a memory pool also + takes care of the required 64-byte alignment for Arrow data. + """ + + def release_unused(self) -> None: + """ + Attempt to return to the OS any memory being held onto by the pool. + + This function should not be called except potentially for + benchmarking or debugging as it could be expensive and detrimental to + performance. + + This is best effort and may not have any effect on some memory pools + or in some situations (e.g. fragmentation). + """ + def bytes_allocated(self) -> int: + """ + Return the number of bytes that are currently allocated from this + memory pool. + """ + def total_bytes_allocated(self) -> int: + """ + Return the total number of bytes that have been allocated from this + memory pool. + """ + def max_memory(self) -> int | None: + """ + Return the peak memory allocation in this memory pool. + This can be an approximate number in multi-threaded applications. + + None is returned if the pool implementation doesn't know how to + compute this number. + """ + def num_allocations(self) -> int: + """ + Return the number of allocations or reallocations that were made + using this memory pool. + """ + def print_stats(self) -> None: + """ + Print statistics about this memory pool. + + The output format is implementation-specific. Not all memory pools + implement this method. + """ @property - def backend_name(self) -> str: ... + def backend_name(self) -> str: + """ + The name of the backend used by this MemoryPool (e.g. "jemalloc"). + """ class LoggingMemoryPool(MemoryPool): ... class ProxyMemoryPool(MemoryPool): ... -def default_memory_pool() -> MemoryPool: ... -def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... -def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... -def system_memory_pool() -> MemoryPool: ... -def jemalloc_memory_pool() -> MemoryPool: ... -def mimalloc_memory_pool() -> MemoryPool: ... -def set_memory_pool(pool: MemoryPool) -> None: ... -def log_memory_allocations(enable: bool = True) -> None: ... -def total_allocated_bytes() -> int: ... -def jemalloc_set_decay_ms(decay_ms: int) -> None: ... -def supported_memory_backends() -> list[str]: ... +def default_memory_pool() -> MemoryPool: + """ + Return the process-global memory pool. + + Examples + -------- + >>> default_memory_pool() + + """ + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but with separate allocation statistics. 
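# --- Illustrative sketch (not part of the patch): inspecting allocation
# statistics on the default pool via the MemoryPool methods annotated above.
# Exact numbers depend on the allocator backend in use.
import pyarrow as pa

pool = pa.default_memory_pool()
before = pool.bytes_allocated()

arr = pa.array(range(1_000_000))  # allocates from the default pool
print(pool.backend_name)
print(pool.bytes_allocated() - before)  # roughly the bytes held by `arr`
print(pool.max_memory())                # peak allocation, may be None

del arr
pool.release_unused()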
+ + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def system_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the C malloc heap. + """ + +def jemalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the jemalloc heap. + + NotImplementedError is raised if jemalloc support is not enabled. + """ + +def mimalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the mimalloc heap. + + NotImplementedError is raised if mimalloc support is not enabled. + """ + +def set_memory_pool(pool: MemoryPool) -> None: + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ + +def log_memory_allocations(enable: bool = True) -> None: + """ + Enable or disable memory allocator logging for debugging purposes + + Parameters + ---------- + enable : bool, default True + Pass False to disable logging + """ + +def total_allocated_bytes() -> int: + """ + Return the currently allocated bytes from the default memory pool. + Other memory pools may not be accounted for. + """ + +def jemalloc_set_decay_ms(decay_ms: int) -> None: + """ + Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of + milliseconds. A value of 0 (the default) results in dirty / muzzy memory + pages being released right away to the OS, while a higher value will result + in a time-based decay. See the jemalloc docs for more information + + It's best to set this at the start of your application. + + Parameters + ---------- + decay_ms : int + Number of milliseconds to set for jemalloc decay conf parameters. Note + that this change will only affect future memory arenas + """ + +def supported_memory_backends() -> list[str]: + """ + Return a list of available memory pool backends + """ __all__ = [ "MemoryPool", diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index e9f4ca02e27..38d13679dec 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -34,10 +34,19 @@ _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") class Scalar(_Weakrefable, Generic[_DataTypeT]): + """ + The base class for scalars. + """ @property - def type(self) -> _DataTypeT: ... + def type(self) -> _DataTypeT: + """ + Data type of the Scalar object. + """ @property - def is_valid(self) -> bool: ... + def is_valid(self) -> bool: + """ + Holds a valid (non-null) value. + """ @overload def cast( self, @@ -54,7 +63,43 @@ class Scalar(_Weakrefable, Generic[_DataTypeT]): options: CastOptions | None = None, memory_pool: MemoryPool | None = None, ) -> Scalar[_DataTypeT]: ... - def validate(self, *, full: bool = False) -> None: ... + def cast(self, *args, **kwargs): + """ + Cast scalar value to another data type. + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast scalar to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
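# --- Illustrative sketch (not part of the patch): the Scalar properties and
# cast() method annotated above, using only public pyarrow API.
import pyarrow as pa

s = pa.scalar(42)
print(s.type)            # int64
print(s.is_valid)        # True
print(s.cast(pa.int8()))

null = pa.scalar(None, type=pa.int64())
print(null.is_valid)     # False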
+ options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + scalar : A Scalar of the given target data type. + """ + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ def equals(self, other: Scalar) -> bool: ... def __hash__(self) -> int: ... @overload @@ -125,6 +170,24 @@ class Scalar(_Weakrefable, Generic[_DataTypeT]): *, maps_as_pydicts: Literal["lossy", "strict"] | None = None, ) -> Any: ... + def as_py(self, *args, **kwargs): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + """ _NULL: TypeAlias = None NA = _NULL @@ -276,7 +339,21 @@ class ExtensionScalar(Scalar[types.ExtensionType]): @property def value(self) -> Any | None: ... @staticmethod - def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: ... + def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: + """ + Construct ExtensionScalar from type and storage value. + + Parameters + ---------- + typ : DataType + The extension type for the result scalar. + value : object + The storage value for the result scalar. + + Returns + ------- + ext_scalar : ExtensionScalar + """ class Bool8Scalar(Scalar[types.Bool8Type]): ... class UuidScalar(Scalar[types.UuidType]): ... @@ -284,8 +361,30 @@ class JsonScalar(Scalar[types.JsonType]): ... class OpaqueScalar(Scalar[types.OpaqueType]): ... class FixedShapeTensorScalar(ExtensionScalar): - def to_numpy(self) -> np.ndarray: ... - def to_tensor(self) -> Tensor: ... + def to_numpy(self) -> np.ndarray: + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ _V = TypeVar("_V") @@ -822,6 +921,45 @@ def scalar( from_pandas: bool | None = None, memory_pool: MemoryPool | None = None, ) -> Scalar[_DataTypeT]: ... +def scalar(*args, **kwargs): + """ + Create a pyarrow.Scalar instance from a Python object. + + Parameters + ---------- + value : Any + Python object coercible to arrow's type system. 
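# --- Illustrative sketch (not part of the patch): the maps_as_pydicts option
# of as_py() described above, shown on a map-typed scalar. The keyword is only
# available on recent pyarrow versions.
import pyarrow as pa

m = pa.scalar([("a", 1), ("b", 2)], type=pa.map_(pa.string(), pa.int64()))
print(m.as_py())                          # [('a', 1), ('b', 2)]
print(m.as_py(maps_as_pydicts="strict"))  # {'a': 1, 'b': 2}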
+ type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the value. + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. Defaults to False if not passed explicitly by user, + or True if a pandas object is passed in. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + scalar : pyarrow.Scalar + + Examples + -------- + >>> import pyarrow as pa + + >>> pa.scalar(42) + + + >>> pa.scalar("string") + + + >>> pa.scalar([1, 2]) + + + >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) + + """ __all__ = [ "Scalar", diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 67e19286add..97a6ede39d9 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -119,11 +119,67 @@ NarySelector: TypeAlias = list[str] | tuple[str, ...] ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): + """ + An array-like composed from a (possibly empty) collection of pyarrow.Arrays + + Warnings + -------- + Do not call this class's constructor directly. + + Examples + -------- + To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: + + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) + True + """ + @property def data(self) -> Self: ... @property - def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... - def length(self) -> int: ... + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: + """ + Return data type of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + """ + def length(self) -> int: + """ + Return length of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.length() + 6 + """ __len__ = length def to_string( self, @@ -132,26 +188,284 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): window: int = 5, container_window: int = 2, skip_new_lines: bool = False, - ) -> str: ... + ) -> str: + """ + Render a "pretty-printed" string representation of the ChunkedArray + + Parameters + ---------- + indent : int + How much to indent right the content of the array, + by default ``0``. + window : int + How many items to preview within each chunk at the begin and end + of the chunk when the chunk is bigger than the window. + The other elements will be ellipsed. + container_window : int + How many chunks to preview at the begin and end + of the array when the array is bigger than the window. + The other elements will be ellipsed. + This setting also applies to list columns. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_string(skip_new_lines=True) + '[[2,2,4],[4,5,100]]' + """ format = to_string - def validate(self, *, full: bool = False) -> None: ... 
+ def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ @property - def null_count(self) -> int: ... + def null_count(self) -> int: + """ + Number of null entries + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.null_count + 1 + """ @property - def nbytes(self) -> int: ... - def get_total_buffer_size(self) -> int: ... + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the chunked array. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.nbytes + 49 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the chunked array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.get_total_buffer_size() + 49 + """ def __sizeof__(self) -> int: ... @overload def __getitem__(self, key: slice) -> Self: ... @overload def __getitem__(self, key: int) -> _ScalarT: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or ChunkedArray (slice) + """ def getitem(self, i: int) -> Scalar: ... - def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: ... - def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: ... - def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: ... - def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: ... - def equals(self, other: Self) -> bool: ... - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_null() + + [ + [ + false, + false, + false, + false, + true, + false + ] + ] + """ + def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the NaN values. 
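# --- Illustrative sketch (not part of the patch): null handling on a
# ChunkedArray with the properties and methods annotated above.
import pyarrow as pa

arr = pa.chunked_array([[2.0, None], [float("nan"), 4.0]])
print(arr.null_count)                 # 1 (NaN is not a null)
print(arr.is_null())                  # marks nulls only
print(arr.is_null(nan_is_null=True))  # marks nulls and NaNs
print(arr[3])                         # scalar at index 3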
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) + >>> arr.is_nan() + + [ + [ + false, + true, + false, + false, + null, + false + ] + ] + """ + def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the non-null values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_valid() + + [ + [ + true, + true, + true + ], + [ + true, + false, + true + ] + ] + """ + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + """ + Replace each null element in values with fill_value. + + See :func:`pyarrow.compute.fill_null` for full usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. + + Examples + -------- + >>> import pyarrow as pa + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.fill_null(fill_value) + + [ + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + ] + """ + def equals(self, other: Self) -> bool: + """ + Return whether the contents of two chunked arrays are equal. + + Parameters + ---------- + other : pyarrow.ChunkedArray + Chunked array to compare against. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> n_legs.equals(n_legs) + True + >>> n_legs.equals(animals) + False + """ + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + """ + Return a NumPy copy of this array (experimental). + + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. + + Returns + ------- + array : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_numpy() + array([ 2, 2, 4, 4, 5, 100]) + """ def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... @overload def cast( @@ -164,15 +478,386 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def cast( self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None ) -> ChunkedArray[Scalar[_CastAs]]: ... - def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... - def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: ... - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_ScalarT]: ... - def unique(self) -> ChunkedArray[_ScalarT]: ... - def value_counts(self) -> StructArray: ... - def slice(self, offset: int = 0, length: int | None = None) -> Self: ... - def filter( - self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop" - ) -> Self: ... + def cast(self, *args, **kwargs): + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
+ options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + + Change the data type of an array: + + >>> n_legs_seconds = n_legs.cast(pa.duration("s")) + >>> n_legs_seconds.type + DurationType(duration[s]) + """ + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : ChunkedArray + A dictionary-encoded version of this array. + + Examples + -------- + >>> import pyarrow as pa + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> animals.dictionary_encode() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : list of ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> c_arr = pa.chunked_array(n_legs.value_counts()) + >>> c_arr + + [ + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + ] + >>> c_arr.flatten() + [ + [ + [ + 2, + 4, + 5, + 100 + ] + ], + [ + [ + 2, + 2, + 1, + 1 + ] + ]] + >>> c_arr.type + StructType(struct) + >>> n_legs.type + DataType(int64) + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_ScalarT]: + """ + Flatten this ChunkedArray into a single non-chunked array. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.combine_chunks() + + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + """ + def unique(self) -> ChunkedArray[_ScalarT]: + """ + Compute distinct elements in array + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.unique() + + [ + 2, + 4, + 5, + 100 + ] + """ + def value_counts(self) -> StructArray: + """ + Compute counts of unique elements in array. 
+ + Returns + ------- + An array of structs + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.value_counts() + + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this ChunkedArray + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.slice(2, 2) + + [ + [ + 4 + ], + [ + 4 + ] + ] + """ + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> mask = pa.array([True, False, None, True, False, True]) + >>> n_legs.filter(mask) + + [ + [ + 2 + ], + [ + 4, + 100 + ] + ] + >>> n_legs.filter(mask, null_selection_behavior="emit_null") + + [ + [ + 2, + null + ], + [ + 4, + 100 + ] + ] + """ @overload def index( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], @@ -191,15 +876,318 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): *, memory_pool: MemoryPool | None = None, ) -> Int64Scalar: ... - def take(self, indices: Indices) -> Self: ... - def drop_null(self) -> Self: ... - def sort(self, order: Order = "ascending", **kwargs) -> Self: ... - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.index(4) + + >>> n_legs.index(4, start=3) + + """ + def take(self, indices: Indices) -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. 
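+            Indices refer to positions in the logical, concatenated array and
+            may therefore span chunk boundaries.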
+ + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.take([1, 4, 5]) + + [ + [ + 2, + 5, + 100 + ] + ] + """ + def drop_null(self) -> Self: + """ + Remove missing values from a chunked array. + See :func:`pyarrow.compute.drop_null` for full description. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.drop_null() + + [ + [ + 2, + 2 + ], + [ + 4, + 5, + 100 + ] + ] + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the ChunkedArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : ChunkedArray + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent chunked array, but where all + chunks share the same dictionary values. Dictionary indices are + transposed accordingly. + + If there are no dictionaries in the chunked array, it is returned + unchanged. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> c_arr + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ] + ] + >>> c_arr.unify_dictionaries() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ @property - def num_chunks(self) -> int: ... - def chunk(self, i: int) -> ChunkedArray[_ScalarT]: ... + def num_chunks(self) -> int: + """ + Number of underlying chunks. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.num_chunks + 2 + """ + def chunk(self, i: int) -> ChunkedArray[_ScalarT]: + """ + Select a chunk by its index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.chunk(1) + + [ + 4, + 5, + 100 + ] + """ @property - def chunks(self) -> list[Array[_ScalarT]]: ... + def chunks(self) -> list[Array[_ScalarT]]: + """ + Convert to a list of single-chunked arrays. 
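+
+        Returns
+        -------
+        list of Array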
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.chunks + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ]] + """ @overload def iterchunks( self: ChunkedArray[scalar.NullScalar], @@ -231,7 +1219,20 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): @overload def iterchunks( self: ChunkedArray[scalar.Int32Scalar], - ) -> Generator[array.Int32Array, None, None]: ... + ) -> Generator[array.Int32Array, None, None]: + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ @overload def iterchunks( self: ChunkedArray[scalar.UInt64Scalar], @@ -372,17 +1373,85 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): def iterchunks( self: ChunkedArray[scalar.OpaqueScalar], ) -> Generator[array.OpaqueArray, None, None]: ... + def iterchunks(self): + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ def __iter__(self) -> Iterator[_ScalarT]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], *, maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: ... - def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.to_pylist() + [2, 2, 4, 4, None, 100] + """ + def __arrow_c_stream__(self, requested_schema=None) -> Any: + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ @classmethod - def _import_from_c_capsule(cls, stream) -> Self: ... + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether all chunks in the ChunkedArray are CPU-accessible. 
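+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        A chunked array built from in-memory Python lists is CPU-backed:
+
+        >>> import pyarrow as pa
+        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+        >>> n_legs.is_cpu
+        True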
+ """ @overload def chunked_array( @@ -629,6 +1698,47 @@ def chunked_array( values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, type: Literal["month_day_nano_interval"], ) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +def chunked_array(value, type=None): + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : Array, list of Array, or array-like + Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + """ _ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) @@ -636,154 +1746,2404 @@ class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: ... + ) -> _PyArrowDataFrame: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ @overload def __getitem__(self, key: int | str) -> _ColumnT: ... @overload def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return column at given index or column name + + Parameters + ---------- + key : integer, str, or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + Array (from RecordBatch) or ChunkedArray (from Table) for column input. + RecordBatch or Table for slice input. + """ def __len__(self) -> int: ... - def column(self, i: int | str) -> _ColumnT: ... + def column(self, i: int | str) -> _ColumnT: + """ + Select single column from Table or RecordBatch. + + Parameters + ---------- + i : int or string + The index or name of the column to retrieve. + + Returns + ------- + column : Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Select a column by numeric index: + + >>> table.column(0) + + [ + [ + 2, + 4, + 5, + 100 + ] + ] + + Select a column by its name: + + >>> table.column("animals") + + [ + [ + "Flamingo", + "Horse", + "Brittle stars", + "Centipede" + ] + ] + """ @property - def column_names(self) -> list[str]: ... + def column_names(self) -> list[str]: + """ + Names of the Table or RecordBatch columns. + + Returns + ------- + list of str + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=["n_legs", "animals"], + ... ) + >>> table.column_names + ['n_legs', 'animals'] + """ @property - def columns(self) -> list[_ColumnT]: ... - def drop_null(self) -> Self: ... - def field(self, i: int | str) -> Field: ... + def columns(self) -> list[_ColumnT]: + """ + List of all columns in numerical order. + + Returns + ------- + columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.columns + [ + [ + [ + null, + 4, + 5, + null + ] + ], + [ + [ + "Flamingo", + "Horse", + null, + "Centipede" + ] + ]] + """ + def drop_null(self) -> Self: + """ + Remove rows that contain missing values from a Table or RecordBatch. + + See :func:`pyarrow.compute.drop_null` for full usage. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, with rows containing + no missing values. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [None, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", None, "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.drop_null() + pyarrow.Table + year: double + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def field(self, i: int | str) -> Field: + """ + Select a schema field by its column name or numeric index. + + Parameters + ---------- + i : int or string + The index or name of the field to retrieve. + + Returns + ------- + Field + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.field(0) + pyarrow.Field + >>> table.field(1) + pyarrow.Field + """ @classmethod def from_pydict( cls, mapping: Mapping[str, ArrayOrChunkedArray[Any] | list | np.ndarray], schema: Schema | None = None, metadata: Mapping | None = None, - ) -> Self: ... + ) -> Self: + """ + Construct a Table or RecordBatch from Arrow arrays or columns. + + Parameters + ---------- + mapping : dict or Mapping + A mapping of strings to Arrays or Python lists. + schema : Schema, default None + If not passed, will be inferred from the Mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> pydict = {"n_legs": n_legs, "animals": animals} + + Construct a Table from a dictionary of arrays: + + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a dictionary of arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pydict(pydict, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ @classmethod def from_pylist( cls, mapping: Sequence[Mapping[str, Any]], schema: Schema | None = None, metadata: Mapping | None = None, - ) -> Self: ... - def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + ) -> Self: + """ + Construct a Table or RecordBatch from list of rows / dictionaries. + + Parameters + ---------- + mapping : list of dicts of rows + A mapping of strings to row values. + schema : Schema, default None + If not passed, will be inferred from the first row of the + mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + + Construct a Table from a list of rows: + + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4]] + animals: [["Flamingo","Dog"]] + + Construct a Table from a list of rows with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def itercolumns(self) -> Generator[_ColumnT, None, None]: + """ + Iterator over all columns in their numerical order. + + Yields + ------ + Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> for i in table.itercolumns(): + ... 
print(i.null_count) + 2 + 1 + """ @property def num_columns(self) -> int: ... @property def num_rows(self) -> int: ... @property - def shape(self) -> tuple[int, int]: ... + def shape(self) -> tuple[int, int]: + """ + Dimensions of the table or record batch: (#rows, #columns). + + Returns + ------- + (int, int) + Number of rows and number of columns. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table.shape + (4, 2) + """ @property def schema(self) -> Schema: ... @property def nbytes(self) -> int: ... - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... - def take(self, indices: Indices) -> Self: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: + """ + Sort the Table or RecordBatch by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + Table or RecordBatch + A new tabular object sorted according to the sort keys. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.sort_by("animal") + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,2021,2021,2020,2022,2022]] + n_legs: [[5,100,4,2,4,2]] + animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] + """ + def take(self, indices: Indices) -> Self: + """ + Select rows from a Table or RecordBatch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the tabular object whose rows will be returned. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, containing the taken rows. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.take([1, 3]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ def filter( self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" - ) -> Self: ... + ) -> Self: + """ + Select rows from the table or record batch based on a boolean mask. + + The Table can be filtered based on a mask, which will be passed to + :func:`pyarrow.compute.filter` to perform the filtering, or it can + be filtered through a boolean :class:`.Expression` + + Parameters + ---------- + mask : Array or array-like or .Expression + The boolean mask or the :class:`.Expression` to filter the table with. 
+ null_selection_behavior : str, default "drop" + How nulls in the mask should be handled, does nothing if + an :class:`.Expression` is used. + + Returns + ------- + filtered : Table or RecordBatch + A tabular object of the same schema, with only the rows selected + by applied filtering + + Examples + -------- + Using a Table (works similarly for RecordBatch): + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Define an expression and select rows: + + >>> import pyarrow.compute as pc + >>> expr = pc.field("year") <= 2020 + >>> table.filter(expr) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2019]] + n_legs: [[2,5]] + animals: [["Flamingo","Brittle stars"]] + + Define a mask and select rows: + + >>> mask = [True, True, False, None] + >>> table.filter(mask) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022]] + n_legs: [[2,4]] + animals: [["Flamingo","Horse"]] + >>> table.filter(mask, null_selection_behavior="emit_null") + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,null]] + n_legs: [[2,4,null]] + animals: [["Flamingo","Horse",null]] + """ def to_pydict( self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> dict[str, list]: ... + ) -> dict[str, list]: + """ + Convert the Table or RecordBatch to a dict or OrderedDict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + dict + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> table.to_pydict() + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + """ def to_pylist( self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> list[dict[str, Any]]: ... - def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: ... + ) -> list[dict[str, Any]]: + """ + Convert the Table or RecordBatch to a list of rows / dictionaries. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. 
+ If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + list + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] + >>> table = pa.table(data, names=["n_legs", "animals"]) + >>> table.to_pylist() + [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... + """ + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: + """ + Return human-readable string representation of Table or RecordBatch. + + Parameters + ---------- + show_metadata : bool, default False + Display Field-level and Schema-level KeyValueMetadata. + preview_cols : int, default 0 + Display values of the columns for the first N columns. + + Returns + ------- + str + """ def remove_column(self, i: int) -> Self: ... - def drop_columns(self, columns: str | list[str]) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new Table or RecordBatch. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Raises + ------ + KeyError + If any of the passed column names do not exist. + + Returns + ------- + Table or RecordBatch + A tabular object without the column(s). + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Drop one column: + + >>> table.drop_columns("animals") + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + + Drop one or more columns: + + >>> table.drop_columns(["n_legs", "animals"]) + pyarrow.Table + ... + ---- + """ def add_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list ) -> Self: ... - def append_column( - self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: ... + def append_column(self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list) -> Self: + """ + Append column at end of columns. -class RecordBatch(_Tabular[Array]): - def validate(self, *, full: bool = False) -> None: ... - def replace_schema_metadata(self, metadata: dict | None = None) -> Self: ... - def get_total_buffer_size(self) -> int: ... - def __sizeof__(self) -> int: ... - def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... - @overload - def rename_columns(self, names: list[str]) -> Self: ... - @overload - def rename_columns(self, names: dict[str, str]) -> Self: ... - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... - def slice(self, offset: int = 0, length: int | None = None) -> Self: ... - def equals(self, other: Self, check_metadata: bool = False) -> bool: ... - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... - def cast( - self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: ... - @classmethod - def from_arrays( - cls, - arrays: Collection[Array], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping | None = None, - ) -> Self: ... 
- @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = None, - preserve_index: bool | None = None, - nthreads: int | None = None, - columns: list[str] | None = None, - ) -> Self: ... - @classmethod - def from_struct_array( - cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: ... - def to_struct_array(self) -> StructArray: ... - def to_tensor( - self, - null_to_nan: bool = False, - row_major: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Tensor: ... - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... - @classmethod - def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... - def __arrow_c_array__(self, requested_schema=None): ... - def __arrow_c_stream__(self, requested_schema=None): ... - @classmethod - def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... - @classmethod - def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... - def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... - @classmethod - def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... - @property - def device_type(self) -> DeviceAllocationType: ... - @property - def is_cpu(self) -> bool: ... - def copy_to(self, destination: MemoryManager | Device) -> Self: ... + Parameters + ---------- + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. -def table_to_blocks(options, table: Table, categories, extension_columns): ... + Returns + ------- + Table or RecordBatch + New table or record batch with the passed column added. -JoinType: TypeAlias = Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", -] + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Append column at the end: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.append_column("year", [year]) + pyarrow.Table + n_legs: int64 + animals: string + year: int64 + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + year: [[2021,2022,2019,2021]] + """ + +class RecordBatch(_Tabular[Array]): + """ + Batch of rows of columns of equal length + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatch.from_*`` functions instead. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Constructing a RecordBatch from arrays: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Constructing a RecordBatch from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... 
"year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_pandas(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Constructing a RecordBatch from pylist: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + >>> pa.RecordBatch.from_pylist(pylist).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Dog + + You can also construct a RecordBatch using :func:`pyarrow.record_batch`: + + >>> pa.record_batch([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of record batch by replacing schema + key-value metadata with the indicated new metadata (which may be None, + which deletes any existing metadata + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + shallow_copy : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + + Constructing a RecordBatch with schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) + >>> batch.schema + n_legs: int64 + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Shallow copy of a RecordBatch with deleted schema metadata: + + >>> batch.replace_schema_metadata().schema + n_legs: int64 + """ + @property + def num_columns(self) -> int: + """ + Number of columns + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_columns + 2 + """ + + @property + def num_rows(self) -> int: + """ + Number of rows + + Due to the definition of a RecordBatch, all columns have the same + number of rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... 
["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_rows + 6 + """ + @property + def schema(self) -> Schema: + """ + Schema of the RecordBatch and its columns + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.schema + n_legs: int64 + animals: string + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the record batch. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.nbytes + 116 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the record batch + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.get_total_buffer_size() + 120 + """ + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to RecordBatch at position i. + + A new record batch is returned with the column added, the original record batch + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> batch = pa.RecordBatch.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.add_column(0, "year", year) + pyarrow.RecordBatch + year: int64 + n_legs: int64 + animals: string + ---- + year: [2021,2022,2019,2021] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Original record batch is left unchanged: + + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def remove_column(self, i: int) -> Self: + """ + Create new RecordBatch with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New record batch without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.remove_column(1) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: + """ + Replace column in RecordBatch at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.set_column(1, "year", year) + pyarrow.RecordBatch + n_legs: int64 + year: int64 + ---- + n_legs: [2,4,5,100] + year: [2021,2022,2019,2021] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new record batch with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> new_names = ["n", "name"] + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write RecordBatch to Buffer as encapsulated IPC message, which does not + include a Schema. + + To reconstruct a RecordBatch from the encapsulated IPC message Buffer + returned by this function, a Schema must be passed separately. See + Examples. + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> buf = batch.serialize() + >>> buf + + + Reconstruct RecordBatch from IPC message Buffer and original Schema + + >>> pa.ipc.read_record_batch(buf, batch.schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of record batch to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> batch.slice(offset=3).to_pandas() + n_legs animals + 0 4 Horse + 1 5 Brittle stars + 2 100 Centipede + >>> batch.slice(length=2).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + >>> batch.slice(offset=3, length=1).to_pandas() + n_legs animals + 0 4 Horse + """ + def equals(self, other: Self, check_metadata: bool = False) -> bool: + """ + Check if contents of two record batches are equal. + + Parameters + ---------- + other : pyarrow.RecordBatch + RecordBatch to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch_0 = pa.record_batch([]) + >>> batch_1 = pa.RecordBatch.from_arrays( + ... [n_legs, animals], + ... names=["n_legs", "animals"], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> batch.equals(batch) + True + >>> batch.equals(batch_0) + False + >>> batch.equals(batch_1) + True + >>> batch.equals(batch_1, check_metadata=True) + False + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the RecordBatch. + + Returns a new RecordBatch with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + + Select columns my indices: + + >>> batch.select([1]) + pyarrow.RecordBatch + animals: string + ---- + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Select columns by names: + + >>> batch.select(["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,2,4,4,5,100] + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast record batch values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast batch values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> batch.cast(target_schema=my_schema) + pyarrow.RecordBatch + n_legs: duration[s] + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[Array], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a RecordBatch from multiple pyarrow.Arrays + + Parameters + ---------- + arrays : list of pyarrow.Array + One for each field in RecordBatch + names : list of str, optional + Names for the batch fields. If not passed, schema must be passed + schema : Schema, default None + Schema for the created batch. If not passed, names must be passed + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... 
) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from pyarrow Arrays using names: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Construct a RecordBatch from pyarrow Arrays using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the RecordBatch. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``RecordBatch``. The default of None will store the index as a + column, except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + + Returns + ------- + pyarrow.RecordBatch + + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Convert pandas DataFrame to RecordBatch: + + >>> import pyarrow as pa + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> pa.RecordBatch.from_pandas(df, schema=my_schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch specifying columns: + + >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a RecordBatch from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``RecordBatch``. + + Parameters + ---------- + struct_array : StructArray + Array to construct the record batch from. + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.RecordBatch.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array(self) -> StructArray: + """ + Convert to a struct array. + """ + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: + """ + Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` + set to ``True``. In this case null values are converted to ``NaN`` and integer type + arrays are promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], + ... names=["a", "b"], + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + Convert a RecordBatch to row-major Tensor with null values + written as ``NaN``s + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a RecordBatch to column-major Tensor + + >>> batch.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. 
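+            If zero (the default), the schema is not exported.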
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the batch to this schema. + If None, the batch will be returned as-is, with a schema matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the batch as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema + and ArrowArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowDeviceArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. 
PyArrow will attempt to cast the batch to this data type. + If None, the batch will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a + C ArrowSchema and ArrowDeviceArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowDeviceArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the arrays in the RecordBatch reside. + + Returns + ------- + DeviceAllocationType + """ + @property + def is_cpu(self) -> bool: + """ + Whether the RecordBatch's arrays are CPU-accessible. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Copy the entire RecordBatch to destination device. + + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] class Table(_Tabular[ChunkedArray[Any]]): - def validate(self, *, full=False) -> None: ... - def slice(self, offset=0, length=None) -> Self: ... - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: ... - def replace_schema_metadata(self, metadata: dict | None = None) -> Self: ... - def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... - def equals(self, other: Self, check_metadata: bool = False) -> Self: ... + """ + A collection of top-level named, equal length Arrow arrays. + + Warnings + -------- + Do not call this class's constructor directly, use one of the ``from_*`` + methods instead. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a RecordBatch: + + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a dictionary of arrays: + + >>> pydict = {"n_legs": n_legs, "animals": animals} + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,null]] + animals: [["Flamingo","Centipede"]] + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [ + ... pa.field("year", pa.int64()), + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... ], + ... metadata={"year": "Year of entry"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + year: int64 + n_legs: int64 + animals: string + -- schema metadata -- + year: 'Year of entry' + + Construct a Table with :func:`pyarrow.table`: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + + def validate(self, *, full=False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def slice(self, offset=0, length=None) -> Self: + """ + Compute zero-copy slice of this Table. + + Parameters + ---------- + offset : int, default 0 + Offset from start of table to slice. + length : int, default None + Length of slice (default is until end of table starting from + offset). 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.slice(length=3) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019]] + n_legs: [[2,4,5]] + animals: [["Flamingo","Horse","Brittle stars"]] + >>> table.slice(offset=2) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019,2021]] + n_legs: [[5,100]] + animals: [["Brittle stars","Centipede"]] + >>> table.slice(offset=2, length=1) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019]] + n_legs: [[5]] + animals: [["Brittle stars"]] + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the Table. + + Returns a new Table with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.select([0, 1]) + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + >>> table.select(["year"]) + pyarrow.Table + year: int64 + ---- + year: [[2020,2022,2019,2021]] + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be None), + which deletes any existing metadata. + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Constructing a Table with pyarrow schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> table = pa.table(df, my_schema) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + + Create a shallow copy of a Table with deleted schema metadata: + + >>> table.replace_schema_metadata().schema + n_legs: int64 + animals: string + + Create a shallow copy of a Table with new schema metadata: + + >>> metadata = {"animals": "Which animal"} + >>> table.replace_schema_metadata(metadata=metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Which animal' + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Flatten this Table. + + Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. 
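+
+        The flattened columns are named by joining the struct column name
+        and the field name with a dot, e.g. a struct column ``a`` with a
+        field ``n_legs`` becomes a column ``a.n_legs`` (see the Examples
+        section below).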
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> month = pa.array([4, 6]) + >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) + >>> table + pyarrow.Table + a: struct + child 0, animals: string + child 1, n_legs: int64 + child 2, year: int64 + month: int64 + ---- + a: [ + -- is_valid: all not null + -- child 0 type: string + ["Parrot",null] + -- child 1 type: int64 + [2,4] + -- child 2 type: int64 + [null,2022]] + month: [[4,6]] + + Flatten the columns with struct field: + + >>> table.flatten() + pyarrow.Table + a.animals: string + a.n_legs: int64 + a.year: int64 + month: int64 + ---- + a.animals: [["Parrot",null]] + a.n_legs: [[2,4]] + a.year: [[null,2022]] + month: [[4,6]] + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + >>> table.combine_chunks() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4,4,5,100]] + animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent table, but where all chunks of + each column share the same dictionary values. Dictionary indices + are transposed accordingly. + + Columns without dictionaries are returned unchanged. 
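+
+        In the Examples section below, note how the second chunk's indices
+        change from ``[0,1,2]`` to ``[3,4,5]`` after unification, while the
+        decoded values stay the same.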
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> table = pa.table([c_arr], names=["animals"]) + >>> table + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog"] -- indices: + [0,1,2], -- dictionary: + ["Horse","Brittle stars","Centipede"] -- indices: + [0,1,2]] + + Unify dictionaries across both chunks: + + >>> table.unify_dictionaries() + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [0,1,2], -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [3,4,5]] + """ + def equals(self, other: Self, check_metadata: bool = False) -> Self: + """ + Check if contents of two tables are equal. + + Parameters + ---------- + other : pyarrow.Table + Table to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.Table.from_arrays([n_legs, animals], names=names) + >>> table_0 = pa.Table.from_arrays([]) + >>> table_1 = pa.Table.from_arrays( + ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} + ... ) + >>> table.equals(table) + True + >>> table.equals(table_0) + False + >>> table.equals(table_1) + True + >>> table.equals(table_1, check_metadata=True) + False + """ def cast( self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: ... + ) -> Self: + """ + Cast table values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast table values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> table.cast(target_schema=my_schema) + pyarrow.Table + n_legs: duration[s] + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ @classmethod def from_pandas( cls, @@ -793,7 +4153,70 @@ class Table(_Tabular[ChunkedArray[Any]]): nthreads: int | None = None, columns: list[str] | None = None, safe: bool = True, - ) -> Self: ... + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow Table. 
+ + The column types in the resulting Arrow Table are inferred from the + dtypes of the pandas.Series in the DataFrame. In the case of non-object + Series, the NumPy dtype is translated to its Arrow equivalent. In the + case of `object`, we need to guess the datatype by looking at the + Python objects in this Series. + + Be aware that Series of the `object` dtype don't carry enough + information to always lead to a meaningful Arrow type. In the case that + we cannot infer a type, e.g. because the DataFrame is of length 0 or + the Series only contains None/nan objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``Table``. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + safe : bool, default True + Check for overflows or other unsafe conversions. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ @classmethod def from_arrays( cls, @@ -801,31 +4224,634 @@ class Table(_Tabular[ChunkedArray[Any]]): names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping | None = None, - ) -> Self: ... + ) -> Self: + """ + Construct a Table from Arrow arrays. + + Parameters + ---------- + arrays : list of pyarrow.Array or pyarrow.ChunkedArray + Equal-length arrays that should form the table. + names : list of str, optional + Names for the table columns. If not passed, schema must be passed. + schema : Schema, default None + Schema for the created table. If not passed, names must be passed. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"animals": "Name of the animal species"}, + ... ) + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Name of the animal species' + """ @classmethod def from_struct_array( cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: ... + ) -> Self: + """ + Construct a Table from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``Table``. + + Parameters + ---------- + struct_array : StructArray or ChunkedArray + Array to construct the table from. + + Returns + ------- + pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.Table.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ def to_struct_array( self, max_chunksize: int | None = None - ) -> ChunkedArray[scalar.StructScalar]: ... + ) -> ChunkedArray[scalar.StructScalar]: + """ + Convert to a chunked array of struct type. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for ChunkedArray chunks. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + ChunkedArray + """ @classmethod - def from_batches( - cls, batches: Iterable[RecordBatch], schema: Schema | None = None - ) -> Self: ... - def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... - def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... - def get_total_buffer_size(self) -> int: ... + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: + """ + Construct a Table from a sequence or iterator of Arrow RecordBatches. + + Parameters + ---------- + batches : sequence or iterator of RecordBatch + Sequence of RecordBatch to be converted, all schemas must be equal. + schema : Schema, default None + If not passed, will be inferred from the first RecordBatch. 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Construct a Table from a RecordBatch: + + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a sequence of RecordBatches: + + >>> pa.Table.from_batches([batch, batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: + """ + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + list[RecordBatch] + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatch: + + >>> table.to_batches()[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Convert a Table to a list of RecordBatches: + + >>> table.to_batches(max_chunksize=2)[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + >>> table.to_batches(max_chunksize=2)[1].to_pandas() + n_legs animals + 0 5 Brittle stars + 1 100 Centipede + """ + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + RecordBatchReader + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatchReader: + + >>> table.to_reader() + + + >>> reader = table.to_reader() + >>> reader.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + >>> reader.read_all() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @property + def schema(self) -> Schema: + """ + Schema of the table and its columns. + + Returns + ------- + Schema + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... + """ + @property + def num_columns(self) -> int: + """ + Number of columns in this table. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_columns + 2 + """ + @property + def num_rows(self) -> int: + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_rows + 4 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the table. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.nbytes + 72 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the table. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.get_total_buffer_size() + 76 + """ def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.add_column(0, "year", [year]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2021,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Original table is left unchanged: + + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def remove_column(self, i: int) -> Self: + """ + Create new Table with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New table without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.remove_column(1) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + """ def set_column( self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: ... + ) -> Self: + """ + Replace column in Table at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.set_column(1, "year", [year]) + pyarrow.Table + n_legs: int64 + year: int64 + ---- + n_legs: [[2,4,5,100]] + year: [[2021,2022,2019,2021]] + """ @overload def rename_columns(self, names: list[str]) -> Self: ... @overload def rename_columns(self, names: dict[str, str]) -> Self: ... - def drop(self, columns: str | list[str]) -> Self: ... - def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: ... + def rename_columns(self, names): + """ + Create new table with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> new_names = ["n", "name"] + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def drop(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new table. + + Alias of Table.drop_columns, but kept for backwards compatibility. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Returns + ------- + Table + New table without the column(s). + """ + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: + """ + Declare a grouping over the columns of the table. + + Resulting grouping can then be used to perform aggregations + with a subsequent ``aggregate()`` method. + + Parameters + ---------- + keys : str or list[str] + Name of the columns that should be used as the grouping key. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the + default), no stable ordering of the output is guaranteed. + + Returns + ------- + TableGroupBy + + See Also + -------- + TableGroupBy.aggregate + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.group_by("year").aggregate([("n_legs", "sum")]) + pyarrow.Table + year: int64 + n_legs_sum: int64 + ---- + year: [[2020,2022,2021,2019]] + n_legs_sum: [[2,6,104,5]] + """ def join( self, right_table: Self, @@ -836,7 +4862,110 @@ class Table(_Tabular[ChunkedArray[Any]]): right_suffix: str | None = None, coalesce_keys: bool = True, use_threads: bool = True, - ) -> Self: ... + ) -> Self: + """ + Perform a join between this table and another one. + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + keys : str or list[str] + The columns from current table that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right tables have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right tables have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. + use_threads : bool, default True + Whether to use multithreading or not. 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) + >>> df2 = pd.DataFrame( + ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} + ... ) + >>> t1 = pa.Table.from_pandas(df1) + >>> t2 = pa.Table.from_pandas(df2) + + Left outer join: + + >>> t1.join(t2, "id").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2]] + year: [[2019,2020,2022]] + n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] + + Full outer join: + + >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2,4]] + year: [[2019,2020,2022,null]] + n_legs: [[5,null,null,100]] + animal: [["Brittle stars",null,null,"Centipede"]] + + Right outer join: + + >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") + pyarrow.Table + year: int64 + id: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,null]] + id: [[3,4]] + n_legs: [[5,100]] + animal: [["Brittle stars","Centipede"]] + + Right anti join + + >>> t1.join(t2, "id", join_type="right anti") + pyarrow.Table + id: int64 + n_legs: int64 + animal: string + ---- + id: [[4]] + n_legs: [[100]] + animal: [["Centipede"]] + """ def join_asof( self, right_table: Self, @@ -845,10 +4974,109 @@ class Table(_Tabular[ChunkedArray[Any]]): tolerance: int, right_on: str | list[str] | None = None, right_by: str | list[str] | None = None, - ) -> Self: ... - def __arrow_c_stream__(self, requested_schema=None): ... + ) -> Self: + """ + Perform an asof join between this table and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both tables must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + on : str + The column from current table that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current table that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row ``right.on - left.on <= tolerance``. The + ``tolerance`` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_table that should be used as the on key + on the join operation right side. 
+ When ``None`` use the same key name as the left table. + right_by : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + + Returns + ------- + Table + + Example + -------- + >>> import pyarrow as pa + >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) + >>> t2 = pa.table( + ... { + ... "id": [3, 4], + ... "year": [2020, 2021], + ... "n_legs": [5, 100], + ... "animal": ["Brittle stars", "Centipede"], + ... } + ... ) + + >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[1,3,2,3,3]] + year: [[2020,2021,2022,2022,2023]] + n_legs: [[null,5,null,5,null]] + animal: [[null,"Brittle stars",null,"Brittle stars",null]] + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the table as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ @property - def is_cpu(self) -> bool: ... + def is_cpu(self) -> bool: + """ + Whether all ChunkedArrays are CPU-accessible. + """ def record_batch( data: dict[str, list[Any] | Array[Any]] @@ -859,7 +5087,139 @@ def record_batch( names: list[str] | None = None, schema: Schema | None = None, metadata: Mapping[Any, Any] | None = None, -) -> RecordBatch: ... +) -> RecordBatch: + """ + Create a pyarrow.RecordBatch from another Python data structure or sequence + of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of Arrays, + a pandas DataFame, or any tabular object implementing the + Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or + ``__arrow_c_device_array__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the RecordBatch. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). 
+ + Returns + ------- + RecordBatch + + See Also + -------- + RecordBatch.from_arrays, RecordBatch.from_pandas, table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from a python dictionary: + + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Creating a RecordBatch from a list of arrays with names: + + >>> pa.record_batch([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Creating a RecordBatch from a list of arrays with names and metadata: + + >>> my_metadata = {"n_legs": "How many legs does an animal have?"} + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'How many legs does an animal have?' + + Creating a RecordBatch from a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + >>> pa.record_batch(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Creating a RecordBatch from a pandas DataFrame with schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.record_batch(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + >>> pa.record_batch(df, my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + """ + @overload def table( data: dict[str, list[Any] | Array[Any]], @@ -879,14 +5239,224 @@ def table( metadata: Mapping[Any, Any] | None = None, nthreads: int | None = None, ) -> Table: ... +def table(*args, **kwargs): + """ + Create a pyarrow.Table from a Python data structure or sequence of arrays. 
+ + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of arrays or + chunked arrays, a pandas DataFame, or any tabular object implementing + the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, + ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the Arrow Table. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + If passed, the output will have exactly this schema (raising an error + when columns are not found in the data and ignoring additional data not + specified in the schema, when data is a dict or DataFrame). + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). + nthreads : int, default None + For pandas.DataFrame inputs: if greater than 1, convert columns to + Arrow in parallel using indicated number of threads. By default, + this follows :func:`pyarrow.cpu_count` (may use up to system CPU count + threads). + + Returns + ------- + Table + + See Also + -------- + Table.from_arrays, Table.from_pandas, Table.from_pydict + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from a python dictionary: + + >>> pa.table({"n_legs": n_legs, "animals": animals}) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.table(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.table(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... + + Construct a Table from chunked arrays: + + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... 
) + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + """ + def concat_tables( tables: Iterable[Table], memory_pool: MemoryPool | None = None, promote_options: Literal["none", "default", "permissive"] = "none", **kwargs: Any, -) -> Table: ... +) -> Table: + """ + Concatenate pyarrow.Table objects. + + If promote_options="none", a zero-copy concatenation will be performed. The schemas + of all the Tables must be the same (except the metadata), otherwise an + exception will be raised. The result Table will share the metadata with the + first table. + + If promote_options="default", any null type arrays will be casted to the type of other + arrays in the column of the same name. If a table is missing a particular + field, null values of the appropriate type will be generated to take the + place of the missing field. The new schema will share the metadata with the + first table. Each field in the new schema will share the metadata with the + first table which has the field defined. Note that type promotions may + involve additional allocations on the given ``memory_pool``. + + If promote_options="permissive", the behavior of default plus types will be promoted + to the common denominator that fits all the fields. + + Parameters + ---------- + tables : iterable of pyarrow.Table objects + Pyarrow tables to concatenate into a single Table. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + promote_options : str, default none + Accepts strings "none", "default" and "permissive". + **kwargs : dict, optional + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) + >>> pa.concat_tables([t1, t2]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] + + """ class TableGroupBy: + """ + A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the default), + no stable ordering of the output is guaranteed. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table( + ... [ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], + ... names=["keys", "values"], + ... ) + + Grouping of columns: + + >>> pa.TableGroupBy(t, "keys") + + + Perform aggregations: + + >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + """ + keys: str | list[str] def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... def aggregate( @@ -895,14 +5465,138 @@ class TableGroupBy: tuple[ColumnSelector, Aggregation] | tuple[ColumnSelector, Aggregation, AggregateOptions | None] ], - ) -> Table: ... + ) -> Table: + """ + Perform an aggregation over the grouped columns of the table. 
+ + Parameters + ---------- + aggregations : list[tuple(str, str)] or \ +list[tuple(str, str, FunctionOptions)] + List of tuples, where each tuple is one aggregation specification + and consists of: aggregation column name followed + by function name and optionally aggregation function option. + Pass empty list to get a single row for each group. + The column name can be a string, an empty list or a list of + column names, for unary, nullary and n-ary aggregation functions + respectively. + + For the list of function names and respective aggregation + function options see :ref:`py-grouped-aggrs`. + + Returns + ------- + Table + Results of the aggregation functions. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + + Sum the column "values" over the grouped column "keys": + + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + + Count the rows over the grouped column "keys": + + >>> t.group_by("keys").aggregate([([], "count_all")]) + pyarrow.Table + keys: string + count_all: int64 + ---- + keys: [["a","b","c"]] + count_all: [[2,2,1]] + + Do multiple aggregations: + + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... ]) + pyarrow.Table + keys: string + values_sum: int64 + keys_count: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + + Count the number of non-null values for column "values" + over the grouped column "keys": + + >>> import pyarrow.compute as pc + >>> t.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + keys: string + values_count: int64 + ---- + keys: [["a","b","c"]] + values_count: [[2,2,1]] + + Get a single row for each group in column "keys": + + >>> t.group_by("keys").aggregate([]) + pyarrow.Table + keys: string + ---- + keys: [["a","b","c"]] + """ def _table(self) -> Table: ... @property def _use_threads(self) -> bool: ... def concat_batches( recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None -) -> RecordBatch: ... +) -> RecordBatch: + """ + Concatenate pyarrow.RecordBatch objects. + + All recordbatches must share the same Schema, + the operation implies a copy of the data to merge + the arrays of the different RecordBatches. + + Parameters + ---------- + recordbatches : iterable of pyarrow.RecordBatch objects + Pyarrow record batches to concatenate into a single RecordBatch. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.record_batch( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.record_batch( + ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] + ... 
) + >>> pa.concat_batches([t1, t2]) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100,2,4] + animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] + + """ __all__ = [ "ChunkedArray", diff --git a/pyarrow-stubs/__lib_pxi/tensor.pyi b/pyarrow-stubs/__lib_pxi/tensor.pyi index f59a2891306..d849abd0f1f 100644 --- a/pyarrow-stubs/__lib_pxi/tensor.pyi +++ b/pyarrow-stubs/__lib_pxi/tensor.pyi @@ -12,29 +12,217 @@ from scipy.sparse import coo_matrix, csr_matrix from sparse import COO class Tensor(_Weakrefable): + """ + A n-dimensional array a.k.a Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + @classmethod - def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... - def to_numpy(self) -> np.ndarray: ... - def equals(self, other: Tensor) -> bool: ... - def dim_name(self, i: int) -> str: ... - @property - def dim_names(self) -> list[str]: ... - @property - def is_mutable(self) -> bool: ... - @property - def is_contiguous(self) -> bool: ... - @property - def ndim(self) -> int: ... - @property - def size(self) -> str: ... - @property - def shape(self) -> tuple[int, ...]: ... - @property - def strides(self) -> tuple[int, ...]: ... + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Create a Tensor from a numpy array. + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list, optional + Names of each dimension of the Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + def to_numpy(self) -> np.ndarray: + """ + Convert arrow::Tensor to numpy.ndarray with zero copy + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.to_numpy() + array([[ 2, 2, 4], + [ 4, 5, 100]], dtype=int32) + """ + def equals(self, other: Tensor) -> bool: + """ + Return true if the tensors contains exactly equal data. + + Parameters + ---------- + other : Tensor + The other tensor to compare for equality. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) + >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) + >>> tensor.equals(tensor) + True + >>> tensor.equals(tensor2) + False + """ + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_name(0) + 'dim1' + >>> tensor.dim_name(1) + 'dim2' + """ + @property + def dim_names(self) -> list[str]: + """ + Names of this tensor dimensions. 
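# Illustrative sketch (assumes numpy and pyarrow are installed): round-tripping
# a Tensor through NumPy with the methods annotated above.
import numpy as np
import pyarrow as pa

x = np.array([[2, 2, 4], [4, 5, 100]], dtype=np.int32)
tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])

assert tensor.shape == (2, 3)
assert tensor.dim_name(0) == "dim1"
back = tensor.to_numpy()                     # zero-copy view of the same buffer
assert (back == x).all()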
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_names + ['dim1', 'dim2'] + """ + @property + def is_mutable(self) -> bool: + """ + Is this tensor mutable or immutable. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_mutable + True + """ + @property + def is_contiguous(self) -> bool: + """ + Is this tensor contiguous in memory. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_contiguous + True + """ + @property + def ndim(self) -> int: + """ + The dimension (n) of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.ndim + 2 + """ + @property + def size(self) -> str: + """ + The size of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.size + 6 + """ + @property + def shape(self) -> tuple[int, ...]: + """ + The shape of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.shape + (2, 3) + """ + @property + def strides(self) -> tuple[int, ...]: + """ + Strides of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.strides + (12, 4) + """ class SparseCOOTensor(_Weakrefable): @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCOOTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCOOTensor + """ + @classmethod def from_numpy( cls, @@ -42,18 +230,80 @@ class SparseCOOTensor(_Weakrefable): coords: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: ... + ) -> Self: + """ + Create arrow::SparseCOOTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + coords : numpy.ndarray + Coordinates of the data. + shape : tuple + Shape of the tensor. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. 
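# Illustrative sketch (assumes numpy and pyarrow): building a SparseCOOTensor
# from a dense ndarray and converting it back via the methods stubbed above.
import numpy as np
import pyarrow as pa

dense = np.array([[0, 0, 3], [0, 7, 0]], dtype=np.int64)
coo = pa.SparseCOOTensor.from_dense_numpy(dense, dim_names=["row", "col"])

assert coo.shape == (2, 3)
data, coords = coo.to_numpy()                # non-zero values and their coordinates
assert (coo.to_tensor().to_numpy() == dense).all()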
+ """ @classmethod - def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: ... + def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: + """ + Convert pydata/sparse.COO to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : pydata.sparse.COO + The sparse multidimensional array that should be converted. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_tensor(cls, obj: Tensor) -> Self: ... - def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ... - def to_scipy(self) -> coo_matrix: ... - def to_pydata_sparse(self) -> COO: ... - def to_tensor(self) -> Tensor: ... - def equals(self, other: Self) -> bool: ... + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> coo_matrix: + """ + Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. + """ + def to_pydata_sparse(self) -> COO: + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO. + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCOOTensor to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCOOTensor + The other tensor to compare for equality. + """ @property def is_mutable(self) -> bool: ... @property @@ -62,7 +312,19 @@ class SparseCOOTensor(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ @property def dim_names(self) -> list[str]: ... @property @@ -71,8 +333,26 @@ class SparseCOOTensor(_Weakrefable): def has_canonical_format(self) -> bool: ... class SparseCSRMatrix(_Weakrefable): + """ + A sparse CSR matrix. + """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The dense numpy array that should be converted. + dim_names : list, optional + The names of the dimensions. + + Returns + ------- + pyarrow.SparseCSRMatrix + """ @classmethod def from_numpy( cls, @@ -81,15 +361,67 @@ class SparseCSRMatrix(_Weakrefable): indices: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: ... + ) -> Self: + """ + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... 
+ def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_tensor(cls, obj: Tensor) -> Self: ... - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... - def to_scipy(self) -> csr_matrix: ... - def to_tensor(self) -> Tensor: ... - def equals(self, other: Self) -> bool: ... + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSRMatrix to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCSRMatrix + The other tensor to compare for equality. + """ @property def is_mutable(self) -> bool: ... @property @@ -98,15 +430,45 @@ class SparseCSRMatrix(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ @property def dim_names(self) -> list[str]: ... @property def non_zero_length(self) -> int: ... class SparseCSCMatrix(_Weakrefable): + """ + A sparse CSC matrix. + """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSCMatrix + """ @classmethod def from_numpy( cls, @@ -115,15 +477,67 @@ class SparseCSCMatrix(_Weakrefable): indices: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: ... + ) -> Self: + """ + Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_tensor(cls, obj: Tensor) -> Self: ... 
- def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... - def to_scipy(self) -> csr_matrix: ... - def to_tensor(self) -> Tensor: ... - def equals(self, other: Self) -> bool: ... + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSCMatrix to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSCMatrix + The other tensor to compare for equality. + """ @property def is_mutable(self) -> bool: ... @property @@ -132,15 +546,52 @@ class SparseCSCMatrix(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ @property def dim_names(self) -> list[str]: ... @property def non_zero_length(self) -> int: ... class SparseCSFTensor(_Weakrefable): + """ + A sparse CSF tensor. + + CSF is a generalization of compressed sparse row (CSR) index. + + CSF index recursively compresses each dimension of a tensor into a set + of prefix trees. Each path from a root to leaf forms one tensor + non-zero index. CSF is implemented with two arrays of buffers and one + arrays of integers. + """ + @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: ... + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSFTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSFTensor + """ @classmethod def from_numpy( cls, @@ -149,14 +600,59 @@ class SparseCSFTensor(_Weakrefable): indices: np.ndarray, shape: tuple[int, ...], dim_names: list[str] | None = None, - ) -> Self: ... - @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: ... + ) -> Self: + """ + Create arrow::SparseCSFTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse tensor. + indptr : numpy.ndarray + The sparsity structure. + Each two consecutive dimensions in a tensor correspond to + a buffer in indices. + A pair of consecutive values at `indptr[dim][i]` + `indptr[dim][i + 1]` signify a range of nodes in + `indices[dim + 1]` who are children of `indices[dim][i]` node. + indices : numpy.ndarray + Stores values of nodes. + Each tensor dimension corresponds to a buffer in indptr. + shape : tuple + Shape of the matrix. + axis_order : list, optional + the sequence in which dimensions were traversed to + produce the prefix tree. + dim_names : list, optional + Names of the dimensions. + """ @classmethod - def from_tensor(cls, obj: Tensor) -> Self: ... - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... 
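# Illustrative sketch of a SparseCSFTensor round trip. Assumes
# SparseCSFTensor.from_dense_numpy behaves as declared in the stub above;
# treat this as a sketch rather than an authoritative example.
import numpy as np
import pyarrow as pa

dense = np.array([[[0, 1], [0, 0]], [[2, 0], [0, 3]]], dtype=np.int64)
csf = pa.SparseCSFTensor.from_dense_numpy(dense)

assert csf.shape == (2, 2, 2)
assert (csf.to_tensor().to_numpy() == dense).all()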
- def to_tensor(self) -> Tensor: ... - def equals(self, other: Self) -> bool: ... + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSFTensor + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSFTensor to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSFTensor + The other tensor to compare for equality. + """ @property def is_mutable(self) -> bool: ... @property @@ -165,7 +661,19 @@ class SparseCSFTensor(_Weakrefable): def size(self) -> str: ... @property def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ @property def dim_names(self) -> list[str]: ... @property diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 998dca59f63..414e9ff71c4 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -23,7 +23,7 @@ from pyarrow.lib import ( MonthDayNano, Table, ) -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated from .io import Buffer from .scalar import ExtensionScalar @@ -35,26 +35,156 @@ class _Weakrefable: ... class _Metadata(_Weakrefable): ... class DataType(_Weakrefable): - def field(self, i: int) -> Field: ... + """ + Base class of all Arrow data types. + + Each data type is an *instance* of this class. + + Examples + -------- + Instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + """ + def field(self, i: int) -> Field: + """ + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + """ @property def id(self) -> int: ... @property - def bit_width(self) -> int: ... + def bit_width(self) -> int: + """ + Bit width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().bit_width + 64 + """ @property - def byte_width(self) -> int: ... + def byte_width(self) -> int: + """ + Byte width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().byte_width + 8 + """ @property - def num_fields(self) -> int: ... + def num_fields(self) -> int: + """ + The number of child fields. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().num_fields + 0 + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.string()).num_fields + 1 + >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct.num_fields + 2 + """ @property - def num_buffers(self) -> int: ... + def num_buffers(self) -> int: + """ + Number of data buffers required to construct Array type + excluding children. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().num_buffers + 2 + >>> pa.string().num_buffers + 3 + """ def __hash__(self) -> int: ... - def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: ... - def to_pandas_dtype(self) -> np.generic: ... 
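# Illustrative sketch (assumes pyarrow is installed): the DataType introspection
# properties annotated above, exercised on a few concrete types.
import pyarrow as pa

assert pa.int64().bit_width == 64
assert pa.int64().byte_width == 8
assert pa.int64().num_buffers == 2           # validity bitmap + values
assert pa.string().num_buffers == 3          # validity bitmap + offsets + data
assert pa.list_(pa.string()).num_fields == 1
assert pa.int64().equals(pa.int64())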
- def _export_to_c(self, out_ptr: int) -> None: ... + def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: + """ + Return true if type is equivalent to passed value. + + Parameters + ---------- + other : DataType or string convertible to DataType + check_metadata : bool + Whether nested Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().equals(pa.string()) + False + >>> pa.int64().equals(pa.int64()) + True + """ + def to_pandas_dtype(self) -> np.generic: + """ + Return the equivalent NumPy / Pandas dtype. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().to_pandas_dtype() + + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: ... - def __arrow_c_schema__(self) -> Any: ... + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import DataType from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ @classmethod - def _import_from_c_capsule(cls, schema) -> Self: ... + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a DataType from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... @@ -84,89 +214,480 @@ _Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal[" _Tz = TypeVar("_Tz", str, None, default=None) class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + """ + Concrete class for timestamp data types. + + Examples + -------- + >>> import pyarrow as pa + + Create an instance of timestamp type: + + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + + Create an instance of timestamp type with timezone: + + >>> pa.timestamp("s", tz="UTC") + TimestampType(timestamp[s, tz=UTC]) + """ @property - def unit(self) -> _Unit: ... + def unit(self) -> _Unit: + """ + The timestamp unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("us") + >>> t.unit + 'us' + """ @property - def tz(self) -> _Tz: ... + def tz(self) -> _Tz: + """ + The timestamp time zone, if any, or None. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("s", tz="UTC") + >>> t.tz + 'UTC' + """ _Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + """ + Concrete class for time32 data types. + + Supported time unit resolutions are 's' [second] + and 'ms' [millisecond]. + + Examples + -------- + Create an instance of time32 type: + + >>> import pyarrow as pa + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ @property - def unit(self) -> _Time32Unit: ... + def unit(self) -> _Time32Unit: + """ + The time unit ('s' or 'ms'). 
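# Illustrative sketch (assumes pyarrow is installed): the unit/tz parameters
# surfaced by the temporal type stubs above.
import pyarrow as pa

ts = pa.timestamp("ms", tz="UTC")
assert ts.unit == "ms" and ts.tz == "UTC"
assert pa.time32("s").unit == "s"
assert pa.time64("ns").unit == "ns"
assert pa.duration("us").unit == "us"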
+ + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time32("ms") + >>> t.unit + 'ms' + """ _Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + """ + Concrete class for time64 data types. + + Supported time unit resolutions are 'us' [microsecond] + and 'ns' [nanosecond]. + + Examples + -------- + Create an instance of time64 type: + + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + """ @property - def unit(self) -> _Time64Unit: ... + def unit(self) -> _Time64Unit: + """ + The time unit ('us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time64("us") + >>> t.unit + 'us' + """ class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + """ + Concrete class for duration data types. + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("s") + DurationType(duration[s]) + """ @property - def unit(self) -> _Unit: ... + def unit(self) -> _Unit: + """ + The duration unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.duration("s") + >>> t.unit + 's' + """ + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + """ + Concrete class for fixed-size binary data types. + + Examples + -------- + Create an instance of fixed-size binary type: -class FixedSizeBinaryType(_BasicDataType[Decimal]): ... + >>> import pyarrow as pa + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + """ _Precision = TypeVar("_Precision", default=Any) _Scale = TypeVar("_Scale", default=Any) class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ @property - def precision(self) -> _Precision: ... + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ @property - def scale(self) -> _Scale: ... + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ @property - def precision(self) -> _Precision: ... + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ @property - def scale(self) -> _Scale: ... + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal128 data types. + + Examples + -------- + Create an instance of decimal128 type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + """ @property - def precision(self) -> _Precision: ... 
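# Illustrative sketch (assumes pyarrow is installed): precision and scale on the
# decimal types above. decimal128/decimal256 ship with all recent releases;
# decimal32/decimal64 are newer additions, hence their separate stubs here.
import pyarrow as pa

d128 = pa.decimal128(38, 10)
assert (d128.precision, d128.scale) == (38, 10)
d256 = pa.decimal256(76, 38)
assert (d256.precision, d256.scale) == (76, 38)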
+ def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.precision + 5 + """ @property - def scale(self) -> _Scale: ... + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.scale + 2 + """ class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal256 data types. + + Examples + -------- + Create an instance of decimal256 type: + + >>> import pyarrow as pa + >>> pa.decimal256(76, 38) + Decimal256Type(decimal256(76, 38)) + """ @property - def precision(self) -> _Precision: ... + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.precision + 76 + """ @property - def scale(self) -> _Scale: ... + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.scale + 38 + """ class ListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list data types. + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + """ @property - def value_field(self) -> Field[_DataTypeT]: ... + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_field + pyarrow.Field + """ @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataTypeT: + """ + The data type of list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_type + DataType(string) + """ class LargeListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list data types + (like ListType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.string()) + LargeListType(large_list) + """ @property def value_field(self) -> Field[_DataTypeT]: ... @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list(pa.string()).value_type + DataType(string) + """ class ListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ @property - def value_field(self) -> Field[_DataTypeT]: ... + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataTypeT: + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ class LargeListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). 
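# Illustrative sketch (assumes pyarrow is installed): the value_type/value_field/
# list_size accessors shared by the list-like types stubbed above. The list_view
# variants need a newer pyarrow release and are omitted here.
import pyarrow as pa

assert pa.list_(pa.int32()).value_type == pa.int32()
assert pa.large_list(pa.string()).value_field.type == pa.string()
fixed = pa.list_(pa.int32(), 2)              # FixedSizeListType
assert fixed.list_size == 2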
+ + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ @property - def value_field(self) -> Field[_DataTypeT]: ... + def value_field(self) -> Field[_DataTypeT]: + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataTypeT: + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + """ + Concrete class for fixed size list data types. + + Examples + -------- + Create an instance of FixedSizeListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + """ @property - def value_field(self) -> Field[_DataTypeT]: ... + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_field + pyarrow.Field + """ @property - def value_type(self) -> _DataTypeT: ... + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_type + DataType(int32) + """ @property - def list_size(self) -> _Size: ... + def list_size(self) -> _Size: + """ + The size of the fixed size lists. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).list_size + 2 + """ -class DictionaryMemo(_Weakrefable): ... +class DictionaryMemo(_Weakrefable): + """ + Tracking container for dictionary-encoded fields. + """ _IndexT = TypeVar( "_IndexT", @@ -184,62 +705,410 @@ _ValueT = TypeVar("_ValueT", bound=DataType) _Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + """ + Concrete class for dictionary data types. + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + """ + @property - def ordered(self) -> _Ordered: ... + def ordered(self) -> _Ordered: + """ + Whether the dictionary is ordered, i.e. whether the ordering of values + in the dictionary is important. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()).ordered + False + """ @property - def index_type(self) -> _IndexT: ... + def index_type(self) -> _IndexT: + """ + The data type of dictionary indices (a signed integer type). + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).index_type + DataType(int16) + """ @property - def value_type(self) -> _BasicValueT: ... + def value_type(self) -> _BasicValueT: + """ + The dictionary value type. + + The dictionary values are found in an instance of DictionaryArray. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).value_type + DataType(string) + """ _K = TypeVar("_K", bound=DataType) class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + """ + Concrete class for map data types. 
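# Illustrative sketch (assumes pyarrow is installed): parameterized dictionary
# and map types and the accessors exposed by the stubs above.
import pyarrow as pa

d = pa.dictionary(pa.int16(), pa.utf8())
assert d.index_type == pa.int16() and d.value_type == pa.utf8()
assert d.ordered is False

m = pa.map_(pa.string(), pa.int32(), keys_sorted=True)
assert m.key_type == pa.string() and m.item_type == pa.int32()
assert m.keys_sorted is True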
+ + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + """ + @property - def key_field(self) -> Field[_K]: ... + def key_field(self) -> Field[_K]: + """ + The field for keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_field + pyarrow.Field + """ @property - def key_type(self) -> _K: ... + def key_type(self) -> _K: + """ + The data type of keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_type + DataType(string) + """ @property - def item_field(self) -> Field[_ValueT]: ... + def item_field(self) -> Field[_ValueT]: + """ + The field for items in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_field + pyarrow.Field + """ @property - def item_type(self) -> _ValueT: ... + def item_type(self) -> _ValueT: + """ + The data type of items in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_type + DataType(int32) + """ @property - def keys_sorted(self) -> _Ordered: ... + def keys_sorted(self) -> _Ordered: + """ + Should the entries be sorted according to keys. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted + True + """ -_Size = TypeVar("_Size") +_Size = TypeVar("_Size", default=int) class StructType(DataType): - def get_field_index(self, name: str) -> int: ... - def field(self, i: int | str) -> Field: ... - def get_all_field_indices(self, name: str) -> list[int]: ... + """ + Concrete class for struct data types. + + ``StructType`` supports direct indexing using ``[...]`` (implemented via + ``__getitem__``) to access its fields. + It will return the struct field with the given index or name. + + Examples + -------- + >>> import pyarrow as pa + + Accessing fields using direct indexing: + + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type[0] + pyarrow.Field + >>> struct_type["y"] + pyarrow.Field + + Accessing fields using ``field()``: + + >>> struct_type.field(1) + pyarrow.Field + >>> struct_type.field("x") + pyarrow.Field + + # Creating a schema from the struct type's fields: + >>> pa.schema(list(struct_type)) + x: int32 + y: string + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Index of the field with a name 'y': + + >>> struct_type.get_field_index("y") + 1 + + Index of the field that does not exist: + + >>> struct_type.get_field_index("z") + -1 + """ + def field(self, i: int | str) -> Field: + """ + Select a field by its column name or numeric index. 
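# Illustrative sketch (assumes pyarrow is installed): StructType field lookup by
# index and by name, as described in the docstrings above.
import pyarrow as pa

struct = pa.struct({"x": pa.int32(), "y": pa.string()})
assert struct.get_field_index("y") == 1
assert struct.get_field_index("missing") == -1
assert struct.field("x").type == pa.int32()
assert struct["y"].name == "y"               # __getitem__ is an alias for field()
assert [f.name for f in struct] == ["x", "y"]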
+ + Parameters + ---------- + i : int or str + + Returns + ------- + pyarrow.Field + + Examples + -------- + + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Select the second field: + + >>> struct_type.field(1) + pyarrow.Field + + Select the field named 'x': + + >>> struct_type.field("x") + pyarrow.Field + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type.get_all_field_indices("x") + [0] + """ def __len__(self) -> int: ... def __iter__(self) -> Iterator[Field]: ... __getitem__ = field # pyright: ignore[reportUnknownVariableType] @property - def names(self) -> list[str]: ... + def names(self) -> list[str]: + """ + Lists the field names. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.names + ['a', 'b', 'c'] + """ @property - def fields(self) -> list[Field]: ... + def fields(self) -> list[Field]: + """ + Lists all fields within the StructType. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.fields + [pyarrow.Field, pyarrow.Field, pyarrow.Field] + """ class UnionType(DataType): + """ + Base class for union data types. + + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + + Create an instance of a sparse UnionType using ``pa.union``: + + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ @property - def mode(self) -> Literal["sparse", "dense"]: ... + def mode(self) -> Literal["sparse", "dense"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.mode + 'sparse' + """ @property - def type_codes(self) -> list[int]: ... + def type_codes(self) -> list[int]: + """ + The type code to indicate each data type in this union. + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.type_codes + [0, 1] + """ def __len__(self) -> int: ... def __iter__(self) -> Iterator[Field]: ... - def field(self, i: int) -> Field: ... + def field(self, i: int) -> Field: + """ + Return a child field by its numeric index. 
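# Illustrative sketch (assumes pyarrow is installed): sparse and dense union
# types and the mode/type_codes/field accessors annotated above.
import pyarrow as pa

children = [pa.field("a", pa.binary(10)), pa.field("b", pa.string())]
sparse = pa.sparse_union(children)
dense = pa.dense_union(children)

assert sparse.mode == "sparse" and dense.mode == "dense"
assert sparse.type_codes == [0, 1]
assert sparse.field(1).name == "b"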
+ + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union[0] + pyarrow.Field + """ __getitem__ = field # pyright: ignore[reportUnknownVariableType] class SparseUnionType(UnionType): + """ + Concrete class for sparse union types. + + Examples + -------- + Create an instance of a sparse UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ @property def mode(self) -> Literal["sparse"]: ... class DenseUnionType(UnionType): + """ + Concrete class for dense union types. + + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + """ + @property def mode(self) -> Literal["dense"]: ... _RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + """ + Concrete class for run-end encoded types. + """ @property def run_end_type(self) -> _RunEndType: ... @property @@ -248,52 +1117,421 @@ class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): _StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) class BaseExtensionType(DataType): - def __arrow_ext_class__(self) -> type[ExtensionArray]: ... - def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + """ + Concrete base class for extension types. + """ + def __arrow_ext_class__(self) -> type[ExtensionArray]: + """ + The associated array extension class + """ + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: + """ + The associated scalar class + """ @property - def extension_name(self) -> str: ... + def extension_name(self) -> str: + """ + The extension type name. + """ @property - def storage_type(self) -> DataType: ... + def storage_type(self) -> DataType: + """ + The underlying storage type. + """ def wrap_array(self, storage: _StorageT) -> _StorageT: ... class ExtensionType(BaseExtensionType): + """ + Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + The underlying storage type for the extension type. + extension_name : str + A unique name distinguishing this extension type. The name will be + used when deserializing IPC data. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. 
This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Create an instance of RationalType extension type: + + >>> rational_type = RationalType(pa.int32()) + + Inspect the extension type: + + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + Wrap an array as an extension array: + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, + ... ) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Or do the same with creating an ExtensionArray: + + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. + """ + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... - def __arrow_ext_serialize__(self) -> bytes: ... + def __arrow_ext_serialize__(self) -> bytes: + """ + Serialized representation of metadata to reconstruct the type object. + + This method should return a bytes object, and those serialized bytes + are stored in the custom metadata of the Field holding an extension + type in an IPC message. + The bytes are passed to ``__arrow_ext_deserialize`` and should hold + sufficient information to reconstruct the data type instance. + """ @classmethod - def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: ... + def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: + """ + Return an extension type instance from the storage type and serialized + metadata. + + This method should return an instance of the ExtensionType subclass + that matches the passed storage type and serialized metadata (the + return value of ``__arrow_ext_serialize__``). + """ class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + """ + Concrete class for fixed shape tensor extension type. + + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) + FixedShapeTensorType(extension) + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ @property - def value_type(self) -> _ValueT: ... 
+ def value_type(self) -> _ValueT: + """ + Data type of an individual tensor. + """ @property - def shape(self) -> list[int]: ... + def shape(self) -> list[int]: + """ + Shape of the tensors. + """ @property - def dim_names(self) -> list[str] | None: ... + def dim_names(self) -> list[str] | None: + """ + Explicit names of the dimensions. + """ @property - def permutation(self) -> list[int] | None: ... + def permutation(self) -> list[int] | None: + """ + Indices of the dimensions ordering. + """ + +class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ -class Bool8Type(BaseExtensionType): ... -class UuidType(BaseExtensionType): ... -class JsonType(BaseExtensionType): ... +class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + +class JsonType(BaseExtensionType): + """ + Concrete class for JSON extension type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ class OpaqueType(BaseExtensionType): + """ + Concrete class for opaque extension type. + + Opaque is a placeholder for a type from an external (often non-Arrow) + system that could not be interpreted. + + Examples + -------- + Create an instance of opaque extension type: + + >>> import pyarrow as pa + >>> pa.opaque(pa.int32(), "geometry", "postgis") + OpaqueType(extension) + """ @property - def type_name(self) -> str: ... + def type_name(self) -> str: + """ + The name of the type in the external system. + """ @property - def vendor_name(self) -> str: ... + def vendor_name(self) -> str: + """ + The name of the external system. + """ +@deprecated( + "This class is deprecated and its deserialization is disabled by default. " + ":class:`ExtensionType` is recommended instead." +) class PyExtensionType(ExtensionType): + """ + Concrete base class for Python-defined extension types based on pickle + for (de)serialization. + + .. warning:: + This class is deprecated and its deserialization is disabled by default. + :class:`ExtensionType` is recommended instead. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + """ def __init__(self, storage_type: DataType) -> None: ... @classmethod - def set_auto_load(cls, value: bool) -> None: ... + def set_auto_load(cls, value: bool) -> None: + """ + Enable or disable auto-loading of serialized PyExtensionType instances. + + Parameters + ---------- + value : bool + Whether to enable auto-loading. + """ -class UnknownExtensionType(PyExtensionType): +class UnknownExtensionType(PyExtensionType): # type: ignore + """ + A concrete class for Python-defined extension types that refer to + an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialised output. + """ def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... 
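# Illustrative sketch of the fixed-shape tensor canonical extension type whose
# properties are stubbed above. Assumes a pyarrow release that provides
# pa.fixed_shape_tensor; the bool8/json_/opaque/uuid extensions are newer still.
import pyarrow as pa

tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1])
assert tensor_type.value_type == pa.int8()
assert tensor_type.shape == [2, 2, 3]
assert tensor_type.permutation == [0, 2, 1]
assert tensor_type.extension_name == "arrow.fixed_shape_tensor"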
-def register_extension_type(ext_type: PyExtensionType) -> None: ... -def unregister_extension_type(type_name: str) -> None: ... +def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore + """ + Register a Python extension type. + + Registration is based on the extension name (so different registered types + need unique extension names). Registration needs an extension type + instance, but then works for any instance of the same subclass regardless + of parametrization of the type. + + Parameters + ---------- + ext_type : BaseExtensionType instance + The ExtensionType subclass to register. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +def unregister_extension_type(type_name: str) -> None: + """ + Unregister a Python extension type. + + Parameters + ---------- + type_name : str + The name of the ExtensionType subclass to unregister. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + """ + KeyValueMetadata + + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata + **kwargs : optional + additional key-value metadata + """ def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... def equals(self, other: KeyValueMetadata) -> bool: ... def __len__(self) -> int: ... 
@@ -301,145 +1539,1884 @@ class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): def __getitem__(self, __key: Any) -> Any: ... def __iter__(self) -> Iterator[bytes]: ... def get_all(self, key: str) -> list[bytes]: ... - def to_dict(self) -> dict[bytes, bytes]: ... + def to_dict(self) -> dict[bytes, bytes]: + """ + Convert KeyValueMetadata to dict. If a key occurs twice, the value for + the first one is returned + """ def ensure_metadata( meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False ) -> KeyValueMetadata | None: ... -class Field(_Weakrefable, Generic[_DataTypeT]): - def equals(self, other: Field, check_metadata: bool = False) -> bool: ... - def __hash__(self) -> int: ... - @property - def nullable(self) -> bool: ... - @property - def name(self) -> str: ... - @property - def metadata(self) -> dict[bytes, bytes] | None: ... - @property - def type(self) -> _DataTypeT: ... - def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: ... - def remove_metadata(self) -> None: ... - def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: ... - def with_name(self, name: str) -> Self: ... - def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ... - def flatten(self) -> list[Field]: ... - def _export_to_c(self, out_ptr: int) -> None: ... - @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: ... - def __arrow_c_schema__(self) -> Any: ... - @classmethod - def _import_from_c_capsule(cls, schema) -> Self: ... +class Field(_Weakrefable, Generic[_DataTypeT]): + """ + A named field, with a data type, nullability, and optional metadata. + + Notes + ----- + Do not use this class's constructor directly; use pyarrow.field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + """ + + def equals(self, other: Field, check_metadata: bool = False) -> bool: + """ + Test if this field is equal to the other + + Parameters + ---------- + other : pyarrow.Field + check_metadata : bool, default False + Whether Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.equals(f2) + False + >>> f1.equals(f1) + True + """ + def __hash__(self) -> int: ... + @property + def nullable(self) -> bool: + """ + The field nullability. + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.nullable + True + >>> f2.nullable + False + """ + @property + def name(self) -> str: + """ + The field name. + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field.name + 'key' + """ + @property + def metadata(self) -> dict[bytes, bytes] | None: + """ + The field metadata (if any is set). 
+ + Returns + ------- + metadata : dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + """ + @property + def type(self) -> _DataTypeT: ... + def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: + """ + Add metadata as dict of string keys and values to Field + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + + Create new field by adding metadata to existing one: + + >>> field_new = field.with_metadata({"key": "Something important"}) + >>> field_new + pyarrow.Field + >>> field_new.metadata + {b'key': b'Something important'} + """ + def remove_metadata(self) -> None: + """ + Create new field without metadata, if any + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + + Create new field by removing the metadata from the existing one: + + >>> field_new = field.remove_metadata() + >>> field_new.metadata + """ + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced type + + Parameters + ---------- + new_type : pyarrow.DataType + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing type of an existing one: + + >>> field_new = field.with_type(pa.int64()) + >>> field_new + pyarrow.Field + """ + def with_name(self, name: str) -> Self: + """ + A copy of this field with the replaced name + + Parameters + ---------- + name : str + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing the name of an existing one: + + >>> field_new = field.with_name("lock") + >>> field_new + pyarrow.Field + """ + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced nullability + + Parameters + ---------- + nullable : bool + + Returns + ------- + field: pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + >>> field.nullable + True + + Create new field by replacing the nullability of an existing one: + + >>> field_new = field.with_nullable(False) + >>> field_new + pyarrow.Field + >>> field_new.nullable + False + """ + def flatten(self) -> list[Field]: + """ + Flatten this field. If a struct field, individual child fields + will be returned with their names prefixed by the parent's name. 
+ + Returns + ------- + fields : List[pyarrow.Field] + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("bar", pa.float64(), nullable=False) + >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) + >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) + + Flatten a struct field: + + >>> ff + pyarrow.Field not null> + >>> ff.flatten() + [pyarrow.Field, pyarrow.Field] + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import Field from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a Field from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class Schema(_Weakrefable): + """ + A named collection of types a.k.a schema. A schema defines the + column names and types in a record batch or table data structure. + They also contain metadata about the columns. For example, schemas + converted from Pandas contain metadata about their original Pandas + types so they can be converted back to the same types. + + Warnings + -------- + Do not call this class's constructor directly. Instead use + :func:`pyarrow.schema` factory function which makes a new Arrow + Schema object. + + Examples + -------- + Create a new Arrow Schema object: + + >>> import pyarrow as pa + >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + some_int: int32 + some_string: string + + Create Arrow Schema with metadata: + + >>> pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + + def __len__(self) -> int: ... + def __getitem__(self, key: str) -> Field: ... + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] + def __iter__(self) -> Iterator[Field]: ... + def __hash__(self) -> int: ... + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: + """ + Return deserialized-from-JSON pandas metadata field (if it exists) + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> schema = pa.Table.from_pandas(df).schema + + Select pandas metadata field from Arrow Schema: + + >>> schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... + """ + @property + def names(self) -> list[str]: + """ + The schema's field names. 
+ + Returns + ------- + list of str + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the names of the schema's fields: + + >>> schema.names + ['n_legs', 'animals'] + """ + @property + def types(self) -> list[DataType]: + """ + The schema's field types. + + Returns + ------- + list of DataType + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the types of the schema's fields: + + >>> schema.types + [DataType(int64), DataType(string)] + """ + @property + def metadata(self) -> dict[bytes, bytes]: + """ + The schema's metadata (if any is set). + + Returns + ------- + metadata: dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + + Get the metadata of the schema's fields: + + >>> schema.metadata + {b'n_legs': b'Number of legs per animal'} + """ + def empty_table(self) -> Table: + """ + Provide an empty table according to the schema. + + Returns + ------- + table: pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Create an empty table with schema's fields: + + >>> schema.empty_table() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[]] + animals: [[]] + """ + def equals(self, other: Schema, check_metadata: bool = False) -> bool: + """ + Test if this schema is equal to the other + + Parameters + ---------- + other : pyarrow.Schema + check_metadata : bool, default False + Key/value metadata must be equal too + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> schema1 = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + + Test two equal schemas: + + >>> schema1.equals(schema1) + True + + Test two unequal schemas: + + >>> schema1.equals(schema2) + False + """ + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: + """ + Returns implied schema from dataframe + + Parameters + ---------- + df : pandas.DataFrame + preserve_index : bool, default True + Whether to store the index as an additional column (or columns, for + MultiIndex) in the resulting `Table`. + The default of None will store the index as a column, except for + RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) + + Create an Arrow Schema from the schema of a pandas dataframe: + + >>> pa.Schema.from_pandas(df) + int: int64 + str: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, ... + """ + def field(self, i: int | str | bytes) -> Field: + """ + Select a field by its column name or numeric index. 
+ + Parameters + ---------- + i : int or string + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Select the second field: + + >>> schema.field(1) + pyarrow.Field + + Select the field of the column named 'n_legs': + + >>> schema.field("n_legs") + pyarrow.Field + """ + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: + """ + DEPRECATED + + Parameters + ---------- + name : str + + Returns + ------- + field: pyarrow.Field + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the index of the field named 'animals': + + >>> schema.get_field_index("animals") + 1 + + Index in case of several fields with the given name: + + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema.get_field_index("animals") + -1 + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ] + ... ) + + Get the indexes of the fields named 'animals': + + >>> schema.get_all_field_indices("animals") + [1, 2] + """ + def append(self, field: Field) -> Schema: + """ + Append a field at the end of the schema. + + In contrast to Python's ``list.append()`` it does return a new + object, leaving the original Schema unmodified. + + Parameters + ---------- + field : Field + + Returns + ------- + schema: Schema + New object with appended field. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Append a field 'extra' at the end of the schema: + + >>> schema_new = schema.append(pa.field("extra", pa.bool_())) + >>> schema_new + n_legs: int64 + animals: string + extra: bool + + Original schema is unmodified: + + >>> schema + n_legs: int64 + animals: string + """ + def insert(self, i: int, field: Field) -> Schema: + """ + Add a field at position i to the schema. + + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Insert a new field on the second position: + + >>> schema.insert(1, pa.field("extra", pa.bool_())) + n_legs: int64 + extra: bool + animals: string + """ + def remove(self, i: int) -> Schema: + """ + Remove the field at index i from the schema. 
+ + Parameters + ---------- + i : int + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Remove the second field of the schema: + + >>> schema.remove(1) + n_legs: int64 + """ + def set(self, i: int, field: Field) -> Schema: + """ + Replace a field at position i in the schema. + + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Replace the second field of the schema with a new field 'extra': + + >>> schema.set(1, pa.field("replaced", pa.bool_())) + n_legs: int64 + replaced: bool + """ + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: + """ + DEPRECATED + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + """ + def with_metadata(self, metadata: dict) -> Schema: + """ + Add metadata as dict of string keys and values to Schema + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Add metadata to existing schema field: + + >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write Schema to Buffer as encapsulated IPC message + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Write schema to Buffer: + + >>> schema.serialize() + + """ + def remove_metadata(self) -> Schema: + """ + Create new schema without metadata, if any + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Create a new schema with removing the metadata from the original: + + >>> schema.remove_metadata() + n_legs: int64 + animals: string + """ + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + ) -> str: + """ + Return human-readable representation of Schema + + Parameters + ---------- + truncate_metadata : boolean, default True + Limit metadata key/value display to a single line of ~80 characters + or less + show_field_metadata : boolean, default True + Display Field-level KeyValueMetadata + show_schema_metadata : boolean, default True + Display Schema-level KeyValueMetadata + + Returns + ------- + str : the formatted output + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. 
+ + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: + """ + Import Schema from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: + """ + Import a Schema from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +def unify_schemas( + schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" +) -> Schema: + """ + Unify schemas by merging fields by name. + + The resulting schema will contain the union of fields from all schemas. + Fields with the same name will be merged. Note that two fields with + different types will fail merging by default. + + - The unified field will inherit the metadata from the schema where + that field is first defined. + - The first N fields in the schema will be ordered the same as the + N fields in the first schema. + + The resulting schema will inherit its metadata from the first input + schema. + + Parameters + ---------- + schemas : list of Schema + Schemas to merge into a single one. + promote_options : str, default default + Accepts strings "default" and "permissive". + Default: null and only null can be unified with another type. + Permissive: types are promoted to the greater common denominator. + + Returns + ------- + Schema + + Raises + ------ + ArrowInvalid : + If any input schema contains fields with duplicate names. + If Fields of the same name are not mergeable. + """ + +@overload +def field(name: SupportArrowSchema) -> Field[Any]: ... +@overload +def field( + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT]: ... +def field(*args, **kwargs): + """ + Create a pyarrow.Field instance. + + Parameters + ---------- + name : str or bytes + Name of the field. + Alternatively, you can also pass an object that implements the Arrow + PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). + type : pyarrow.DataType or str + Arrow datatype of the field or a string matching one. + nullable : bool, default True + Whether the field's values are nullable. + metadata : dict, default None + Optional field metadata, the keys and values must be coercible to + bytes. + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + + A str can also be passed for the type parameter: + + >>> pa.field("key", "int32") + pyarrow.Field + """ + +def null() -> NullType: + """ + Create instance of null type. 
+ + Examples + -------- + Create an instance of a null type: + + >>> import pyarrow as pa + >>> pa.null() + DataType(null) + >>> print(pa.null()) + null + + Create a ``Field`` type with a null type and a name: + + >>> pa.field("null_field", pa.null()) + pyarrow.Field + """ + +def bool_() -> BoolType: + """ + Create instance of boolean type. + + Examples + -------- + Create an instance of a boolean type: + + >>> import pyarrow as pa + >>> pa.bool_() + DataType(bool) + >>> print(pa.bool_()) + bool + + Create a ``Field`` type with a boolean type + and a name: + + >>> pa.field("bool_field", pa.bool_()) + pyarrow.Field + """ + +def uint8() -> UInt8Type: + """ + Create instance of unsigned int8 type. + + Examples + -------- + Create an instance of unsigned int8 type: + + >>> import pyarrow as pa + >>> pa.uint8() + DataType(uint8) + >>> print(pa.uint8()) + uint8 + + Create an array with unsigned int8 type: + + >>> pa.array([0, 1, 2], type=pa.uint8()) + + [ + 0, + 1, + 2 + ] + """ + +def int8() -> Int8Type: + """ + Create instance of signed int8 type. + + Examples + -------- + Create an instance of int8 type: + + >>> import pyarrow as pa + >>> pa.int8() + DataType(int8) + >>> print(pa.int8()) + int8 + + Create an array with int8 type: + + >>> pa.array([0, 1, 2], type=pa.int8()) + + [ + 0, + 1, + 2 + ] + """ + +def uint16() -> UInt16Type: + """ + Create instance of unsigned uint16 type. + + Examples + -------- + Create an instance of unsigned int16 type: + + >>> import pyarrow as pa + >>> pa.uint16() + DataType(uint16) + >>> print(pa.uint16()) + uint16 + + Create an array with unsigned int16 type: + + >>> pa.array([0, 1, 2], type=pa.uint16()) + + [ + 0, + 1, + 2 + ] + """ + +def int16() -> Int16Type: + """ + Create instance of signed int16 type. + + Examples + -------- + Create an instance of int16 type: + + >>> import pyarrow as pa + >>> pa.int16() + DataType(int16) + >>> print(pa.int16()) + int16 + + Create an array with int16 type: + + >>> pa.array([0, 1, 2], type=pa.int16()) + + [ + 0, + 1, + 2 + ] + """ + +def uint32() -> Uint32Type: + """ + Create instance of unsigned uint32 type. + + Examples + -------- + Create an instance of unsigned int32 type: + + >>> import pyarrow as pa + >>> pa.uint32() + DataType(uint32) + >>> print(pa.uint32()) + uint32 + + Create an array with unsigned int32 type: + + >>> pa.array([0, 1, 2], type=pa.uint32()) + + [ + 0, + 1, + 2 + ] + """ + +def int32() -> Int32Type: + """ + Create instance of signed int32 type. + + Examples + -------- + Create an instance of int32 type: + + >>> import pyarrow as pa + >>> pa.int32() + DataType(int32) + >>> print(pa.int32()) + int32 + + Create an array with int32 type: + + >>> pa.array([0, 1, 2], type=pa.int32()) + + [ + 0, + 1, + 2 + ] + """ + +def int64() -> Int64Type: + """ + Create instance of signed int64 type. + + Examples + -------- + Create an instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> print(pa.int64()) + int64 + + Create an array with int64 type: + + >>> pa.array([0, 1, 2], type=pa.int64()) + + [ + 0, + 1, + 2 + ] + """ + +def uint64() -> UInt64Type: + """ + Create instance of unsigned uint64 type. 
+ + Examples + -------- + Create an instance of unsigned int64 type: + + >>> import pyarrow as pa + >>> pa.uint64() + DataType(uint64) + >>> print(pa.uint64()) + uint64 + + Create an array with unsigned uint64 type: + + >>> pa.array([0, 1, 2], type=pa.uint64()) + + [ + 0, + 1, + 2 + ] + """ + +def tzinfo_to_string(tz: dt.tzinfo) -> str: + """ + Converts a time zone object into a string indicating the name of a time + zone, one of: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + tz : datetime.tzinfo + Time zone object + + Returns + ------- + name : str + Time zone name + """ + +def string_to_tzinfo(name: str) -> dt.tzinfo: + """ + Convert a time zone name into a time zone object. + + Supported input strings are: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + name: str + Time zone name. + + Returns + ------- + tz : datetime.tzinfo + Time zone object + """ + +@overload +def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... +@overload +def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def timestamp(*args, **kwargs): + """ + Create instance of timestamp type with resolution and optional time zone. + + Parameters + ---------- + unit : str + one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' + [nanosecond] + tz : str, default None + Time zone name. None indicates time zone naive + + Examples + -------- + Create an instance of timestamp type: + + >>> import pyarrow as pa + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + >>> pa.timestamp("s", tz="America/New_York") + TimestampType(timestamp[s, tz=America/New_York]) + >>> pa.timestamp("s", tz="+07:30") + TimestampType(timestamp[s, tz=+07:30]) + + Use timestamp type when creating a scalar object: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) + + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) + + + Returns + ------- + timestamp_type : TimestampType + """ + +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: + """ + Create instance of 32-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + one of 's' [second], or 'ms' [millisecond] + + Returns + ------- + type : pyarrow.Time32Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time32("s") + Time32Type(time32[s]) + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: + """ + Create instance of 64-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + One of 'us' [microsecond], or 'ns' [nanosecond]. + + Returns + ------- + type : pyarrow.Time64Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + >>> pa.time64("ns") + Time64Type(time64[ns]) + """ + +def duration(unit: _Unit) -> DurationType[_Unit]: + """ + Create instance of a duration type with unit resolution. + + Parameters + ---------- + unit : str + One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or + 'ns' [nanosecond]. 
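+
+    As an extra, hedged illustration (assuming only the documented behaviour
+    of ``pa.array`` and ``to_pylist``), Python ``timedelta`` values map onto
+    duration types directly:
+
+    >>> import datetime
+    >>> import pyarrow as pa
+    >>> pa.array([datetime.timedelta(minutes=1)], type=pa.duration("s")).to_pylist()
+    [datetime.timedelta(seconds=60)]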
+ + Returns + ------- + type : pyarrow.DurationType + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("us") + DurationType(duration[us]) + >>> pa.duration("s") + DurationType(duration[s]) + + Create an array with duration type: + + >>> pa.array([0, 1, 2], type=pa.duration("s")) + + [ + 0, + 1, + 2 + ] + """ + +def month_day_nano_interval() -> MonthDayNanoIntervalType: + """ + Create instance of an interval type representing months, days and + nanoseconds between two dates. + + Examples + -------- + Create an instance of an month_day_nano_interval type: + + >>> import pyarrow as pa + >>> pa.month_day_nano_interval() + DataType(month_day_nano_interval) + + Create a scalar with month_day_nano_interval type: + + >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) + + """ + +def date32() -> Date32Type: + """ + Create instance of 32-bit date (days since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 32-bit date type: + + >>> import pyarrow as pa + >>> pa.date32() + DataType(date32[day]) + + Create a scalar with 32-bit date type: + + >>> from datetime import date + >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) + + """ + +def date64() -> Date64Type: + """ + Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 64-bit date type: + + >>> import pyarrow as pa + >>> pa.date64() + DataType(date64[ms]) + + Create a scalar with 64-bit date type: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) + + """ + +def float16() -> Float16Type: + """ + Create half-precision floating point type. + + Examples + -------- + Create an instance of float16 type: + + >>> import pyarrow as pa + >>> pa.float16() + DataType(halffloat) + >>> print(pa.float16()) + halffloat + + Create an array with float16 type: + + >>> arr = np.array([1.5, np.nan], dtype=np.float16) + >>> a = pa.array(arr, type=pa.float16()) + >>> a + + [ + 15872, + 32256 + ] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [, ] + """ + +def float32() -> Float32Type: + """ + Create single-precision floating point type. + + Examples + -------- + Create an instance of float32 type: + + >>> import pyarrow as pa + >>> pa.float32() + DataType(float) + >>> print(pa.float32()) + float + + Create an array with float32 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) + + [ + 0, + 1, + 2 + ] + """ + +def float64() -> Float64Type: + """ + Create double-precision floating point type. + + Examples + -------- + Create an instance of float64 type: + + >>> import pyarrow as pa + >>> pa.float64() + DataType(double) + >>> print(pa.float64()) + double -class Schema(_Weakrefable): - def __len__(self) -> int: ... - def __getitem__(self, key: str) -> Field: ... - _field = __getitem__ # pyright: ignore[reportUnknownVariableType] - def __iter__(self) -> Iterator[Field]: ... - def __hash__(self) -> int: ... - def __sizeof__(self) -> int: ... - @property - def pandas_metadata(self) -> dict: ... - @property - def names(self) -> list[str]: ... - @property - def types(self) -> list[DataType]: ... - @property - def metadata(self) -> dict[bytes, bytes]: ... - def empty_table(self) -> Table: ... - def equals(self, other: Schema, check_metadata: bool = False) -> bool: ... 
- @classmethod - def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: ... - def field(self, i: int | str | bytes) -> Field: ... - def field_by_name(self, name: str) -> Field: ... - def get_field_index(self, name: str) -> int: ... - def get_all_field_indices(self, name: str) -> list[int]: ... - def append(self, field: Field) -> Schema: ... - def insert(self, i: int, field: Field) -> Schema: ... - def remove(self, i: int) -> Schema: ... - def set(self, i: int, field: Field) -> Schema: ... - def add_metadata(self, metadata: dict) -> Schema: ... - def with_metadata(self, metadata: dict) -> Schema: ... - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... - def remove_metadata(self) -> Schema: ... - def to_string( - self, - truncate_metadata: bool = True, - show_field_metadata: bool = True, - show_schema_metadata: bool = True, - ) -> str: ... - def _export_to_c(self, out_ptr: int) -> None: ... - @classmethod - def _import_from_c(cls, in_ptr: int) -> Schema: ... - def __arrow_c_schema__(self) -> Any: ... - @staticmethod - def _import_from_c_capsule(schema: Any) -> Schema: ... + Create an array with float64 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) + + [ + 0, + 1, + 2 + ] + """ -def unify_schemas( - schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" -) -> Schema: ... -@overload -def field(name: SupportArrowSchema) -> Field[Any]: ... -@overload -def field( - name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None -) -> Field[_DataTypeT]: ... -def null() -> NullType: ... -def bool_() -> BoolType: ... -def uint8() -> UInt8Type: ... -def int8() -> Int8Type: ... -def uint16() -> UInt16Type: ... -def int16() -> Int16Type: ... -def uint32() -> Uint32Type: ... -def int32() -> Int32Type: ... -def int64() -> Int64Type: ... -def uint64() -> UInt64Type: ... -def tzinfo_to_string(tz: dt.tzinfo) -> str: ... -def string_to_tzinfo(name: str) -> dt.tzinfo: ... -@overload -def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... -@overload -def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... -def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: ... -def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: ... -def duration(unit: _Unit) -> DurationType[_Unit]: ... -def month_day_nano_interval() -> MonthDayNanoIntervalType: ... -def date32() -> Date32Type: ... -def date64() -> Date64Type: ... -def float16() -> Float16Type: ... -def float32() -> Float32Type: ... -def float64() -> Float64Type: ... @overload def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... @overload def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... +def decimal32(*args, **kwargs): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. 
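+
+    As a quick, illustrative check of the scaled-integer rule above (plain
+    Python arithmetic only, nothing pyarrow-specific is assumed): the stored
+    integer is the value shifted by ``scale`` digits, and multiplying or
+    dividing by the same power of ten recovers the logical value:
+
+    >>> 1234567 / 10**3  # value of stored integer 1234567 at scale 3
+    1234.567
+    >>> 12345 * 10**3  # value of stored integer 12345 at scale -3
+    12345000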
+ + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal32(5, 2)) + + [ + 123.45 + ] + """ + @overload def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... @overload def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... +def decimal64(*args, **kwargs): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal64(5, 2)) + + [ + 123.45 + ] + """ + @overload def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... @overload def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... +def decimal128(*args, **kwargs): + """ + Create decimal type with precision and scale and 128-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal128(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 128-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal128(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 128-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 38 significant digits, consider + using ``decimal256``. 
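+
+    As an added, hedged illustration of that limit (only the documented
+    ``pa.decimal128`` and ``pa.array`` behaviour is assumed), a 38-digit
+    value still round-trips exactly:
+
+    >>> import decimal
+    >>> import pyarrow as pa
+    >>> big = decimal.Decimal("9" * 38)
+    >>> pa.array([big], type=pa.decimal128(38, 0))[0].as_py() == big
+    True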
+ + Parameters + ---------- + precision : int + Must be between 1 and 38 + scale : int + + Returns + ------- + decimal_type : Decimal128Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal128(5, 2)) + + [ + 123.45 + ] + """ + @overload def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... @overload def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... -def string() -> StringType: ... +def decimal256(*args, **kwargs): + """ + Create decimal type with precision and scale and 256-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + For most use cases, the maximum precision offered by ``decimal128`` + is sufficient, and it will result in a more compact and more efficient + encoding. ``decimal256`` is useful if you need a precision higher + than 38 significant digits. + + Parameters + ---------- + precision : int + Must be between 1 and 76 + scale : int + + Returns + ------- + decimal_type : Decimal256Type + """ + +def string() -> StringType: + """ + Create UTF8 variable-length string type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string() + DataType(string) + + and use the string type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.string()) + + [ + "foo", + "bar", + "baz" + ] + """ utf8 = string +""" +Alias for string(). + +Examples +-------- +Create an instance of a string type: + +>>> import pyarrow as pa +>>> pa.utf8() +DataType(string) + +and use the string type to create an array: + +>>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + +[ + "foo", + "bar", + "baz" +] +""" @overload def binary(length: Literal[-1] = ...) -> BinaryType: ... @overload def binary(length: int) -> FixedSizeBinaryType: ... -def large_binary() -> LargeBinaryType: ... -def large_string() -> LargeStringType: ... +def binary(length): + """ + Create variable-length or fixed size binary type. + + Parameters + ---------- + length : int, optional, default -1 + If length == -1 then return a variable length binary type. If length is + greater than or equal to 0 then return a fixed size binary type of + width `length`. + + Examples + -------- + Create an instance of a variable-length binary type: + + >>> import pyarrow as pa + >>> pa.binary() + DataType(binary) + + and use the variable-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) + + [ + 666F6F, + 626172, + 62617A + ] + + Create an instance of a fixed-size binary type: + + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + + and use the fixed-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_binary() -> LargeBinaryType: + """ + Create large variable-length binary type. + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer binary(). 
+ + Examples + -------- + Create an instance of large variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_binary() + DataType(large_binary) + + and use the type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_string() -> LargeStringType: + """ + Create large UTF8 variable-length string type. + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer string(). + + Examples + -------- + Create an instance of large UTF8 variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_string() + DataType(large_string) + + and use the type to create an array: + + >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) + + [ + "foo", + "bar", + ... + "foo", + "bar" + ] + """ large_utf8 = large_string +""" +Alias for large_string(). + +Examples +-------- +Create an instance of large UTF8 variable-length binary type: + +>>> import pyarrow as pa +>>> pa.large_utf8() +DataType(large_string) + +and use the type to create an array: + +>>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) + +[ + "foo", + "bar", + ... + "foo", + "bar" +] +""" + +def binary_view() -> BinaryViewType: + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + +def string_view() -> StringViewType: + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ -def binary_view() -> BinaryViewType: ... -def string_view() -> StringViewType: ... @overload def list_( value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... @@ -448,17 +3425,199 @@ def list_( def list_( value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size ) -> FixedSizeListType[_DataTypeT, _Size]: ... -def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: ... -def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: ... +def list_(*args, **kwargs): + """ + Create ListType instance from child data type or field. + + Parameters + ---------- + value_type : DataType or Field + list_size : int, optional, default -1 + If length == -1 then return a variable length list type. If length is + greater than or equal to 0 then return a fixed size list type. + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + + Use the ListType to create a scalar: + + >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) + + + or an array: + + >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + +def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: + """ + Create LargeListType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. + Unless you need to represent data larger than 2**31 elements, you should + prefer list_(). 
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.int8()) + LargeListType(large_list) + + Use the LargeListType to create an array: + + >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) + + [ + [ + -1, + 3 + ], + [ + -1, + 3 + ], + ... + """ + +def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + def large_list_view( value_type: _DataTypeT | Field[_DataTypeT], -) -> LargeListViewType[_DataTypeT]: ... +) -> LargeListViewType[_DataTypeT]: + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + @overload def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... @overload def map_( key_type: _K, item_type: _ValueT, key_sorted: _Ordered ) -> MapType[_K, _ValueT, _Ordered]: ... +def map_(*args, **kwargs): + """ + Create MapType instance from key and item data types or fields. + + Parameters + ---------- + key_type : DataType or Field + item_type : DataType or Field + keys_sorted : bool + + Returns + ------- + map_type : DataType + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + + Use MapType to create an array: + + >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]] + >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) + + [ + keys: + [ + "a", + "b" + ] + values: + [ + 1, + 2 + ], + keys: + [ + "c" + ] + values: + [ + 3 + ] + ] + """ + @overload def dictionary( index_type: _IndexT, value_type: _BasicValueT @@ -467,16 +3626,153 @@ def dictionary( def dictionary( index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered ) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... +def dictionary(*args, **kwargs): + """ + Dictionary (categorical, or simply encoded) type. + + Parameters + ---------- + index_type : DataType + value_type : DataType + ordered : bool + + Returns + ------- + type : DictionaryType + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + + Use dictionary type to create an array: + + >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8())) + + ... 
+ -- dictionary: + [ + "a", + "b", + "d" + ] + -- indices: + [ + 0, + 1, + null, + 2 + ] + """ + def struct( fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] | Mapping[str, Field[Any]], -) -> StructType: ... +) -> StructType: + """ + Create StructType instance from fields. + + A struct is a nested type parameterized by an ordered sequence of types + (which can all be distinct), called its fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + + Examples + -------- + Create an instance of StructType from an iterable of tuples: + + >>> import pyarrow as pa + >>> fields = [ + ... ("f1", pa.int32()), + ... ("f2", pa.string()), + ... ] + >>> struct_type = pa.struct(fields) + >>> struct_type + StructType(struct) + + Retrieve a field from a StructType: + + >>> struct_type[0] + pyarrow.Field + >>> struct_type["f1"] + pyarrow.Field + + Create an instance of StructType from an iterable of Fields: + + >>> fields = [ + ... pa.field("f1", pa.int32()), + ... pa.field("f2", pa.string(), nullable=False), + ... ] + >>> pa.struct(fields) + StructType(struct) + + Returns + ------- + type : DataType + """ + def sparse_union( child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> SparseUnionType: ... +) -> SparseUnionType: + """ + Create SparseUnionType from child fields. + + A sparse union is a nested type where each logical value is taken from + a single child. A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + In a sparse union, each child array should have the same length as the + union array, regardless of the actual number of union values that + refer to it. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : SparseUnionType + """ + def dense_union( child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> DenseUnionType: ... +) -> DenseUnionType: + """ + Create DenseUnionType from child fields. + + A dense union is a nested type where each logical value is taken from + a single child, at a specific offset. A buffer of 8-bit type ids + indicates which child a given logical value is to be taken from, + and a buffer of 32-bit offsets indicates at which physical position + in the given child array the logical value is to be taken from. + + Unlike a sparse union, a dense union allows encoding only the child array + values which are actually referred to by the union array. This is + counterbalanced by the additional footprint of the offsets buffer, and + the additional indirection cost when looking up values. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : DenseUnionType + """ + @overload def union( child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None @@ -485,19 +3781,244 @@ def union( def union( child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None ) -> DenseUnionType: ... +def union(*args, **kwargs): + """ + Create UnionType from child fields. 
+ + A union is a nested type where each logical value is taken from a + single child. A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + Unions come in two flavors: sparse and dense + (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + mode : str + Must be 'sparse' or 'dense' + type_codes : list of integers, default None + + Returns + ------- + type : UnionType + """ + def run_end_encoded( run_end_type: _RunEndType, value_type: _BasicValueT -) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... -def json_(storage_type: DataType = ...) -> JsonType: ... -def uuid() -> UuidType: ... +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: + """ + Create RunEndEncodedType from run-end and value types. + + Parameters + ---------- + run_end_type : pyarrow.DataType + The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. + value_type : pyarrow.DataType + The type of the values array. + + Returns + ------- + type : RunEndEncodedType + """ + +def json_(storage_type: DataType = ...) -> JsonType: + """ + Create instance of JSON extension type. + + Parameters + ---------- + storage_type : DataType, default pyarrow.string() + The underlying data type. Can be on of the following types: + string, large_string, string_view. + + Returns + ------- + type : JsonType + + Examples + -------- + Create an instance of JSON extension type: + + >>> import pyarrow as pa + >>> pa.json_(pa.utf8()) + JsonType(extension) + + Use the JSON type to create an array: + + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) + + [ + "{"a": 1}", + "{"b": 2}" + ] + """ + +def uuid() -> UuidType: + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + def fixed_shape_tensor( value_type: _ValueT, shape: Sequence[int], dim_names: Sequence[str] | None = None, permutation: Sequence[int] | None = None, -) -> FixedShapeTensorType[_ValueT]: ... -def bool8() -> Bool8Type: ... -def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ... +) -> FixedShapeTensorType[_ValueT]: + """ + Create instance of fixed shape tensor extension type with shape and optional + names of tensor dimensions and indices of the desired logical + ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + shape : tuple or list of integers + The physical shape of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
+ + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a table with fixed shape tensor extension array: + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] + + Create an instance of fixed shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : FixedShapeTensorType + """ + +def bool8() -> Bool8Type: + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: + """ + Create instance of opaque extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + type_name : str + The name of the type in the external system. + vendor_name : str + The name of the external system. + + Examples + -------- + Create an instance of an opaque extension type: + + >>> import pyarrow as pa + >>> type = pa.opaque(pa.binary(), "other", "jdbc") + >>> type + OpaqueType(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(binary) + >>> type.type_name + 'other' + >>> type.vendor_name + 'jdbc' + + Create a table with an opaque array: + + >>> arr = [None, b"foobar"] + >>> storage = pa.array(arr, pa.binary()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[null,666F6F626172]] + + Returns + ------- + type : OpaqueType + """ + @overload def type_for_alias(name: Literal["null"]) -> NullType: ... @overload @@ -568,6 +4089,20 @@ def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]] def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... @overload def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def type_for_alias(name): + """ + Return DataType given a string alias if one exists. + + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. 
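+
+    As a hedged illustration (the set of aliases is defined by pyarrow
+    itself, not by this stub), string aliases resolve to ordinary type
+    instances:
+
+    >>> import pyarrow as pa
+    >>> pa.type_for_alias("int32")
+    DataType(int32)
+    >>> pa.type_for_alias("timestamp[ms]")
+    TimestampType(timestamp[ms])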
+ + Returns + ------- + type : DataType + """ + @overload def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... @overload @@ -645,11 +4180,112 @@ def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalT def schema( fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], metadata: dict[bytes | str, bytes | str] | None = None, -) -> Schema: ... -def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: ... -def is_boolean_value(obj: Any) -> bool: ... -def is_integer_value(obj: Any) -> bool: ... -def is_float_value(obj: Any) -> bool: ... +) -> Schema: + """ + Construct pyarrow.Schema from collection of fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Can also pass an object that implements the Arrow PyCapsule Protocol + for schemas (has an ``__arrow_c_schema__`` method). + metadata : dict, default None + Keys and values must be coercible to bytes. + + Examples + -------- + Create a Schema from iterable of tuples: + + >>> import pyarrow as pa + >>> pa.schema( + ... [ + ... ("some_int", pa.int32()), + ... ("some_string", pa.string()), + ... pa.field("some_required_string", pa.string(), nullable=False), + ... ] + ... ) + some_int: int32 + some_string: string + some_required_string: string not null + + Create a Schema from iterable of Fields: + + >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) + some_int: int32 + some_string: string + + DataTypes can also be passed as strings. The following is equivalent to the + above example: + + >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) + some_int: int32 + some_string: string + + Or more concisely: + + >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) + some_int: int32 + some_string: string + + Returns + ------- + schema : pyarrow.Schema + """ + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: + """ + Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : the numpy dtype to convert + + + Examples + -------- + Create a pyarrow DataType from NumPy dtype: + + >>> import pyarrow as pa + >>> import numpy as np + >>> pa.from_numpy_dtype(np.dtype("float16")) + DataType(halffloat) + >>> pa.from_numpy_dtype("U") + DataType(string) + >>> pa.from_numpy_dtype(bool) + DataType(bool) + >>> pa.from_numpy_dtype(np.str_) + DataType(string) + """ + +def is_boolean_value(obj: Any) -> bool: + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_integer_value(obj: Any) -> bool: + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_float_value(obj: Any) -> bool: + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ __all__ = [ "_Weakrefable", diff --git a/pyarrow-stubs/_azurefs.pyi b/pyarrow-stubs/_azurefs.pyi index ee4529e8f01..317943ce20f 100644 --- a/pyarrow-stubs/_azurefs.pyi +++ b/pyarrow-stubs/_azurefs.pyi @@ -3,6 +3,65 @@ from typing import Literal from ._fs import FileSystem class AzureFileSystem(FileSystem): + """ + Azure Blob Storage backed FileSystem implementation + + This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. + Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific + features will be used when they provide a performance advantage. 
Azurite emulator is + also supported. Note: `/` is the only supported delimiter. + + The storage account is considered the root of the filesystem. When enabled, containers + will be created or deleted during relevant directory operations. Obviously, this also + requires authentication with the additional permissions. + + By default `DefaultAzureCredential `__ + is used for authentication. This means it will try several types of authentication + and go with the first one that works. If any authentication parameters are provided when + initialising the FileSystem, they will be used instead of the default credential. + + Parameters + ---------- + account_name : str + Azure Blob Storage account name. This is the globally unique identifier for the + storage account. + account_key : str, default None + Account key of the storage account. If sas_token and account_key are None the + default credential will be used. The parameters account_key and sas_token are + mutually exclusive. + blob_storage_authority : str, default None + hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful + for connecting to a local emulator, like Azurite. + dfs_storage_authority : str, default None + hostname[:port] of the Data Lake Gen 2 Service. Defaults to + `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. + blob_storage_scheme : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like Azurite. + dfs_storage_scheme : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like Azurite. + sas_token : str, default None + SAS token for the storage account, used as an alternative to account_key. If sas_token + and account_key are None the default credential will be used. The parameters + account_key and sas_token are mutually exclusive. + + Examples + -------- + >>> from pyarrow import fs + >>> azure_fs = fs.AzureFileSystem(account_name="myaccount") + >>> azurite_fs = fs.AzureFileSystem( + ... account_name="devstoreaccount1", + ... account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", + ... blob_storage_authority="127.0.0.1:10000", + ... dfs_storage_authority="127.0.0.1:10000", + ... blob_storage_scheme="http", + ... dfs_storage_scheme="http", + ... ) + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__( self, account_name: str, diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi index dce71343ef4..3d61ae42787 100644 --- a/pyarrow-stubs/_compute.pyi +++ b/pyarrow-stubs/_compute.pyi @@ -14,26 +14,91 @@ from . import lib _Order: TypeAlias = Literal["ascending", "descending"] _Placement: TypeAlias = Literal["at_start", "at_end"] -class Kernel(lib._Weakrefable): ... +class Kernel(lib._Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ class Function(lib._Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... 
+ + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ @property - def arity(self) -> int: ... + def arity(self) -> int: + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ @property def kind( self, - ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: + """ + The function kind. + """ @property - def name(self) -> str: ... + def name(self) -> str: + """ + The function name. + """ @property - def num_kernels(self) -> int: ... + def num_kernels(self) -> int: + """ + The number of kernels implementing this function. + """ def call( self, args: Iterable, options: FunctionOptions | None = None, memory_pool: lib.MemoryPool | None = None, length: int | None = None, - ) -> Any: ... + ) -> Any: + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. + """ class FunctionOptions(lib._Weakrefable): def serialize(self) -> lib.Buffer: ... @@ -41,8 +106,20 @@ class FunctionOptions(lib._Weakrefable): def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... class FunctionRegistry(lib._Weakrefable): - def get_function(self, name: str) -> Function: ... - def list_functions(self) -> list[str]: ... + def get_function(self, name: str) -> Function: + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + + def list_functions(self) -> list[str]: + """ + Return all function names in the registry. + """ class HashAggregateFunction(Function): ... class HashAggregateKernel(Kernel): ... @@ -55,6 +132,18 @@ class VectorKernel(Kernel): ... # ==================== _compute.pyx Option classes ==================== class ArraySortOptions(FunctionOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ def __init__( self, order: _Order = "ascending", @@ -62,6 +151,21 @@ class ArraySortOptions(FunctionOptions): ) -> None: ... class AssumeTimezoneOptions(FunctionOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. 
+ ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + def __init__( self, timezone: str, @@ -71,6 +175,27 @@ class AssumeTimezoneOptions(FunctionOptions): ) -> None: ... class CastOptions(FunctionOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + allow_int_overflow: bool allow_time_truncate: bool allow_time_overflow: bool @@ -96,46 +221,189 @@ class CastOptions(FunctionOptions): def is_safe(self) -> bool: ... class CountOptions(FunctionOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + """ def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... class CumulativeOptions(FunctionOptions): + """ + Options for `cumulative_*` functions. + + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... class CumulativeSumOptions(FunctionOptions): + """ + Options for `cumulative_sum` function. + + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... class DayOfWeekOptions(FunctionOptions): + """ + Options for the `day_of_week` function. + + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... class DictionaryEncodeOptions(FunctionOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. 
+ Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... class RunEndEncodeOptions(FunctionOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ # TODO: default is DataType(int32) def __init__(self, run_end_type: lib.DataType = ...) -> None: ... class ElementWiseAggregateOptions(FunctionOptions): + """ + Options for element-wise aggregate functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + """ def __init__(self, *, skip_nulls: bool = True) -> None: ... class ExtractRegexOptions(FunctionOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ def __init__(self, pattern: str) -> None: ... class ExtractRegexSpanOptions(FunctionOptions): + """ + Options for the `extract_regex_span` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ def __init__(self, pattern: str) -> None: ... class FilterOptions(FunctionOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... class IndexOptions(FunctionOptions): + """ + Options for the `index` function. + + Parameters + ---------- + value : Scalar + The value to search for. + """ def __init__(self, value: lib.Scalar) -> None: ... class JoinOptions(FunctionOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ @overload def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... @overload def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... class ListSliceOptions(FunctionOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ def __init__( self, start: int, @@ -145,9 +413,31 @@ class ListSliceOptions(FunctionOptions): ) -> None: ... 
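+
+# Usage sketch (illustrative only): option classes such as ListSliceOptions above
+# configure the corresponding pyarrow.compute call, either through that function's
+# keyword arguments or through an explicit options object, e.g.
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#   arr = pa.array([[1, 2, 3], [4, 5]])
+#   pc.list_slice(arr, 0, stop=2)                                       # keyword form
+#   pc.call_function("list_slice", [arr], options=pc.ListSliceOptions(0, stop=2))
+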
class ListFlattenOptions(FunctionOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ def __init__(self, recursive: bool = False) -> None: ... class MakeStructOptions(FunctionOptions): + """ + Options for the `make_struct` function. + + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ def __init__( self, field_names: Sequence[str] = (), @@ -157,35 +447,146 @@ class MakeStructOptions(FunctionOptions): ) -> None: ... class MapLookupOptions(FunctionOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ # TODO: query_key: Scalar or Object can be converted to Scalar def __init__( self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] ) -> None: ... class MatchSubstringOptions(FunctionOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... class ModeOptions(FunctionOptions): + """ + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... class NullOptions(FunctionOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ def __init__(self, *, nan_is_null: bool = False) -> None: ... class PadOptions(FunctionOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + """ def __init__( self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True ) -> None: ... class PairwiseOptions(FunctionOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ def __init__(self, period: int = 1) -> None: ... class PartitionNthOptions(FunctionOptions): + """ + Options for the `partition_nth_indices` function. 
+ + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... class WinsorizeOptions(FunctionOptions): + """ + Options for the `winsorize` function. + + Parameters + ---------- + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile's value. + """ def __init__(self, lower_limit: float, upper_limit: float) -> None: ... class QuantileOptions(FunctionOptions): + """ + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__( self, q: float | Sequence[float], @@ -196,9 +597,48 @@ class QuantileOptions(FunctionOptions): ) -> None: ... class RandomOptions(FunctionOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... class RankOptions(FunctionOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. 
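+
+    Examples
+    --------
+    Illustrative sketch (default ``tiebreaker="first"`` and nulls sorted at the end):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.rank(pa.array([3, 1, None, 2]), sort_keys="ascending").to_pylist()
+    [3, 1, 4, 2]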
+ """ def __init__( self, sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", @@ -208,6 +648,23 @@ class RankOptions(FunctionOptions): ) -> None: ... class RankQuantileOptions(FunctionOptions): + """ + Options for the `rank_quantile` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + """ + def __init__( self, sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", @@ -216,6 +673,21 @@ class RankQuantileOptions(FunctionOptions): ) -> None: ... class PivotWiderOptions(FunctionOptions): + """ + Options for the `pivot_wider` function. + + Parameters + ---------- + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. + """ def __init__( self, key_names: Sequence[str], @@ -224,9 +696,34 @@ class PivotWiderOptions(FunctionOptions): ) -> None: ... class ReplaceSliceOptions(FunctionOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + """ def __init__(self, start: int, stop: int, replacement: str) -> None: ... class ReplaceSubstringOptions(FunctionOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ def __init__( self, pattern: str, replacement: str, *, max_replacements: int | None = None ) -> None: ... @@ -245,12 +742,36 @@ _RoundMode: TypeAlias = Literal[ ] class RoundBinaryOptions(FunctionOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ def __init__( self, round_mode: _RoundMode = "half_to_even", ) -> None: ... class RoundOptions(FunctionOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". 
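+
+    Examples
+    --------
+    Illustrative sketch of the default "half_to_even" tie-breaking:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.round(pa.array([2.5, 3.5, -2.5]), ndigits=0).to_pylist()
+    [2.0, 4.0, -2.0]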
+ """ def __init__( self, ndigits: int = 0, @@ -272,6 +793,47 @@ _DateTimeUint: TypeAlias = Literal[ ] class RoundTemporalOptions(FunctionOptions): + """ + Options for rounding temporal values. + + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + """ def __init__( self, multiple: int = 1, @@ -283,50 +845,222 @@ class RoundTemporalOptions(FunctionOptions): ) -> None: ... class RoundToMultipleOptions(FunctionOptions): + """ + Options for rounding numbers to a multiple. + + Parameters + ---------- + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... class ScalarAggregateOptions(FunctionOptions): + """ + Options for scalar aggregations. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... class SelectKOptions(FunctionOptions): + """ + Options for top/bottom k-selection. + + Parameters + ---------- + k : int + Number of leading values to select in sorted order + (i.e. 
the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + """ + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... class SetLookupOptions(FunctionOptions): + """ + Options for the `is_in` and `index_in` functions. + + Parameters + ---------- + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + """ def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... class SliceOptions(FunctionOptions): + """ + Options for slicing. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... class SortOptions(FunctionOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ def __init__( self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" ) -> None: ... class SplitOptions(FunctionOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... class SplitPatternOptions(FunctionOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ def __init__( self, pattern: str, *, max_splits: int | None = None, reverse: bool = False ) -> None: ... class StrftimeOptions(FunctionOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... class StrptimeOptions(FunctionOptions): + """ + Options for the `strptime` function. 
+ + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ def __init__( self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False ) -> None: ... class StructFieldOptions(FunctionOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ def __init__( self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int ) -> None: ... class TakeOptions(FunctionOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ def __init__(self, boundscheck: bool = True) -> None: ... class TDigestOptions(FunctionOptions): + """ + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__( self, q: float | Sequence[float] = 0.5, @@ -338,20 +1072,84 @@ class TDigestOptions(FunctionOptions): ) -> None: ... class TrimOptions(FunctionOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ def __init__(self, characters: str) -> None: ... class Utf8NormalizeOptions(FunctionOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". + """ + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... class VarianceOptions(FunctionOptions): + """ + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... class SkewOptions(FunctionOptions): + """ + Options for the `skew` and `kurtosis` functions. 
+ + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ def __init__( self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 ) -> None: ... class WeekOptions(FunctionOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). + first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + """ def __init__( self, *, @@ -368,16 +1166,65 @@ def call_function( options: FunctionOptions | None = None, memory_pool: lib.MemoryPool | None = None, length: int | None = None, -) -> Any: ... +) -> Any: + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + def function_registry() -> FunctionRegistry: ... -def get_function(name: str) -> Function: ... -def list_functions() -> list[str]: ... +def get_function(name: str) -> Function: + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + +def list_functions() -> list[str]: + """ + Return all function names in the global registry. + """ # ==================== _compute.pyx Udf ==================== def call_tabular_function( function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None -) -> lib.RecordBatchReader: ... +) -> lib.RecordBatchReader: + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ class _FunctionDoc(TypedDict): summary: str @@ -390,7 +1237,81 @@ def register_scalar_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -) -> None: ... +) -> None: + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. 
a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> + >>> def add_constant(ctx, array): + ... return pc.add(array, 1, memory_pool=ctx.memory_pool) + >>> + >>> func_name = "py_add_func" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.int64() + >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> answer = pc.call_function(func_name, [pa.array([20])]) + >>> answer + + [ + 21 + ] + """ + def register_tabular_function( func: Callable, function_name: str, @@ -398,7 +1319,40 @@ def register_tabular_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -): ... +) -> None: + """ + Register a user-defined tabular function. + + This API is EXPERIMENTAL. + + A tabular function is one accepting a context argument of type + UdfContext and returning a generator of struct arrays. + The in_types argument must be empty and the out_type argument + specifies a schema. Each struct array must have field types + corresponding to the schema. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The only argument is the context argument of type + UdfContext. It must return a callable that + returns on each invocation a StructArray matching + the out_type, where an empty array indicates end. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + Must be an empty dictionary (reserved for future use). + out_type : Union[Schema, DataType] + Schema of the function's output, or a corresponding flat struct type. 
+ func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + def register_aggregate_function( func: Callable, function_name: str, @@ -406,7 +1360,89 @@ def register_aggregate_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -): ... +) -> None: + """ + Register a user-defined non-decomposable aggregate function. + + This API is EXPERIMENTAL. + + A non-decomposable aggregation function is a function that executes + aggregate operations on the whole data that it is aggregating. + In other words, non-decomposable aggregate function cannot be + split into consume/merge/finalize steps. + + This is often used with ordered or segmented aggregation where groups + can be emit before accumulating all of the input data. + + Note that currently the size of any input column cannot exceed 2 GB + for a single segment (all groups combined). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return a Scalar matching the + out_type. + To define a varargs function, pass a callable that takes + *args. The in_type needs to match in type of inputs when + the function gets called. + function_name : str + Name of the function. This name must be unique, i.e., + there should only be one function registered with + this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import numpy as np + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple median udf" + >>> func_doc["description"] = "compute median" + >>> + >>> def compute_median(ctx, array): + ... return pa.scalar(np.median(array)) + >>> + >>> func_name = "py_compute_median" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.float64() + >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_compute_median' + >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) + >>> answer + + >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) + >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) + >>> result + pyarrow.Table + k: int64 + v_py_compute_median: double + ---- + k: [[1,2]] + v_py_compute_median: [[15,35]] + """ + def register_vector_function( func: Callable, function_name: str, @@ -414,21 +1450,182 @@ def register_vector_function( in_types: dict[str, lib.DataType], out_type: lib.DataType, func_registry: FunctionRegistry | None = None, -): ... +) -> None: + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. 
Vector function is often used + when compute doesn't fit other more specific types of + functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "percent rank" + >>> func_doc["description"] = "compute percent rank" + >>> + >>> def list_flatten_udf(ctx, x): + ... return pc.list_flatten(x) + >>> + >>> func_name = "list_flatten_udf" + >>> in_types = {"array": pa.list_(pa.int64())} + >>> out_type = pa.int64() + >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type) + >>> + >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) + >>> answer + + [ + 1, + 2, + 3, + 4 + ] + """ class UdfContext: + """ + Per-invocation function context/state. + + This object will always be the first argument to a user-defined + function. It should not be used outside of a call to the function. + """ + @property - def batch_length(self) -> int: ... + def batch_length(self) -> int: + """ + The common length of all input arguments (int). + + In the case that all arguments are scalars, this value + is used to pass the "actual length" of the arguments, + e.g. because the scalar values are encoding a column + with a constant value. + """ @property - def memory_pool(self) -> lib.MemoryPool: ... + def memory_pool(self) -> lib.MemoryPool: + """ + A memory pool for allocations (:class:`MemoryPool`). + + This is the memory pool supplied by the user when they invoked + the function and it should be used in any calls to arrow that the + UDF makes if that call accepts a memory_pool. + """ # ==================== _compute.pyx Expression ==================== class Expression(lib._Weakrefable): + """ + A logical expression to be evaluated against some input. + + To create an expression: + + - Use the factory function ``pyarrow.compute.scalar()`` to create a + scalar (not necessary when combined, see example below). + - Use the factory function ``pyarrow.compute.field()`` to reference + a field (column in table). + - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. + - Combine expressions using python operators ``&`` (logical and), + ``|`` (logical or) and ``~`` (logical not). + Note: python keywords ``and``, ``or`` and ``not`` cannot be used + to combine expressions. 
+
+      - Create expression predicates using Expression methods such as
+        ``pyarrow.compute.Expression.isin()``.
+
+    Examples
+    --------
+
+    >>> import pyarrow.compute as pc
+    >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7)
+    <pyarrow.compute.Expression ((a < 3) or (b > 7))>
+    >>> pc.field("a") != 3
+    <pyarrow.compute.Expression (a != 3)>
+    >>> pc.field("a").isin([1, 2, 3])
+
+    """
+
     @staticmethod
-    def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ...
-    def to_substrait(
-        self, schema: lib.Schema, allow_arrow_extensions: bool = False
-    ) -> lib.Buffer: ...
+    def from_substrait(buffer: bytes | lib.Buffer) -> Expression:
+        """
+        Deserialize an expression from Substrait
+
+        The serialized message must be an ExtendedExpression message that has
+        only a single expression. The name of the expression and the schema
+        the expression was bound to will be ignored. Use
+        pyarrow.substrait.deserialize_expressions if this information is needed
+        or if the message might contain multiple expressions.
+
+        Parameters
+        ----------
+        buffer : bytes or Buffer or a protobuf Message
+            The Substrait message to deserialize
+
+        Returns
+        -------
+        Expression
+            The deserialized expression
+        """
+    def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer:
+        """
+        Serialize the expression using Substrait
+
+        The expression will be serialized as an ExtendedExpression message that has a
+        single expression named "expression"
+
+        Parameters
+        ----------
+        schema : Schema
+            The input schema the expression will be bound to
+        allow_arrow_extensions : bool, default False
+            If False then only functions that are part of the core Substrait function
+            definitions will be allowed. Set this to True to allow pyarrow-specific functions
+            but the result may not be accepted by other compute libraries.
+
+        Returns
+        -------
+        Buffer
+            A buffer containing the serialized Protobuf plan.
+        """
     def __invert__(self) -> Expression: ...
     def __and__(self, other) -> Expression: ...
     def __or__(self, other) -> Expression: ...
@@ -442,12 +1639,83 @@ class Expression(lib._Weakrefable):
     def __ge__(self, value: object) -> Expression: ...  # type: ignore[override]
     def __le__(self, value: object) -> Expression: ...  # type: ignore[override]
     def __truediv__(self, other) -> Expression: ...
-    def is_valid(self) -> bool: ...
-    def is_null(self, nan_is_null: bool = False) -> Expression: ...
-    def is_nan(self) -> Expression: ...
+    def is_valid(self) -> bool:
+        """
+        Check whether the expression is not-null (valid).
+
+        This creates a new expression equivalent to calling the
+        `is_valid` compute function on this expression.
+
+        Returns
+        -------
+        is_valid : Expression
+        """
+    def is_null(self, nan_is_null: bool = False) -> Expression:
+        """
+        Check whether the expression is null.
+
+        This creates a new expression equivalent to calling the
+        `is_null` compute function on this expression.
+
+        Parameters
+        ----------
+        nan_is_null : boolean, default False
+            Whether floating-point NaNs are considered null.
+
+        Returns
+        -------
+        is_null : Expression
+        """
+    def is_nan(self) -> Expression:
+        """
+        Check whether the expression is NaN.
+
+        This creates a new expression equivalent to calling the
+        `is_nan` compute function on this expression.
+
+        Returns
+        -------
+        is_nan : Expression
+        """
     def cast(
         self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None
-    ) -> Expression: ...
-    def isin(self, values: lib.Array | Iterable) -> Expression: ...
+    ) -> Expression:
+        """
+        Explicitly set or change the expression's data type.
+ + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + def isin(self, values: lib.Array | Iterable) -> Expression: + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. + + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ # ==================== _compute.py ==================== diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi index 45c8dcf5485..2f49f8c9a6c 100644 --- a/pyarrow-stubs/_csv.pyi +++ b/pyarrow-stubs/_csv.pyi @@ -7,6 +7,95 @@ from . import lib @dataclass(kw_only=True) class ReadOptions(lib._Weakrefable): + """ + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. + autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. 
+ + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] + """ + use_threads: bool = field(default=True, kw_only=False) block_size: int | None = None skip_rows: int = 0 @@ -19,6 +108,80 @@ class ReadOptions(lib._Weakrefable): @dataclass(kw_only=True) class ParseOptions(lib._Weakrefable): + """ + Options for parsing CSV files. + + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... "# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... 
) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return "skip" + ... else: + ... return "error" + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + delimiter: str = field(default=",", kw_only=False) quote_char: str | Literal[False] = '"' double_quote: bool = True @@ -31,6 +194,209 @@ class ParseOptions(lib._Weakrefable): @dataclass(kw_only=True) class ConvertOptions(lib._Weakrefable): + """ + Options for converting CSV data. + + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). + false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. + If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. + If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. 
+ This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... ) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True + ... 
) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"], strings_can_be_null=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ + check_utf8: bool = field(default=True, kw_only=False) column_types: lib.Schema | dict | None = None null_values: list[str] | None = None @@ -49,6 +415,29 @@ class ConvertOptions(lib._Weakrefable): @dataclass(kw_only=True) class WriteOptions(lib._Weakrefable): + """ + Options for writing CSV files. + + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + include_header: bool = field(default=True, kw_only=False) batch_size: int = 1024 delimiter: str = "," @@ -58,12 +447,42 @@ class WriteOptions(lib._Weakrefable): @dataclass class InvalidRow(lib._Weakrefable): + """ + Description of an invalid row in a CSV file. + + Parameters + ---------- + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. 
+ """ + expected_columns: int actual_columns: int number: int | None text: str class CSVWriter(lib._CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + """ + def __init__( self, # TODO: OutputStream @@ -84,17 +503,139 @@ def open_csv( parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> CSVStreamingReader: ... +) -> CSVStreamingReader: + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.csv.CSVStreamingReader` + """ + def read_csv( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, convert_options: ConvertOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Table: ... +) -> lib.Table: + """ + Read a Table from a stream of CSV data. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.Table` + Contents of the CSV file as a in-memory table. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry\\n" + ... "Flamingo,2,2022-03-01\\n" + ... "Horse,4,2022-03-02\\n" + ... "Brittle stars,5,2022-03-03\\n" + ... "Centipede,100,2022-03-04" + ... 
) + >>> print(s) + animals,n_legs,entry + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Reading from the file + + >>> from pyarrow import csv + >>> csv.read_csv(source) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + def write_csv( data: lib.RecordBatch | lib.Table, output_file: StrPath | lib.NativeFile | IO[Any], write_options: WriteOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> None: ... +) -> None: + """ + Write record batch or table to a CSV file. + + Parameters + ---------- + data : pyarrow.RecordBatch or pyarrow.Table + The data to write. + output_file : string, path, pyarrow.NativeFile, or file-like object + The location where to write the CSV data. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + """ diff --git a/pyarrow-stubs/_cuda.pyi b/pyarrow-stubs/_cuda.pyi index c7533b6621d..ad52b2f380f 100644 --- a/pyarrow-stubs/_cuda.pyi +++ b/pyarrow-stubs/_cuda.pyi @@ -8,46 +8,277 @@ from . import lib from ._stubs_typing import ArrayLike class Context(lib._Weakrefable): - def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + """ + CUDA driver context. + """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. + + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. + handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. + """ @staticmethod - def from_numba(context: _numba_driver.Context | None = None) -> Context: ... - def to_numba(self) -> _numba_driver.Context: ... + def from_numba(context: _numba_driver.Context | None = None) -> Context: + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + def to_numba(self) -> _numba_driver.Context: + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ @staticmethod - def get_num_devices() -> int: ... 
+ def get_num_devices() -> int: + """Return the number of GPU devices.""" @property - def device_number(self) -> int: ... + def device_number(self) -> int: + """Return context device number.""" @property - def handle(self) -> int: ... - def synchronize(self) -> None: ... + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ @property - def bytes_allocated(self) -> int: ... - def get_device_address(self, address: int) -> int: ... - def new_buffer(self, nbytes: int) -> CudaBuffer: ... + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ @property - def memory_manager(self) -> lib.MemoryManager: ... + def memory_manager(self) -> lib.MemoryManager: + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ @property - def device(self) -> lib.Device: ... - def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: ... - def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + def device(self) -> lib.Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). + + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ def buffer_from_data( self, data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, offset: int = 0, size: int = -1, - ) -> CudaBuffer: ... - def buffer_from_object(self, obj: Any) -> CudaBuffer: ... + ) -> CudaBuffer: + """Create device buffer and initialize with data. 
+ + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. + + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ class IpcMemHandle(lib._Weakrefable): + """A serializable container for a CUDA IPC handle.""" @staticmethod - def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... - def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ class CudaBuffer(lib.Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + @staticmethod - def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + def from_buffer(buf: lib.Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ @staticmethod - def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... - def to_numba(self) -> _numba_driver.MemoryPointer: ... + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" def copy_to_host( self, position: int = 0, @@ -55,44 +286,271 @@ class CudaBuffer(lib.Buffer): buf: lib.Buffer | None = None, memory_pool: lib.MemoryPool | None = None, resizable: bool = False, - ) -> lib.Buffer: ... + ) -> lib.Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. 
Use + + `.context.synchronize()` + + when needed. + + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ def copy_from_host( self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 - ) -> int: ... - def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: ... - def export_for_ipc(self) -> IpcMemHandle: ... + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ @property - def context(self) -> Context: ... - def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... - def to_pybytes(self) -> bytes: ... + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" class HostBuffer(lib.Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ @property def size(self) -> int: ... class BufferReader(lib.NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. 
This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ def __init__(self, obj: CudaBuffer) -> None: ... - def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. + + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ class BufferWriter(lib.NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ def __init__(self, obj: CudaBuffer) -> None: ... - def writeat(self, position: int, data: ArrayLike) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. + """ @property - def buffer_size(self) -> int: ... + def buffer_size(self) -> int: + """Returns size of host (CPU) buffer, 0 for unbuffered""" @buffer_size.setter - def buffer_size(self, buffer_size: int): ... + def buffer_size(self, buffer_size: int): + """Set CPU buffer size to limit calls to cudaMemcpy + + Parameters + ---------- + buffer_size : int + Specify the size of CPU buffer to allocate in bytes. + """ @property - def num_bytes_buffered(self) -> int: ... + def num_bytes_buffered(self) -> int: + """Returns number of bytes buffered on host""" + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: + """Return buffer with CUDA-accessible memory on CPU host + + Parameters + ---------- + size : int + Specify the number of bytes to be allocated. + device : int + Specify GPU device number. + + Returns + ------- + dbuf : HostBuffer + Allocated host buffer + """ + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: + """Write record batch message to GPU device memory + + Parameters + ---------- + batch : RecordBatch + Record batch to write + ctx : Context + CUDA Context to allocate device memory from + + Returns + ------- + dbuf : CudaBuffer + device buffer which contains the record batch message + """ -def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... -def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... def read_message( source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None -) -> lib.Message: ... +) -> lib.Message: + """Read Arrow IPC message located on GPU device + + Parameters + ---------- + source : {CudaBuffer, cuda.BufferReader} + Device buffer or reader of device buffer. + pool : MemoryPool (optional) + Pool to allocate CPU memory for the metadata + + Returns + ------- + message : Message + The deserialized message, body still on device + """ + def read_record_batch( buffer: lib.Buffer, object: lib.Schema, *, dictionary_memo: lib.DictionaryMemo | None = None, pool: lib.MemoryPool | None = None, -) -> lib.RecordBatch: ... +) -> lib.RecordBatch: + """Construct RecordBatch referencing IPC message located on CUDA device. 
+ + While the metadata is copied to host memory for deserialization, + the record batch data remains on the device. + + Parameters + ---------- + buffer : + Device buffer containing the complete IPC message + schema : Schema + The schema for the record batch + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + pool : MemoryPool (optional) + Pool to allocate metadata from + + Returns + ------- + batch : RecordBatch + Reconstructed record batch, with device pointers + + """ diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index 593c0abc64b..af864f9154b 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -26,10 +26,47 @@ from .compute import Expression from .ipc import IpcWriteOptions, RecordBatchReader class Dataset(lib._Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + @property - def partition_expression(self) -> Expression: ... - def replace_schema(self, schema: lib.Schema) -> None: ... - def get_fragments(self, filter: Expression | None = None): ... + def partition_expression(self) -> Expression: + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + def replace_schema(self, schema: lib.Schema) -> None: + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + def get_fragments(self, filter: Expression | None = None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ def scanner( self, columns: list[str] | None = None, @@ -41,7 +78,122 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... + ) -> Scanner: + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner( + ... columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... } + ... ).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ def to_batches( self, columns: list[str] | None = None, @@ -53,7 +205,65 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: ... + ) -> Iterator[lib.RecordBatch]: + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. 
This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + record_batches : iterator of RecordBatch + """ def to_table( self, columns: list[str] | dict[str, Expression] | None = None, @@ -65,7 +275,68 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ def take( self, indices: Indices, @@ -78,7 +349,67 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. 
+ batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ def head( self, num_rows: int, @@ -91,7 +422,67 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. 
+ fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ def count_rows( self, columns: list[str] | None = None, @@ -103,11 +494,82 @@ class Dataset(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> int: ... + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ @property - def schema(self) -> lib.Schema: ... - def filter(self, expression: Expression) -> Self: ... - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + def schema(self) -> lib.Schema: + """The common schema of the full Dataset""" + def filter(self, expression: Expression) -> Self: + """ + Apply a row filter to the dataset. + + Parameters + ---------- + expression : Expression + The filter that should be applied to the dataset. + + Returns + ------- + Dataset + """ + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: + """ + Sort the Dataset by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + InMemoryDataset + A new dataset sorted according to the sort keys. 
+ """ def join( self, right_dataset: Dataset, @@ -118,7 +580,45 @@ class Dataset(lib._Weakrefable): right_suffix: str | None = None, coalesce_keys: bool = True, use_threads: bool = True, - ) -> InMemoryDataset: ... + ) -> InMemoryDataset: + """ + Perform a join between this dataset and another one. + + Result of the join will be a new dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + keys : str or list[str] + The columns from current dataset that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_dataset that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to right column names. This prevents confusion + when the columns in left and right datasets have colliding names. + right_suffix : str, default None + Which suffix to add to the left column names. This prevents confusion + when the columns in left and right datasets have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. + use_threads : bool, default True + Whenever to use multithreading or not. + + Returns + ------- + InMemoryDataset + """ def join_asof( self, right_dataset: Dataset, @@ -127,15 +627,115 @@ class Dataset(lib._Weakrefable): tolerance: int, right_on: str | list[str] | None = None, right_by: str | list[str] | None = None, - ) -> InMemoryDataset: ... + ) -> InMemoryDataset: + """ + Perform an asof join between this dataset and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both datasets must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + on : str + The column from current dataset that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input table must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current dataset that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row `right.on - left.on <= tolerance`. The + `tolerance` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. 
+ + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_dataset that should be used as the on key + on the join operation right side. + When ``None`` use the same key name as the left dataset. + right_by : str or list[str], default None + The columns from the right_dataset that should be used as by keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + + Returns + ------- + InMemoryDataset + """ + +class InMemoryDataset(Dataset): + """ + A Dataset wrapping in-memory data. -class InMemoryDataset(Dataset): ... + Parameters + ---------- + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader + If an iterable is provided, the schema must also be provided. + schema : Schema, optional + Only required if passing an iterable as the source + """ class UnionDataset(Dataset): + """ + A Dataset wrapping child datasets. + + Children's schemas must agree with the provided schema. + + Parameters + ---------- + schema : Schema + A known schema to conform to. + children : list of Dataset + One or more input children + """ + @property def children(self) -> list[Dataset]: ... class FileSystemDataset(Dataset): + """ + A Dataset of file fragments. + + A FileSystemDataset is composed of one or more FileFragment. + + Parameters + ---------- + fragments : list[Fragments] + List of fragments to consume. + schema : Schema + The top-level schema of the Dataset. + format : FileFormat + File format of the fragments, currently only ParquetFileFormat, + IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + FileSystem of the fragments. + root_partition : Expression, optional + The top-level partition of the DataDataset. + """ + def __init__( self, fragments: list[Fragment], @@ -153,15 +753,44 @@ class FileSystemDataset(Dataset): filesystem: SupportedFileSystem | None = None, partitions: list[Expression] | None = None, root_partition: Expression | None = None, - ) -> FileSystemDataset: ... + ) -> FileSystemDataset: + """ + A Dataset created from a list of paths on a particular filesystem. + + Parameters + ---------- + paths : list of str + List of file paths to create the fragments from. + schema : Schema + The top-level schema of the DataDataset. + format : FileFormat + File format to create fragments from, currently only + ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + The filesystem which files are from. + partitions : list[Expression], optional + Attach additional partition information for the file paths. + root_partition : Expression, optional + The top-level partition of the DataDataset. + """ @property def filesystem(self) -> FileSystem: ... @property - def partitioning(self) -> Partitioning | None: ... + def partitioning(self) -> Partitioning | None: + """ + The partitioning of the Dataset source, if discovered. + + If the FileSystemDataset is created using the ``dataset()`` factory + function with a partitioning specified, this will return the + finalized Partitioning object from the dataset discovery. In all + other cases, this returns None. + """ @property - def files(self) -> list[str]: ... + def files(self) -> list[str]: + """List of the files""" @property - def format(self) -> FileFormat: ... 
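A short sketch of the Dataset.join and Dataset.join_asof signatures documented above, using small in-memory datasets. The column names and values are made up, and join_asof additionally requires both sides to be sorted by the "on" key.

import pyarrow as pa
import pyarrow.dataset as ds

left = ds.dataset(pa.table({"id": [1, 2, 3], "x": ["a", "b", "c"]}))
right = ds.dataset(pa.table({"id": [2, 3, 4], "y": [20.0, 30.0, 40.0]}))

# Equality join on "id"; colliding column names could be disambiguated
# with left_suffix / right_suffix.
joined = left.join(right, keys="id", join_type="left outer").to_table()

# As-of join: with a negative tolerance each left row is matched against
# the nearest right row that is not later than it (a past-as-of-join).
ticks = ds.dataset(pa.table({"key": ["A", "A"], "t": [1, 5], "price": [10.0, 11.0]}))
quotes = ds.dataset(pa.table({"key": ["A", "A"], "t": [0, 4], "bid": [9.5, 10.5]}))
nearest = ticks.join_asof(quotes, on="t", by="key", tolerance=-2).to_table()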
+ def format(self) -> FileFormat: + """The FileFormat of this source.""" class FileWriteOptions(lib._Weakrefable): @property @@ -170,7 +799,23 @@ class FileWriteOptions(lib._Weakrefable): class FileFormat(lib._Weakrefable): def inspect( self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None - ) -> lib.Schema: ... + ) -> lib.Schema: + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ def make_fragment( self, file: StrPath | IO, @@ -178,7 +823,29 @@ class FileFormat(lib._Weakrefable): partition_expression: Expression | None = None, *, file_size: int | None = None, - ) -> Fragment: ... + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ def make_write_options(self) -> FileWriteOptions: ... @property def default_extname(self) -> str: ... @@ -188,10 +855,16 @@ class FileFormat(lib._Weakrefable): def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... class Fragment(lib._Weakrefable): + """Fragment of data from a Dataset.""" @property - def physical_schema(self) -> lib.Schema: ... + def physical_schema(self) -> lib.Schema: + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" @property - def partition_expression(self) -> Expression: ... + def partition_expression(self) -> Expression: + """An Expression which evaluates to true for all data viewed by this + Fragment. + """ def scanner( self, schema: lib.Schema | None = None, @@ -204,7 +877,73 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... + ) -> Scanner: + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + """ def to_batches( self, columns: list[str] | None = None, @@ -216,7 +955,67 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: ... + ) -> Iterator[lib.RecordBatch]: + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. 
Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + record_batches : iterator of RecordBatch + """ def to_table( self, columns: list[str] | None = None, @@ -228,7 +1027,70 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. 
If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ def take( self, indices: Indices, @@ -241,7 +1103,67 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. 
+ use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ def head( self, num_rows: int, @@ -254,7 +1176,67 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: ... + ) -> lib.Table: + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ def count_rows( self, columns: list[str] | None = None, @@ -266,20 +1248,81 @@ class Fragment(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> int: ... + ) -> int: + """ + Count rows matching the scanner filter. 
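The Fragment scan methods in this hunk (scanner, to_batches, to_table, take, head, count_rows) mirror the Dataset API but operate on a single file or buffer. A rough sketch, assuming a hive-partitioned Parquet dataset with a "value" column; paths and names are hypothetical.

import pyarrow.compute as pc
import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet", partitioning="hive")

for fragment in dataset.get_fragments():
    # Each fragment carries its partition expression and physical schema.
    print(fragment.partition_expression, fragment.physical_schema)

    preview = fragment.head(5)                               # first rows only
    positive = fragment.count_rows(filter=pc.field("value") > 0)
    batches = list(fragment.to_batches(columns=["value"]))   # materialized batches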
+ + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ class FileFragment(Fragment): - def open(self) -> lib.NativeFile: ... + """A Fragment representing a data file.""" + + def open(self) -> lib.NativeFile: + """ + Open a NativeFile of the buffer or file viewed by this fragment. + """ @property - def path(self) -> str: ... + def path(self) -> str: + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ @property - def filesystem(self) -> FileSystem: ... + def filesystem(self) -> FileSystem: + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ @property - def buffer(self) -> lib.Buffer: ... + def buffer(self) -> lib.Buffer: + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. + """ @property - def format(self) -> FileFormat: ... + def format(self) -> FileFormat: + """ + The format of the data file viewed by this fragment. + """ class FragmentScanOptions(lib._Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + @property def type_name(self) -> str: ... @@ -298,6 +1341,20 @@ class IpcFileFormat(FileFormat): class FeatherFileFormat(IpcFileFormat): ... class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ def __init__( self, parse_options: _csv.ParseOptions | None = None, @@ -313,6 +1370,17 @@ class CsvFileFormat(FileFormat): def equals(self, other: CsvFileFormat) -> bool: ... class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. 
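CsvFileFormat and CsvFragmentScanOptions, documented here, let the same Dataset machinery run over delimited text files. A minimal sketch with hypothetical paths and options:

import pyarrow.csv
import pyarrow.dataset as ds

csv_format = ds.CsvFileFormat(parse_options=pyarrow.csv.ParseOptions(delimiter=";"))
dataset = ds.dataset("csv_data/", format=csv_format)

# Conversion/read behaviour can also be overridden per scan.
scan_options = ds.CsvFragmentScanOptions(
    convert_options=pyarrow.csv.ConvertOptions(strings_can_be_null=True)
)
table = dataset.to_table(fragment_scan_options=scan_options)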
+ + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + convert_options: _csv.ConvertOptions read_options: _csv.ReadOptions @@ -325,6 +1393,18 @@ class CsvFileWriteOptions(FileWriteOptions): write_options: _csv.WriteOptions class JsonFileFormat(FileFormat): + """ + FileFormat for JSON files. + + Parameters + ---------- + default_fragment_scan_options : JsonFragmentScanOptions + Default options for fragments scan. + parse_options : pyarrow.json.ParseOptions + Options regarding json parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ def __init__( self, default_fragment_scan_options: JsonFragmentScanOptions | None = None, @@ -334,6 +1414,17 @@ class JsonFileFormat(FileFormat): def equals(self, other: JsonFileFormat) -> bool: ... class JsonFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for JSON fragments. + + Parameters + ---------- + parse_options : pyarrow.json.ParseOptions + Options regarding JSON parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + parse_options: _json.ParseOptions read_options: _json.ReadOptions def __init__( @@ -342,10 +1433,46 @@ class JsonFragmentScanOptions(FragmentScanOptions): def equals(self, other: JsonFragmentScanOptions) -> bool: ... class Partitioning(lib._Weakrefable): - def parse(self, path: str) -> Expression: ... - def format(self, expr: Expression) -> tuple[str, str]: ... + def parse(self, path: str) -> Expression: + """ + Parse a path into a partition expression. + + Parameters + ---------- + path : str + + Returns + ------- + pyarrow.dataset.Expression + """ + def format(self, expr: Expression) -> tuple[str, str]: + """ + Convert a filter expression into a tuple of (directory, filename) using + the current partitioning scheme + + Parameters + ---------- + expr : pyarrow.dataset.Expression + + Returns + ------- + tuple[str, str] + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> import pyarrow.compute as pc + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) + >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) + ('1862/Jan', '') + """ @property - def schema(self) -> lib.Schema: ... + def schema(self) -> lib.Schema: + """The arrow Schema attached to the partitioning.""" class PartitioningFactory(lib._Weakrefable): @property @@ -353,9 +1480,52 @@ class PartitioningFactory(lib._Weakrefable): class KeyValuePartitioning(Partitioning): @property - def dictionaries(self) -> list[lib.Array | None]: ... + def dictionaries(self) -> list[lib.Array | None]: + """ + The unique values for each partition field, if available. + + Those values are only available if the Partitioning object was + created through dataset discovery from a PartitioningFactory, or + if the dictionaries were manually specified in the constructor. + If no dictionary field is available, this returns an empty list. + """ class DirectoryPartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The DirectoryPartitioning expects one segment in the file path for each + field in the schema (all fields are required to be present). + For example given schema the path "/2009/11" would + be parsed to ("year"_ == 2009 and "month"_ == 11). 
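JsonFileFormat and JsonFragmentScanOptions, documented a little earlier in this hunk, follow the same pattern for newline-delimited JSON files (their availability assumes a reasonably recent pyarrow). A sketch with hypothetical paths:

import pyarrow.json
import pyarrow.dataset as ds

json_format = ds.JsonFileFormat(read_options=pyarrow.json.ReadOptions(block_size=1 << 20))
dataset = ds.dataset("events/", format=json_format)

# Parse behaviour can also be supplied per scan.
table = dataset.to_table(
    fragment_scan_options=ds.JsonFragmentScanOptions(
        parse_options=pyarrow.json.ParseOptions(newlines_in_values=True)
    )
)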
+ + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + DirectoryPartitioning + + Examples + -------- + >>> from pyarrow.dataset import DirectoryPartitioning + >>> partitioning = DirectoryPartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("/2009/11/")) + ((year == 2009) and (month == 11)) + """ + @staticmethod def discover( field_names: list[str] | None = None, @@ -363,7 +1533,38 @@ class DirectoryPartitioning(KeyValuePartitioning): max_partition_dictionary_size: int = 0, schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: ... + ) -> PartitioningFactory: + """ + Discover a DirectoryPartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ def __init__( self, schema: lib.Schema, @@ -372,6 +1573,46 @@ class DirectoryPartitioning(KeyValuePartitioning): ) -> None: ... class HivePartitioning(KeyValuePartitioning): + """ + A Partitioning for "/$key=$value/" nested directories as found in + Apache Hive. + + Multi-level, directory based partitioning scheme originating from + Apache Hive with all data files stored in the leaf directories. Data is + partitioned by static values of a particular column in the schema. + Partition keys are represented in the form $key=$value in directory names. + Field order is ignored, as are missing or unrecognized field names. + + For example, given schema, a possible + path would be "/year=2009/month=11/day=15". + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. 
+ dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + If any field is None then this fallback will be used as a label + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + HivePartitioning + + Examples + -------- + >>> from pyarrow.dataset import HivePartitioning + >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/year=2009/month=11/")) + ((year == 2009) and (month == 11)) + + """ def __init__( self, schema: lib.Schema, @@ -386,9 +1627,76 @@ class HivePartitioning(KeyValuePartitioning): null_fallback="__HIVE_DEFAULT_PARTITION__", schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: ... + ) -> PartitioningFactory: + """ + Discover a HivePartitioning. + + Parameters + ---------- + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain. This can be more efficient when + materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ class FilenamePartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The FilenamePartitioning expects one segment in the file name for each + field in the schema (all fields are required to be present) separated + by '_'. For example given schema the name + ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). 
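In practice the partitioning classes above are rarely constructed by hand; they are usually built through pyarrow.dataset.partitioning() or discovered from the file paths. A hedged sketch, assuming hypothetical hive-style directories such as base_dir/year=2024/month=1/:

import pyarrow as pa
import pyarrow.dataset as ds

# Explicit hive partitioning from a schema ...
part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]), flavor="hive")
dataset = ds.dataset("base_dir/", format="parquet", partitioning=part)

# ... or discovery through a PartitioningFactory, here inferring
# dictionary-encoded partition columns.
factory = ds.HivePartitioning.discover(infer_dictionary=True)
discovered = ds.dataset("base_dir/", format="parquet", partitioning=factory)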
+ + Returns + ------- + FilenamePartitioning + + Examples + -------- + >>> from pyarrow.dataset import FilenamePartitioning + >>> partitioning = FilenamePartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + def __init__( self, schema: lib.Schema, @@ -401,15 +1709,94 @@ class FilenamePartitioning(KeyValuePartitioning): infer_dictionary: bool = False, schema: lib.Schema | None = None, segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: ... + ) -> PartitioningFactory: + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ class DatasetFactory(lib._Weakrefable): + """ + DatasetFactory is used to create a Dataset, inspect the Schema + of the fragments contained in it, and declare a partitioning. + """ + root_partition: Expression - def finish(self, schema: lib.Schema | None = None) -> Dataset: ... - def inspect(self) -> lib.Schema: ... + def finish(self, schema: lib.Schema | None = None) -> Dataset: + """ + Create a Dataset using the inspected schema or an explicit schema + (if given). + + Parameters + ---------- + schema : Schema, default None + The schema to conform the source to. If None, the inspected + schema is used. + + Returns + ------- + Dataset + """ + def inspect(self) -> lib.Schema: + """ + Inspect all data fragments and return a common Schema. + + Returns + ------- + Schema + """ def inspect_schemas(self) -> list[lib.Schema]: ... class FileSystemFactoryOptions(lib._Weakrefable): + """ + Influences the discovery of filesystem paths. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning/PartitioningFactory, optional + Apply the Partitioning to every discovered Fragment. See Partitioning or + PartitioningFactory documentation. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). 
+ selector_ignore_prefixes : list, optional + When discovering from a Selector (and not from an explicit file list), + ignore files and directories matching any of these prefixes. + By default this is ['.', '_']. + """ + partitioning: Partitioning partitioning_factory: PartitioningFactory partition_base_dir: str @@ -425,6 +1812,21 @@ class FileSystemFactoryOptions(lib._Weakrefable): ) -> None: ... class FileSystemDatasetFactory(DatasetFactory): + """ + Create a DatasetFactory from a list of paths with schema inspection. + + Parameters + ---------- + filesystem : pyarrow.fs.FileSystem + Filesystem to discover. + paths_or_selector : pyarrow.fs.FileSelector or list of path-likes + Either a Selector object or a list of path-like objects. + format : FileFormat + Currently only ParquetFileFormat and IpcFileFormat are supported. + options : FileSystemFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + def __init__( self, filesystem: SupportedFileSystem, @@ -434,23 +1836,49 @@ class FileSystemDatasetFactory(DatasetFactory): ) -> None: ... class UnionDatasetFactory(DatasetFactory): + """ + Provides a way to inspect/discover a Dataset's expected schema before + materialization. + + Parameters + ---------- + factories : list of DatasetFactory + """ def __init__(self, factories: list[DatasetFactory]) -> None: ... _RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + """An iterator over a sequence of record batches.""" def __iter__(self) -> Self: ... def __next__(self) -> _RecordBatchT: ... class TaggedRecordBatch(NamedTuple): + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : RecordBatch + The record batch. + fragment : Fragment + Fragment of the record batch. + """ + record_batch: lib.RecordBatch fragment: Fragment class TaggedRecordBatchIterator(lib._Weakrefable): + """An iterator over a sequence of record batches with fragments.""" def __iter__(self) -> Self: ... def __next__(self) -> TaggedRecordBatch: ... class Scanner(lib._Weakrefable): + """A materialized scan operation with context and options bound. + + A scanner is the class that glues the scan tasks, data fragments and data + sources together. + """ @staticmethod def from_dataset( dataset: Dataset, @@ -464,7 +1892,63 @@ class Scanner(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... + ) -> Scanner: + """ + Create Scanner from Dataset, + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. 
Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ @staticmethod def from_fragment( fragment: Fragment, @@ -479,7 +1963,65 @@ class Scanner(lib._Weakrefable): use_threads: bool = True, cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... + ) -> Scanner: + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. 
This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ @overload @staticmethod def from_batches( @@ -511,21 +2053,196 @@ class Scanner(lib._Weakrefable): cache_metadata: bool = True, memory_pool: lib.MemoryPool | None = None, ) -> Scanner: ... + @staticmethod + def from_batches(*args, **kwargs): + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. + schema : Schema + The schema of the batches (required when passing a Python + iterator). + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. 
Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ @property - def dataset_schema(self) -> lib.Schema: ... + def dataset_schema(self) -> lib.Schema: + """The schema with which batches will be read from fragments.""" @property - def projected_schema(self) -> lib.Schema: ... - def to_batches(self) -> Iterator[lib.RecordBatch]: ... - def scan_batches(self) -> TaggedRecordBatchIterator: ... - def to_table(self) -> lib.Table: ... - def take(self, indices: Indices) -> lib.Table: ... - def head(self, num_rows: int) -> lib.Table: ... - def count_rows(self) -> int: ... - def to_reader(self) -> RecordBatchReader: ... + def projected_schema(self) -> lib.Schema: + """ + The materialized schema of the data, accounting for projections. -def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... + This is the schema of any data returned from the scanner. + """ + def to_batches(self) -> Iterator[lib.RecordBatch]: + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def scan_batches(self) -> TaggedRecordBatchIterator: + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + def to_table(self) -> lib.Table: + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Returns + ------- + Table + """ + def take(self, indices: Indices) -> lib.Table: + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. + + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + + Returns + ------- + Table + """ + def head(self, num_rows: int) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + + Returns + ------- + Table + """ + def count_rows(self) -> int: + """ + Count rows matching the scanner filter. + + Returns + ------- + count : int + """ + def to_reader(self) -> RecordBatchReader: + """Consume this scanner as a RecordBatchReader. + + Returns + ------- + RecordBatchReader + """ + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: + """ + Extract partition keys (equality constraints between a field and a scalar) + from an expression as a dict mapping the field's name to its value. + + NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning + will be conjunctions of equality conditions and are accessible through this + function. Other subexpressions will be ignored. 
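A compact sketch tying the Scanner pieces above together: a projection given as a dict of expressions, a pushed-down filter, and both eager and streaming consumption. Paths and column names are hypothetical.

import pyarrow.compute as pc
import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet")

scanner = ds.Scanner.from_dataset(
    dataset,
    columns={"value": pc.field("value"), "value_x2": pc.multiply(pc.field("value"), 2)},
    filter=pc.field("year") >= 2020,
    batch_size=64 * 1024,
)
print(scanner.projected_schema)  # value, value_x2
table = scanner.to_table()       # eager materialization

# Alternatively, stream the same scan as record batches; a scanner built
# from a one-shot source (Scanner.from_batches) can only be consumed once.
reader = ds.Scanner.from_dataset(dataset, filter=pc.field("year") >= 2020).to_reader()
for batch in reader:
    pass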
+ + Parameters + ---------- + partition_expression : pyarrow.dataset.Expression + + Returns + ------- + dict + + Examples + -------- + + For example, an expression of + + is converted to {'part': 'A', 'year': 2016} + """ class WrittenFile(lib._Weakrefable): + """ + Metadata information about files written as + part of a dataset write operation + + Parameters + ---------- + path : str + Path to the file. + metadata : pyarrow.parquet.FileMetaData, optional + For Parquet files, the Parquet file metadata. + size : int + The size of the file in bytes. + """ def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... def _filesystemdataset_write( @@ -549,6 +2266,34 @@ class _ScanNodeOptions(ExecNodeOptions): def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... class ScanNodeOptions(_ScanNodeOptions): + """ + A Source node which yields batches from a Dataset scan. + + This is the option class for the "scan" node factory. + + This node is capable of applying pushdown projections or filters + to the file readers which reduce the amount of data that needs to + be read (if supported by the file format). But note that this does not + construct associated filter or project nodes to perform the final + filtering or projection. Rather, you may supply the same filter + expression or projection to the scan node that you also supply + to the filter or project node. + + Yielded batches will be augmented with fragment/batch indices when + implicit_ordering=True to enable stable ordering for simple ExecPlans. + + Parameters + ---------- + dataset : pyarrow.dataset.Dataset + The table which acts as the data source. + **kwargs : dict, optional + Scan options. See `Scanner.from_dataset` for possible arguments. + require_sequenced_output : bool, default False + Batches are yielded sequentially, like single-threaded + implicit_ordering : bool, default False + Preserve implicit ordering of data. + """ + def __init__( self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs ) -> None: ... diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index 2814fa8ed6f..2e9edda57f7 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -22,6 +22,18 @@ from .lib import CacheOptions, Schema, _Weakrefable parquet_encryption_enabled: bool class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option + """ def __init__( self, read_options: ParquetReadOptions, @@ -42,13 +54,50 @@ class ParquetFileFormat(FileFormat): row_groups: Iterable[int] | None = None, *, file_size: int | None = None, - ) -> Fragment: ... + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. 
+ row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ class _NameStats(TypedDict): min: Any max: Any class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. + """ + id: int metadata: FileMetaData schema: Schema @@ -62,24 +111,87 @@ class RowGroupInfo: def statistics(self) -> dict[str, _NameStats]: ... class ParquetFileFragment(FileFragment): + """A Fragment representing a parquet file.""" + def ensure_complete_metadata(self) -> None: ... @property def row_groups(self) -> list[RowGroupInfo]: ... @property def metadata(self) -> FileMetaData: ... @property - def num_row_groups(self) -> int: ... + def num_row_groups(self) -> int: + """ + Return the number of row groups viewed by this fragment (not the + number of row groups in the origin file). + """ def split_by_row_group( self, filter: Expression | None = None, schema: Schema | None = None - ) -> list[Fragment]: ... + ) -> list[Fragment]: + """ + Split the fragment into multiple fragments. + + Yield a Fragment wrapping each row group in this ParquetFileFragment. + Row groups will be excluded whose metadata contradicts the optional + filter. + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + + Returns + ------- + A list of Fragments + """ def subset( self, filter: Expression | None = None, schema: Schema | None = None, row_group_ids: list[int] | None = None, - ) -> ParquetFileFormat: ... + ) -> ParquetFileFormat: + """ + Create a subset of the fragment (viewing a subset of the row groups). + + Subset can be specified by either a filter predicate (with optional + schema) or by a list of row group IDs. Note that when using a filter, + the resulting fragment can be empty (viewing no row groups). + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + row_group_ids : list of ints + The row group IDs to include in the subset. Can only be specified + if `filter` is None. + + Returns + ------- + ParquetFileFragment + """ class ParquetReadOptions(_Weakrefable): + """ + Parquet format specific options for reading. + + Parameters + ---------- + dictionary_columns : list of string, default None + Names of columns which should be dictionary encoded as + they are read + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds + """ def __init__( self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None ) -> None: ... 
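A sketch of the row-group level Parquet APIs documented above (make_fragment with row_groups, split_by_row_group, subset). The file path and the filter column are hypothetical.

import pyarrow.compute as pc
import pyarrow.dataset as ds
from pyarrow import fs

parquet_format = ds.ParquetFileFormat()
fragment = parquet_format.make_fragment(
    "data/part-0.parquet", filesystem=fs.LocalFileSystem(), row_groups=[0, 1]
)
print(fragment.num_row_groups)

# One fragment per row group, pruned with Parquet statistics ...
per_row_group = fragment.split_by_row_group(filter=pc.field("value") > 0)
# ... or an explicit subset of row groups.
subset = fragment.subset(row_group_ids=[0])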
@@ -97,6 +209,46 @@ class ParquetFileWriteOptions(FileWriteOptions): @dataclass(kw_only=True) class ParquetFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for Parquet fragments. + + Parameters + ---------- + use_buffered_stream : bool, default False + Read files through buffered input streams rather than loading entire + row groups at once. This may be enabled to reduce memory overhead. + Disabled by default. + buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + """ + use_buffered_stream: bool = False buffer_size: int = 8192 pre_buffer: bool = True @@ -111,11 +263,48 @@ class ParquetFragmentScanOptions(FragmentScanOptions): @dataclass class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + partition_base_dir: str | None = None partitioning: Partitioning | PartitioningFactory | None = None validate_column_chunk_paths: bool = False class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. 
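# Illustrative sketch (not part of the stub): routing Parquet scan options
# through the file format; the dataset path is hypothetical.
import pyarrow.dataset as ds

scan_options = ds.ParquetFragmentScanOptions(
    pre_buffer=True,            # coalesce reads, useful on high-latency filesystems
    use_buffered_stream=False,
)
parquet_format = ds.ParquetFileFormat(default_fragment_scan_options=scan_options)
dataset = ds.dataset("example_dataset", format=parquet_format)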
+ + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. + filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ def __init__( self, metadata_path: str, diff --git a/pyarrow-stubs/_dataset_parquet_encryption.pyi b/pyarrow-stubs/_dataset_parquet_encryption.pyi index 2072333daf1..7623275b865 100644 --- a/pyarrow-stubs/_dataset_parquet_encryption.pyi +++ b/pyarrow-stubs/_dataset_parquet_encryption.pyi @@ -4,6 +4,32 @@ from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConn from .lib import _Weakrefable class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ def __init__( self, crypto_factory: CryptoFactory, @@ -12,6 +38,32 @@ class ParquetEncryptionConfig(_Weakrefable): ) -> None: ... class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. + decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. 
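# Illustrative sketch (not part of the stub): the `_metadata` workflow that
# ParquetDatasetFactory backs, via ds.parquet_dataset(); paths are hypothetical.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

table = pa.table({"value": [1, 2, 3]})
collected = []
pq.write_to_dataset(table, "metadata_dataset", metadata_collector=collected)
pq.write_metadata(table.schema, "metadata_dataset/_metadata",
                  metadata_collector=collected)

dataset = ds.parquet_dataset("metadata_dataset/_metadata")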
+ """ def __init__( self, crypto_factory: CryptoFactory, diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi index 74b561ca3db..4450c42df49 100644 --- a/pyarrow-stubs/_flight.pyi +++ b/pyarrow-stubs/_flight.pyi @@ -30,61 +30,176 @@ from .lib import ( _T = TypeVar("_T") class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + def __init__( self, timeout: float | None = None, write_options: IpcWriteOptions | None = None, headers: list[tuple[str, str]] | None = None, read_options: IpcReadOptions | None = None, - ) -> None: ... + ) -> None: + """Create call options. + + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ class CertKeyPair(NamedTuple): + """A TLS certificate and key for use in Flight.""" + cert: str key: str class FlightError(Exception): + """ + The base class for Flight-specific errors. + + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. + """ + extra_info: bytes -class FlightInternalError(FlightError, ArrowException): ... -class FlightTimedOutError(FlightError, ArrowException): ... -class FlightCancelledError(FlightError, ArrowCancelled): ... -class FlightServerError(FlightError, ArrowException): ... -class FlightUnauthenticatedError(FlightError, ArrowException): ... -class FlightUnauthorizedError(FlightError, ArrowException): ... -class FlightUnavailableError(FlightError, ArrowException): ... +class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + +class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + +class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + +class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + +class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + +class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + +class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + limit: int actual: int class Action(_Weakrefable): - def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: ... + """An action executable on a Flight service.""" + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: + """Create an action from a type and a buffer. + + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ @property - def type(self) -> str: ... 
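# Illustrative sketch (not part of the stub): per-call RPC options; the header
# name and value are made up.
import pyarrow.flight as flight

options = flight.FlightCallOptions(
    timeout=10.0,
    headers=[("x-request-id", "1234")],
)
# `options` can then be passed to client calls, e.g. client.list_flights(options=options).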
+ def type(self) -> str: + """The action type.""" @property - def body(self) -> Buffer: ... - def serialize(self) -> bytes: ... + def body(self) -> Buffer: + """The action body (arguments for the action).""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ class ActionType(NamedTuple): + """A type of action that is executable on a Flight service.""" + type: str description: str - def make_action(self, buf: Buffer | bytes) -> Action: ... + def make_action(self, buf: Buffer | bytes) -> Action: + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ class Result(_Weakrefable): - def __init__(self, buf: Buffer | bytes) -> None: ... + """A result from executing an Action.""" + def __init__(self, buf: Buffer | bytes) -> None: + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ @property - def body(self) -> Buffer: ... - def serialize(self) -> bytes: ... + def body(self) -> Buffer: + """Get the Buffer containing the result.""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ class BasicAuth(_Weakrefable): + """A container for basic auth.""" def __init__( self, username: str | bytes | None = None, password: str | bytes | None = None - ) -> None: ... + ) -> None: + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ @property def username(self) -> bytes: ... @property @@ -94,11 +209,30 @@ class BasicAuth(_Weakrefable): def deserialize(serialized: str | bytes) -> BasicAuth: ... class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. + + """ + UNKNOWN = 0 PATH = 1 CMD = 2 class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + INVALID = 0 HANDSHAKE = 1 LIST_FLIGHTS = 2 @@ -111,21 +245,29 @@ class FlightMethod(enum.Enum): DO_EXCHANGE = 9 class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" @staticmethod - def for_path(*path: str | bytes) -> FlightDescriptor: ... + def for_path(*path: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for a resource path.""" + @staticmethod - def for_command(command: str | bytes) -> FlightDescriptor: ... + def for_command(command: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for an opaque command.""" @property - def descriptor_type(self) -> DescriptorType: ... 
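# Illustrative sketch (not part of the stub): the small value types used in
# Flight calls; the action name and command bytes are made up.
import pyarrow.flight as flight

action = flight.Action("refresh", b"")
command_descriptor = flight.FlightDescriptor.for_command(b"SELECT 1")
path_descriptor = flight.FlightDescriptor.for_path("datasets", "example.parquet")

# These types round-trip through their wire format:
assert flight.Action.deserialize(action.serialize()).type == "refresh"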
+ def descriptor_type(self) -> DescriptorType: + """Get the type of this descriptor.""" @property - def path(self) -> list[bytes] | None: ... + def path(self) -> list[bytes] | None: + """Get the path for this descriptor.""" @property - def command(self) -> bytes | None: ... + def command(self) -> bytes | None: + """Get the command for this descriptor.""" def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" def __init__(self, ticket: str | bytes) -> None: ... @property def ticket(self) -> bytes: ... @@ -134,46 +276,90 @@ class Ticket(_Weakrefable): def deserialize(cls, serialized: bytes) -> Self: ... class Location(_Weakrefable): + """The location of a Flight service.""" def __init__(self, uri: str | bytes) -> None: ... @property def uri(self) -> bytes: ... def equals(self, other: Location) -> bool: ... @staticmethod - def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... + def for_grpc_tcp(host: str | bytes, port: int) -> Location: + """Create a Location for a TCP-based gRPC service.""" @staticmethod - def for_grpc_tls(host: str | bytes, port: int) -> Location: ... + def for_grpc_tls(host: str | bytes, port: int) -> Location: + """Create a Location for a TLS-based gRPC service.""" @staticmethod - def for_grpc_unix(path: str | bytes) -> Location: ... + def for_grpc_unix(path: str | bytes) -> Location: + """Create a Location for a domain socket-based gRPC service.""" class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" def __init__( self, ticket: Ticket | str | bytes, locations: list[str | Location], expiration_time: TimestampScalar | None = ..., app_metadata: bytes | str = ..., - ): ... + ): + """Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + expiration_time : TimestampScalar, default None + Expiration time of this stream. If present, clients may assume + they can retry DoGet requests. Otherwise, clients should avoid + retrying DoGet requests. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ @property - def ticket(self) -> Ticket: ... + def ticket(self) -> Ticket: + """Get the ticket in this endpoint.""" @property - def locations(self) -> list[Location]: ... + def locations(self) -> list[Location]: + """Get locations where this flight is available.""" def serialize(self) -> bytes: ... @property - def expiration_time(self) -> TimestampScalar | None: ... + def expiration_time(self) -> TimestampScalar | None: + """Get the expiration time of this stream. + + If present, clients may assume they can retry DoGet requests. + Otherwise, clients should avoid retrying DoGet requests. + + """ @property - def app_metadata(self) -> bytes | str: ... + def app_metadata(self) -> bytes | str: + """Get application-defined opaque metadata.""" @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class SchemaResult(_Weakrefable): - def __init__(self, schema: Schema) -> None: ... + """The serialized schema returned from a GetSchema request.""" + def __init__(self, schema: Schema) -> None: + """Create a SchemaResult from a schema. 
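# Illustrative sketch (not part of the stub): describing where a stream can be
# fetched; the host, port and ticket bytes are hypothetical.
import pyarrow.flight as flight

ticket = flight.Ticket(b"ticket-bytes")
location = flight.Location.for_grpc_tcp("localhost", 8815)
endpoint = flight.FlightEndpoint(ticket, [location])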
+ + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ @property - def schema(self) -> Schema: ... + def schema(self) -> Schema: + """The schema of the data in this flight.""" def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" def __init__( self, schema: Schema, @@ -183,26 +369,62 @@ class FlightInfo(_Weakrefable): total_bytes: int = ..., ordered: bool = ..., app_metadata: bytes | str = ..., - ) -> None: ... + ) -> None: + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int, default None + the total records in this flight, -1 or None if unknown. + total_bytes : int, default None + the total bytes in this flight, -1 or None if unknown. + ordered : boolean, default False + Whether endpoints are in the same order as the data. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + """ @property - def schema(self) -> Schema: ... + def schema(self) -> Schema: + """The schema of the data in this flight.""" @property - def descriptor(self) -> FlightDescriptor: ... + def descriptor(self) -> FlightDescriptor: + """The descriptor of the data in this flight.""" @property - def endpoints(self) -> list[FlightEndpoint]: ... + def endpoints(self) -> list[FlightEndpoint]: + """The endpoints where this flight is available.""" @property - def total_records(self) -> int: ... + def total_records(self) -> int: + """The total record count of this flight, or -1 if unknown.""" @property - def total_bytes(self) -> int: ... + def total_bytes(self) -> int: + """The size in bytes of the data in this flight, or -1 if unknown.""" @property - def ordered(self) -> bool: ... + def ordered(self) -> bool: + """Whether endpoints are in the same order as the data.""" @property - def app_metadata(self) -> bytes | str: ... + def app_metadata(self) -> bytes | str: + """ + Application-defined opaque metadata. + + There is no inherent or required relationship between this and the + app_metadata fields in the FlightEndpoints or resulting FlightData + messages. Since this metadata is application-defined, a given + application could define there to be a relationship, but there is + none required by the spec. + + """ def serialize(self) -> bytes: ... @classmethod def deserialize(cls, serialized: bytes) -> Self: ... class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" @property def data(self) -> RecordBatch | None: ... @property @@ -210,44 +432,136 @@ class FlightStreamChunk(_Weakrefable): def __iter__(self): ... class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + def __iter__(self) -> Self: ... def __next__(self) -> FlightStreamChunk: ... @property - def schema(self) -> Schema: ... - def read_all(self) -> Table: ... - def read_chunk(self) -> FlightStreamChunk: ... - def to_reader(self) -> RecordBatchReader: ... 
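# Illustrative sketch (not part of the stub): assembling a FlightInfo, e.g.
# inside a get_flight_info() handler; the names and sizes are made up.
import pyarrow as pa
import pyarrow.flight as flight

schema = pa.schema([("value", pa.int64())])
descriptor = flight.FlightDescriptor.for_path("example")
endpoint = flight.FlightEndpoint(flight.Ticket(b"example"),
                                 [flight.Location.for_grpc_tcp("localhost", 8815)])
info = flight.FlightInfo(schema, descriptor, [endpoint],
                         total_records=-1, total_bytes=-1)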
+ def schema(self) -> Schema: + """Get the schema for this reader.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + def read_chunk(self) -> FlightStreamChunk: + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. -class MetadataRecordBatchReader(_MetadataRecordBatchReader): ... + Raises + ------ + StopIteration + when the stream is finished + """ + def to_reader(self) -> RecordBatchReader: + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ class FlightStreamReader(MetadataRecordBatchReader): - def cancel(self) -> None: ... - def read_all(self) -> Table: ... + """A reader that can also be canceled.""" + def cancel(self) -> None: + """Cancel the read operation.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" class MetadataRecordBatchWriter(_CRecordBatchWriter): - def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... - def write_metadata(self, buf: Buffer) -> None: ... - def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] - def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: ... - def close(self) -> None: ... - def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: ... + """A RecordBatchWriter that also allows writing application metadata. + + This class is a context manager; on exit, close() will be called. + """ + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: + """Prepare to write data to this stream with the given schema.""" + def write_metadata(self, buf: Buffer) -> None: + """Write Flight metadata by itself.""" + def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. + """ class FlightStreamWriter(MetadataRecordBatchWriter): - def done_writing(self) -> None: ... + """A writer that also allows closing the write side of a stream.""" + def done_writing(self) -> None: + """Indicate that the client is done writing, but not done reading.""" class FlightMetadataReader(_Weakrefable): - def read(self) -> Buffer | None: ... 
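# Illustrative sketch (not part of the stub): consuming a DoGet stream on the
# client side; the server URI and ticket are hypothetical.
import pyarrow.flight as flight

client = flight.connect("grpc://localhost:8815")
reader = client.do_get(flight.Ticket(b"example"))
table = reader.read_all()
# Or consume it chunk by chunk, with any per-batch application metadata:
# for chunk in reader:
#     handle(chunk.data, chunk.app_metadata)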
+ """A reader for Flight metadata messages sent during a DoPut.""" + def read(self) -> Buffer | None: + """Read the next metadata message.""" class FlightMetadataWriter(_Weakrefable): - def write(self, message: Buffer) -> None: ... + """A sender for Flight metadata messages during a DoPut.""" + def write(self, message: Buffer) -> None: + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ class AsyncioCall(Generic[_T]): + """State for an async RPC using asyncio.""" + _future: asyncio.Future[_T] def as_awaitable(self) -> asyncio.Future[_T]: ... def wakeup(self, result_or_exception: BaseException | _T) -> None: ... class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + def __init__(self, client: FlightClient) -> None: ... async def get_flight_info( self, @@ -257,6 +571,40 @@ class AsyncioFlightClient: ): ... class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ def __init__( self, location: str | tuple[str, int] | Location, @@ -273,7 +621,14 @@ class FlightClient(_Weakrefable): @property def supports_async(self) -> bool: ... def as_async(self) -> AsyncioFlightClient: ... - def wait_for_available(self, timeout: int = 5) -> None: ... + def wait_for_available(self, timeout: int = 5) -> None: + """Block until the server can be contacted. + + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ @deprecated( "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." ) @@ -286,112 +641,440 @@ class FlightClient(_Weakrefable): private_key: str | None = None, override_hostname: str | None = None, disable_server_verification: bool = False, - ) -> FlightClient: ... + ) -> FlightClient: + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ def authenticate( self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None - ) -> None: ... + ) -> None: + """Authenticate to the server. 
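# Illustrative sketch (not part of the stub): creating a client and waiting for
# the server to become reachable; the URI is hypothetical.
import pyarrow.flight as flight

client = flight.FlightClient("grpc://localhost:8815")
client.wait_for_available(timeout=5)
print([action.type for action in client.list_actions()])
client.close()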
+ + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ def authenticate_basic_token( self, username: str, password: str, options: FlightCallOptions | None = None - ) -> tuple[str, str]: ... - def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: ... + ) -> tuple[str, str]: + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. + """ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: + """List the actions available on a service.""" def do_action( self, action: Action, options: FlightCallOptions | None = None - ) -> Iterator[Result]: ... + ) -> Iterator[Result]: + """ + Execute an action on a service. + + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ def list_flights( self, criteria: str | None = None, options: FlightCallOptions | None = None - ) -> Generator[FlightInfo, None, None]: ... + ) -> Generator[FlightInfo, None, None]: + """List the flights available on a service.""" def get_flight_info( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> FlightInfo: ... + ) -> FlightInfo: + """Request information about an available flight.""" def get_schema( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> Schema: ... + ) -> Schema: + """Request schema for an available flight.""" def do_get( self, ticket: Ticket, options: FlightCallOptions | None = None - ) -> FlightStreamReader: ... + ) -> FlightStreamReader: + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ def do_put( self, descriptor: FlightDescriptor, schema: Schema, options: FlightCallOptions | None = None, - ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Upload data to a flight. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ def do_exchange( self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... - def close(self) -> None: ... + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + def close(self) -> None: + """Close the client and disconnect.""" def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_value, traceback) -> None: ... -class FlightDataStream(_Weakrefable): ... +class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. + + See Also + -------- + RecordBatchStream + GeneratorStream + """ class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. 
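# Illustrative sketch (not part of the stub): uploading a table with DoPut; the
# descriptor path and URI are made up.
import pyarrow as pa
import pyarrow.flight as flight

table = pa.table({"value": [1, 2, 3]})
client = flight.connect("grpc://localhost:8815")
descriptor = flight.FlightDescriptor.for_path("uploads", "example")

writer, metadata_reader = client.do_put(descriptor, table.schema)
writer.write_table(table)
writer.close()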
+ + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ def __init__( self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None - ) -> None: ... + ) -> None: + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. + """ class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" def __init__( self, schema: Schema, generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], options: IpcWriteOptions | None = None, - ) -> None: ... + ) -> None: + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. + + options : pyarrow.ipc.IpcWriteOptions, optional + """ class ServerCallContext(_Weakrefable): - def peer_identity(self) -> bytes: ... - def peer(self) -> str: ... - def is_cancelled(self) -> bool: ... - def add_header(self, key: str, value: str) -> None: ... - def add_trailer(self, key: str, value: str) -> None: ... - def get_middleware(self, key: str) -> ServerMiddleware | None: ... + """Per-call state/context.""" + def peer_identity(self) -> bytes: + """Get the identity of the authenticated peer. + + May be the empty string. + """ + def peer(self) -> str: + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: + """Check if the current RPC call has been canceled by the client.""" + def add_header(self, key: str, value: str) -> None: + """Add a response header.""" + def add_trailer(self, key: str, value: str) -> None: + """Add a response trailer.""" + def get_middleware(self, key: str) -> ServerMiddleware | None: + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. + """ class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" def read(self) -> str: ... class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" def write(self, message: str) -> None: ... class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" def read(self) -> str: ... class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" def write(self, message: str) -> None: ... class ServerAuthHandler(_Weakrefable): - def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... - def is_valid(self, token: str) -> bool: ... + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. 
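# Illustrative sketch (not part of the stub): a DoGet handler returning a
# RecordBatchStream; the data is made up.
import pyarrow as pa
import pyarrow.flight as flight

class ExampleServer(flight.FlightServerBase):
    def do_get(self, context, ticket):
        table = pa.table({"value": [1, 2, 3]})
        return flight.RecordBatchStream(table)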
+ """ + def is_valid(self, token: str) -> bool: + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. + + """ class ClientAuthHandler(_Weakrefable): - def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... - def get_token(self) -> str: ... + """Authentication plugin for a client.""" + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + def get_token(self) -> str: + """Get the auth token for a call.""" class CallInfo(NamedTuple): + """Information about a particular RPC for Flight middleware.""" + method: FlightMethod class ClientMiddlewareFactory(_Weakrefable): - def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + def start_call(self, info: CallInfo) -> ClientMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. + + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ class ClientMiddleware(_Weakrefable): - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... - def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... - def call_completed(self, exception: ArrowException): ... + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. 
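# Illustrative sketch (not part of the stub): client middleware that attaches a
# bearer token header to every call; the header value is made up.
import pyarrow.flight as flight

class BearerTokenMiddleware(flight.ClientMiddleware):
    def __init__(self, token):
        self.token = token

    def sending_headers(self):
        return {"authorization": [f"Bearer {self.token}"]}

class BearerTokenFactory(flight.ClientMiddlewareFactory):
    def __init__(self, token):
        self.token = token

    def start_call(self, info):
        return BearerTokenMiddleware(self.token)

client = flight.FlightClient("grpc://localhost:8815",
                             middleware=[BearerTokenFactory("secret-token")])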
+ + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + def start_call( self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> ServerMiddleware | None: ... + ) -> ServerMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ -class TracingServerMiddlewareFactory(ServerMiddlewareFactory): ... +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ class ServerMiddleware(_Weakrefable): - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... - def call_completed(self, exception: ArrowException): ... + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ class TracingServerMiddleware(ServerMiddleware): trace_context: dict def __init__(self, trace_context: dict) -> None: ... class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... def start_call( # type: ignore[override] self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] @@ -403,9 +1086,47 @@ class _ServerMiddlewareWrapper(ServerMiddleware): def call_completed(self, exception: ArrowException) -> None: ... class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. 
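# Illustrative sketch (not part of the stub): server middleware that rejects
# calls missing a header; the header name is made up.
import pyarrow.flight as flight

class RequireApiKeyFactory(flight.ServerMiddlewareFactory):
    def start_call(self, info, headers):
        if "x-api-key" not in headers:
            raise flight.FlightUnauthenticatedError("missing x-api-key header")
        return None  # no per-call middleware state needed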
If the server is still active at interpreter + exit, the process may segfault. + """ + def finalize(self) -> None: ... class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. + root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). + + """ def __init__( self, location: str | tuple[str, int] | Location | None = None, @@ -416,33 +1137,197 @@ class FlightServerBase(_Weakrefable): middleware: dict[str, ServerMiddlewareFactory] | None = None, ): ... @property - def port(self) -> int: ... - def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: ... + def port(self) -> int: + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ def get_flight_info( self, context: ServerCallContext, descriptor: FlightDescriptor - ) -> FlightInfo: ... - def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: ... + ) -> FlightInfo: + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. 
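# Illustrative sketch (not part of the stub): overriding the discovery methods;
# the dataset contents and ticket are made up.
import pyarrow as pa
import pyarrow.flight as flight

class CatalogServer(flight.FlightServerBase):
    _table = pa.table({"value": [1, 2, 3]})

    def _make_info(self, descriptor):
        endpoint = flight.FlightEndpoint(flight.Ticket(b"example"), [])
        return flight.FlightInfo(self._table.schema, descriptor, [endpoint],
                                 total_records=self._table.num_rows, total_bytes=-1)

    def list_flights(self, context, criteria):
        yield self._make_info(flight.FlightDescriptor.for_path("example"))

    def get_flight_info(self, context, descriptor):
        return self._make_info(descriptor)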
+ descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ def do_put( self, context: ServerCallContext, descriptor: FlightDescriptor, reader: MetadataRecordBatchReader, writer: FlightMetadataWriter, - ) -> None: ... - def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: ... + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. + + """ + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ def do_exchange( self, context: ServerCallContext, descriptor: FlightDescriptor, reader: MetadataRecordBatchReader, writer: MetadataRecordBatchWriter, - ) -> None: ... - def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ... - def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: ... - def serve(self) -> None: ... - def run(self) -> None: ... - def shutdown(self) -> None: ... - def wait(self) -> None: ... + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + def serve(self) -> None: + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + def run(self) -> None: + """Block until the server shuts down. + + .. 
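# Illustrative sketch (not part of the stub): a custom action handler; the
# "ping" action is made up.
import pyarrow.flight as flight

class ActionServer(flight.FlightServerBase):
    def list_actions(self, context):
        return [("ping", "Return a pong result.")]

    def do_action(self, context, action):
        if action.type == "ping":
            return [flight.Result(b"pong")]
        raise NotImplementedError(action.type)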
deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + def shutdown(self) -> None: + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + def wait(self) -> None: + """Block until server is terminated with shutdown.""" def __enter__(self) -> Self: ... def __exit__(self, exc_type, exc_value, traceback): ... @@ -457,4 +1342,39 @@ def connect( write_size_limit_bytes: int | None = None, disable_server_verification: bool = False, generic_options: list[tuple[str, int | str]] | None = None, -) -> FlightClient: ... +) -> FlightClient: + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. + generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 67d7a601123..7670ef5230d 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -28,6 +28,71 @@ class FileType(enum.IntFlag): Directory = enum.auto() class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + "/pyarrow-fs-example.dat" + >>> with local.open_output_stream(path_fs) as stream: + ... 
stream.write(b"data") + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + def __init__( self, path: str, @@ -38,70 +103,697 @@ class FileInfo(_Weakrefable): size: int | None = None, ): ... @property - def type(self) -> FileType: ... + def type(self) -> FileType: + """ + Type of the file. + + The returned enum values can be the following: + + - FileType.NotFound: target does not exist + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character device, or + Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ @property def is_file(self) -> bool: ... @property - def path(self) -> str: ... + def path(self) -> str: + """ + The full file path in the filesystem. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ @property - def base_name(self) -> str: ... + def base_name(self) -> str: + """ + The file base name. + + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ @property - def size(self) -> int: ... + def size(self) -> int: + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ @property - def extension(self) -> str: ... + def extension(self) -> str: + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ @property - def mtime(self) -> dt.datetime | None: ... + def mtime(self) -> dt.datetime | None: + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ @property - def mtime_ns(self) -> int | None: ... + def mtime_ns(self) -> int | None: + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. + + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. 
+ recursive : bool, default False + Whether to recurse into subdirectories. + + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector( + ... local_path + "/missing", recursive=True, allow_not_found=True + ... ) + >>> local.get_file_info(selector_not_found) + [] + """ + base_dir: str allow_not_found: bool recursive: bool def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + @classmethod - def from_uri(cls, uri: str) -> tuple[Self, str]: ... - def equals(self, other: FileSystem) -> bool: ... + def from_uri(cls, uri: str) -> tuple[Self, str]: + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. + + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + def equals(self, other: FileSystem) -> bool: + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ @property - def type_name(self) -> str: ... + def type_name(self) -> str: + """ + The filesystem's type name. + """ @overload def get_file_info(self, paths_or_selector: str) -> FileInfo: ... @overload def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... - def create_dir(self, path: str, *, recursive: bool = True) -> None: ... - def delete_dir(self, path: str) -> None: ... + def get_file_info(self, paths_or_selector): + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. 
+ + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + def create_dir(self, path: str, *, recursive: bool = True) -> None: + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. + """ + def delete_dir(self, path: str) -> None: + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ def delete_dir_contents( self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False - ) -> None: ... - def move(self, src: str, dest: str) -> None: ... - def copy_file(self, src: str, dest: str) -> None: ... - def delete_file(self, path: str) -> None: ... - def open_input_file(self, path: str) -> NativeFile: ... + ) -> None: + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + def move(self, src: str, dest: str) -> None: + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. + + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir("/tmp/other_dir") + >>> local.copy_file(path, "/tmp/move_example.dat") + + Move the file: + + >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") + + Inspect the file info: + + >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") + + >>> local.get_file_info("/tmp/move_example.dat") + + + Delete the folder: + >>> local.delete_dir("/tmp/other_dir") + """ + def copy_file(self, src: str, dest: str) -> None: + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") + + Inspect the file info: + + >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") + + >>> local.get_file_info(path) + + """ + def delete_file(self, path: str) -> None: + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + def open_input_file(self, path: str) -> NativeFile: + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... 
print(f.readall()) + b'data' + """ def open_input_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None - ) -> NativeFile: ... + ) -> NativeFile: + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ def open_output_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None, metadata: dict[str, str] | None = None, - ) -> NativeFile: ... + ) -> NativeFile: + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b"data") + 4 + """ def open_append_stream( self, path: str, compression: str | None = "detect", buffer_size: int | None = None, metadata: dict[str, str] | None = None, - ): ... - def normalize_path(self, path: str) -> str: ... + ): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. 
+ Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b"+newly added") + 12 + + Print out the content to the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + def normalize_path(self, path: str) -> str: + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. + + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: + ... print(stream.readall()) + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: + ... f.write(b"+newly added") + 12 + + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: + ... 
print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.delete_dir("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.delete_dir_contents("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.delete_file("/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.get_file_info("/tmp/new_folder") + + + Move the file: + + >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") + + >>> local.get_file_info("/tmp/local_fs-copy.dat") + + + To finish delete the file left: + >>> local.delete_file("/tmp/local_fs.dat") + """ + def __init__(self, *, use_mmap: bool = False) -> None: ... class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. + + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir("/tmp/sub_tree") + >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) + + Write data into the existing file: + + >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: + ... f.write(b"+newly added") + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info("") + + >>> subtree.get_file_info("sub_tree_fs.dat") + + + Delete the file and directory: + + >>> subtree.delete_file("sub_tree_fs.dat") + >>> local.delete_dir("/tmp/sub_tree") + >>> local.delete_file("/tmp/local_fs.dat") + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ def __init__(self, base_path: str, base_fs: FileSystem): ... @property def base_path(self) -> str: ... @@ -112,38 +804,202 @@ class _MockFileSystem(FileSystem): def __init__(self, current_time: dt.datetime | None = None) -> None: ... class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. 
+ + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info("README.md") # doctest: +SKIP + + """ def __init__(self, handler: FileSystemHandler) -> None: ... @property - def handler(self) -> FileSystemHandler: ... + def handler(self) -> FileSystemHandler: + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. + """ @abstractmethod - def get_type_name(self) -> str: ... + def get_type_name(self) -> str: + """ + Implement PyFileSystem.type_name. + """ @abstractmethod - def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ @abstractmethod - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + @abstractmethod - def create_dir(self, path: str, recursive: bool) -> None: ... + def create_dir(self, path: str, recursive: bool) -> None: + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ @abstractmethod - def delete_dir(self, path: str) -> None: ... + def delete_dir(self, path: str) -> None: + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. + """ @abstractmethod - def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ @abstractmethod - def delete_root_dir_contents(self) -> None: ... + def delete_root_dir_contents(self) -> None: + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ @abstractmethod - def delete_file(self, path: str) -> None: ... + def delete_file(self, path: str) -> None: + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ @abstractmethod - def move(self, src: str, dest: str) -> None: ... + def move(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + @abstractmethod - def copy_file(self, src: str, dest: str) -> None: ... 
+ def copy_file(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ @abstractmethod - def open_input_stream(self, path: str) -> NativeFile: ... + def open_input_stream(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ @abstractmethod - def open_input_file(self, path: str) -> NativeFile: ... + def open_input_file(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ @abstractmethod - def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + @abstractmethod - def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ @abstractmethod - def normalize_path(self, path: str) -> str: ... + def normalize_path(self, path: str) -> str: + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. + """ diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi index f94370c51c1..4fc7ea68e48 100644 --- a/pyarrow-stubs/_gcsfs.pyi +++ b/pyarrow-stubs/_gcsfs.pyi @@ -4,6 +4,59 @@ from ._fs import FileSystem from .lib import KeyValueMetadata class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. 
This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + def __init__( self, *, @@ -19,6 +72,12 @@ class GcsFileSystem(FileSystem): project_id: str | None = None, ): ... @property - def default_bucket_location(self) -> str: ... + def default_bucket_location(self) -> str: + """ + The GCP location this filesystem will write to. + """ @property - def project_id(self) -> str: ... + def project_id(self) -> str: + """ + The GCP project id this filesystem will use. + """ diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi index af8dc559501..200f669379b 100644 --- a/pyarrow-stubs/_hdfs.pyi +++ b/pyarrow-stubs/_hdfs.pyi @@ -3,6 +3,41 @@ from _typeshed import StrPath from ._fs import FileSystem class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. + kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. + extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem( + ... host, port, user=user, kerb_ticket=ticket_cache_path + ... ) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ def __init__( self, host: str, @@ -16,4 +51,25 @@ class HadoopFileSystem(FileSystem): extra_conf: dict | None = None, ): ... @staticmethod - def from_uri(uri: str) -> HadoopFileSystem: ... # type: ignore[override] + def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] + """ + Instantiate HadoopFileSystem object from an URI string. 
+ + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. + + Returns + ------- + HadoopFileSystem + """ diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi index ce5b3a103dc..43d2ae83cd8 100644 --- a/pyarrow-stubs/_json.pyi +++ b/pyarrow-stubs/_json.pyi @@ -5,34 +5,165 @@ from _typeshed import StrPath from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + use_threads: bool + """ + Whether to use multiple threads to accelerate reading. + """ block_size: int + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... - def equals(self, other: ReadOptions) -> bool: ... + def equals(self, other: ReadOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + explicit_schema: Schema + """ + Optional explicit schema (no type inference, ignores other fields) + """ newlines_in_values: bool + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. + """ unexpected_field_behavior: Literal["ignore", "error", "infer"] + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. + """ def __init__( self, explicit_schema: Schema | None = None, newlines_in_values: bool | None = None, unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", ): ... - def equals(self, other: ParseOptions) -> bool: ... 
+ def equals(self, other: ParseOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + +class JSONStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a JSON file. -class JSONStreamingReader(RecordBatchReader): ... + Should not be instantiated directly by user code. + """ def read_json( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, -) -> Table: ... +) -> Table: + """ + Read a Table from a stream of JSON data. + + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. + """ + def open_json( input_file: StrPath | IO[Any], read_options: ReadOptions | None = None, parse_options: ParseOptions | None = None, memory_pool: MemoryPool | None = None, -) -> JSONStreamingReader: ... +) -> JSONStreamingReader: + """ + Open a streaming reader of JSON data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of JSON data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see pyarrow.json.ReadOptions constructor + for defaults) + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see pyarrow.json.ParseOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.json.JSONStreamingReader` + """ diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi index 3eb0aec9a2d..8d8fc35b134 100644 --- a/pyarrow-stubs/compute.pyi +++ b/pyarrow-stubs/compute.pyi @@ -94,8 +94,68 @@ from . import lib _P = ParamSpec("_P") _R = TypeVar("_R") -def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: ... -def scalar(value: bool | float | str) -> Expression: ... +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: + """Reference a column of the dataset. + + Stores only the field's name. Type and other information is known only when + the expression is bound to a dataset having an explicit scheme. + + Nested references are allowed by passing multiple names or a tuple of + names. For example ``('foo', 'bar')`` references the field named "bar" + inside the field named "foo". + + Parameters + ---------- + *name_or_index : string, multiple strings, tuple or int + The name or index of the (possibly nested) field the expression + references to. + + Returns + ------- + field_expr : Expression + Reference to the given field + + Examples + -------- + >>> import pyarrow.compute as pc + >>> pc.field("a") + + >>> pc.field(1) + + >>> pc.field(("a", "b")) + >> pc.field("a", "b") + Expression: + """Expression representing a scalar value. 
+ + Creates an Expression object representing a scalar value that can be used + in compute expressions and predicates. + + Parameters + ---------- + value : bool, int, float or string + Python value of the scalar. This function accepts any value that can be + converted to a ``pyarrow.Scalar`` using ``pa.scalar()``. + + Notes + ----- + This function differs from ``pyarrow.scalar()`` in the following way: + + * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents + a single value in Arrow's memory model. + * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing + a scalar value that can be used in compute expressions, predicates, and + dataset filtering operations. + + Returns + ------- + scalar_expr : Expression + An Expression representing the scalar value + """ + def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... # ============= compute functions ============= @@ -197,9 +257,53 @@ def all( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... +) -> lib.BooleanScalar: + """ + Test whether all elements in a boolean array evaluate to true. + + Null values are ignored by default. + If the `skip_nulls` option is set to false, then Kleene logic is used. + See "kleene_and" for more details on Kleene logic. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ any = _clone_signature(all) +""" +Test whether any element in a boolean array evaluates to true. + +Null values are ignored by default. +If the `skip_nulls` option is set to false, then Kleene logic is used. +See "kleene_or" for more details on Kleene logic. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" def approximate_median( array: NumericScalar | NumericArray, @@ -209,7 +313,29 @@ def approximate_median( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: ... +) -> lib.DoubleScalar: + """ + Approximate median of a numeric array with T-Digest algorithm. + + Nulls and NaNs are ignored. + A null scalar is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. 
If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def count( array: lib.Array | lib.ChunkedArray, /, @@ -217,7 +343,26 @@ def count( *, options: CountOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... +) -> lib.Int64Scalar: + """ + Count the number of null / non-null values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def count_distinct( array: lib.Array | lib.ChunkedArray, /, @@ -225,7 +370,26 @@ def count_distinct( *, options: CountOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... +) -> lib.Int64Scalar: + """ + Count the number of unique values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def first( array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], /, @@ -234,7 +398,30 @@ def first( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: ... +) -> _ScalarT: + """ + Compute the first value in each group. + + Null values are ignored by default. + If skip_nulls = false, then this will return the first and last values + regardless if it is null + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def first_last( array: lib.Array[Any] | lib.ChunkedArray[Any], /, @@ -243,7 +430,30 @@ def first_last( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: ... +) -> lib.StructScalar: + """ + Compute the first and last values of an array. + + Null values are ignored by default. + If skip_nulls = false, then this will return the first and last values + regardless if it is null + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. 
+ If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def index( data: lib.Array[Any] | lib.ChunkedArray[Any], value, @@ -251,12 +461,139 @@ def index( end: int | None = None, *, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... +) -> lib.Int64Scalar: + """ + Find the index of the first occurrence of a given value. + + Parameters + ---------- + data : Array-like + value : Scalar-like object + The value to search for. + start : int, optional + end : int, optional + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + index : int + the index, or -1 if not found + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"]) + >>> pc.index(arr, "ipsum") + + >>> pc.index(arr, "ipsum", start=2) + + >>> pc.index(arr, "amet") + + """ last = _clone_signature(first) +""" +Compute the first and last values of an array. + +Null values are ignored by default. +If skip_nulls = false, then this will return the first and last values +regardless if it is null + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True +In [15]: print(pc.last.__doc__) +Compute the first value in each group. + +Null values are ignored by default. +If skip_nulls = false, then this will return the first and last values +regardless if it is null + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" max = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" min = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. 
+skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" min_max = _clone_signature(first_last) +""" +Compute the minimum and maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def mean( @@ -292,6 +629,34 @@ def mean( options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.Decimal256Scalar: ... +def mean(*args, **kwargs): + """ + Compute the mean of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + The result is a double for integer and floating point arguments, + and a decimal with the same bit-width/precision/scale for decimal arguments. + For integers and floats, NaN is returned if min_count = 0 and + there are no values. For decimals, null is returned instead. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def mode( array: NumericScalar | NumericArray, /, @@ -301,16 +666,79 @@ def mode( min_count: int = 0, options: ModeOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... +) -> lib.StructArray: + """ + Compute the modal (most common) values of a numeric array. + + Compute the n most common values and their respective occurrence counts. + The output has type `struct`, where T is the + input type. + The results are ordered by descending `count` first, and ascending `mode` + when breaking ties. + Nulls are ignored. If there are no non-null values in the array, + an empty array is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. 
+ min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ModeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ + def product( array: _ScalarT | lib.NumericArray[_ScalarT], /, *, - skip_nulls=True, - min_count=1, - options=None, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: ... +) -> _ScalarT: + """ + Compute the product of values in a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def quantile( array: NumericScalar | NumericArray, /, @@ -321,7 +749,44 @@ def quantile( min_count: int = 0, options: QuantileOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: ... +) -> lib.DoubleArray: + """ + Compute an array of quantiles of a numeric array or chunked array. + + By default, 0.5 quantile (median) is returned. + If quantile lies between two data points, an interpolated value is + returned based on selected interpolation method. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.QuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
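+
+    Examples
+    --------
+    A minimal sketch; the input values are illustrative only and the
+    printed output is omitted:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array([1, 2, 3, 4])
+    >>> pc.quantile(arr, q=[0.25, 0.75])  # doctest: +SKIP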
+ """ + def stddev( array: NumericScalar | NumericArray, /, @@ -331,7 +796,33 @@ def stddev( min_count: int = 0, options: VarianceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: ... +) -> lib.DoubleScalar: + """ + Calculate the standard deviation of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population standard deviation is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def sum( array: _NumericScalarT | NumericArray[_NumericScalarT], /, @@ -340,7 +831,30 @@ def sum( min_count: int = 1, options: ScalarAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... +) -> _NumericScalarT: + """ + Compute the sum of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def tdigest( array: NumericScalar | NumericArray, /, @@ -352,7 +866,38 @@ def tdigest( min_count: int = 0, options: TDigestOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: ... +) -> lib.DoubleArray: + """ + Approximate quantiles of a numeric array with T-Digest algorithm. + + By default, 0.5 quantile (median) is returned. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.TDigestOptions, optional + Alternative way of passing options. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + def variance( array: NumericScalar | NumericArray, /, @@ -362,7 +907,120 @@ def variance( min_count: int = 0, options: VarianceOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: ... +) -> lib.DoubleScalar: + """ + Calculate the variance of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population variance is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ # ========================= 2. 
Element-wise (“scalar”) functions ========================= @@ -377,8 +1035,36 @@ def abs( ) -> _NumericOrDurationArrayT: ... @overload def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def abs(*args, **kwargs): + """ + Calculate the absolute value of the argument element-wise. + + Results will wrap around on integer overflow. + Use function "abs_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ abs_checked = _clone_signature(abs) +""" +Calculate the absolute value of the argument element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "abs". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def add( @@ -424,8 +1110,41 @@ def add( def add( x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def add(*args, **kwargs): + """ + Add the arguments element-wise. + + Results will wrap around on integer overflow. + Use function "add_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ add_checked = _clone_signature(add) +""" +Add the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "add". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +""" @overload def divide( @@ -483,8 +1202,42 @@ def divide( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def divide(*args, **kwargs): + """ + Divide the arguments element-wise. + + Integer division by zero returns an error. However, integer overflow + wraps around, and floating-point division by zero returns an infinite. + Use function "divide_checked" if you want to get an error + in all the aforementioned cases. + + Parameters + ---------- + dividend : Array-like or scalar-like + Argument to compute function. + divisor : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ divide_checked = _clone_signature(divide) +""" +Divide the arguments element-wise. + +An error is returned when trying to divide by zero, or when +integer overflow is encountered. + +Parameters +---------- +dividend : Array-like or scalar-like + Argument to compute function. +divisor : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def exp( @@ -507,9 +1260,53 @@ def exp( ) -> lib.DoubleScalar: ... @overload def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
+def exp(*args, **kwargs): + """ + Compute Euler's number raised to the power of specified exponent, element-wise. + + If exponent is null the result will be null. + + Parameters + ---------- + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ multiply = _clone_signature(add) +""" +Multiply the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "multiply_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" multiply_checked = _clone_signature(add) +""" +Multiply the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "multiply". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def negate( @@ -521,8 +1318,36 @@ def negate( ) -> _NumericOrDurationArrayT: ... @overload def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def negate(*args, **kwargs): + """ + Negate the argument element-wise. + + Results will wrap around on integer overflow. + Use function "negate_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ negate_checked = _clone_signature(negate) +""" +Negate the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "negate". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def power( @@ -580,8 +1405,39 @@ def power( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def power(*args, **kwargs): + """ + Raise arguments to power element-wise. + + Integer to negative integer power returns an error. However, integer overflow + wraps around. If either base or exponent is null the result will be null. + + Parameters + ---------- + base : Array-like or scalar-like + Argument to compute function. + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ power_checked = _clone_signature(power) +""" +Raise arguments to power element-wise. + +An error is returned when integer to negative integer power is encountered, +or integer overflow is encountered. + +Parameters +---------- +base : Array-like or scalar-like + Argument to compute function. +exponent : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" @overload def sign( @@ -597,17 +1453,93 @@ def sign( ) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... @overload def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sign(*args, **kwargs): + """ + Get the signedness of the arguments element-wise. + + Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + @overload def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... @overload def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... @overload def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sqrt(*args, **kwargs): + """ + Takes the square root of arguments element-wise. + + A negative argument returns a NaN. For a variant that returns an + error, use function "sqrt_checked". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ sqrt_checked = _clone_signature(sqrt) +""" +Takes the square root of arguments element-wise. + +A negative argument returns an error. For a variant that returns a +NaN, use function "sqrt". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" subtract = _clone_signature(add) +""" +Subtract the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "subtract_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" subtract_checked = _clone_signature(add) +""" +Subtract the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "subtract". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.1 Bit-wise functions ========================= @overload @@ -654,6 +1586,22 @@ def bit_wise_and( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def bit_wise_and(*args, **kwargs): + """ + Bit-wise AND the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def bit_wise_not( x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None @@ -664,13 +1612,128 @@ def bit_wise_not( ) -> _NumericArrayT: ... 
@overload def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def bit_wise_not(*args, **kwargs): + """ + Bit-wise negate the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ bit_wise_or = _clone_signature(bit_wise_and) +""" +Bit-wise OR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" bit_wise_xor = _clone_signature(bit_wise_and) +""" +Bit-wise XOR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" shift_left = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +`x` is returned if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_left_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" shift_left_checked = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_left" for a variant that doesn't fail for an invalid shift amount. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" shift_right = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +`x` is returned if `y` (the amount to shift by) is: (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_right_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" shift_right_checked = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. 
+An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_right" for a variant that doesn't fail for an invalid shift amount + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.2 Rounding functions ========================= @overload @@ -679,8 +1742,33 @@ def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... @overload def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ceil(*args, **kwargs): + """ + Round up to the nearest integer. + + Compute the smallest integer value not less in magnitude than `x`. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ floor = _clone_signature(ceil) +""" +Round down to the nearest integer. + +Compute the largest integer value not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def round( @@ -745,6 +1833,31 @@ def round( options: RoundOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def round(*args, **kwargs): + """ + Round to a given precision. + + Options are used to control the number of digits and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def round_to_multiple( x: _NumericScalarT, @@ -808,6 +1921,32 @@ def round_to_multiple( options: RoundToMultipleOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def round_to_multiple(*args, **kwargs): + """ + Round to a given multiple. + + Options are used to control the rounding multiple and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". 
+ options : pyarrow.compute.RoundToMultipleOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def round_binary( x: _NumericScalarT, @@ -892,8 +2031,43 @@ def round_binary( options: RoundBinaryOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def round_binary(*args, **kwargs): + """ + Round to the given precision. + + Options are used to control the rounding mode. + Default behavior is to use the half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + s : Array-like or scalar-like + Argument to compute function. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundBinaryOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ trunc = _clone_signature(ceil) +""" +Compute the integral part. + +Compute the nearest integer not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.3 Logarithmic functions ========================= @overload @@ -906,14 +2080,125 @@ def ln( ) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... @overload def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ln(*args, **kwargs): + """ + Compute natural logarithm. + + Non-positive values return -inf or NaN. Null values return null. + Use function "ln_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ln_checked = _clone_signature(ln) +""" +Compute natural logarithm. + +Non-positive values raise an error. Null values return null. +Use function "ln" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" log10 = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log10_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" log10_checked = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log10" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" log1p = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p_checked" if you want invalid values to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" log1p_checked = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p" if you want invalid values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" log2 = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log2_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" log2_checked = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log2" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def logb( @@ -943,21 +2228,195 @@ def logb( def logb( x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression | Any: ... +def logb(*args, **kwargs): + """ + Compute base `b` logarithm. + + Values <= 0 return -inf or NaN. Null values return null. + Use function "logb_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + b : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ logb_checked = _clone_signature(logb) +""" +Compute base `b` logarithm. + +Values <= 0 return -inf or NaN. Null values return null. +Use function "logb" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +b : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.4 Trigonometric functions ========================= acos = _clone_signature(ln) +""" +Compute the inverse cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "acos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" acos_checked = _clone_signature(ln) +""" +Compute the inverse cosine. + +Invalid input values raise an error; +to return NaN instead, see "acos". 
+ +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" asin = _clone_signature(ln) +""" +Compute the inverse sine. + +NaN is returned for invalid input values; +to raise an error instead, see "asin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" asin_checked = _clone_signature(ln) +""" +Compute the inverse sine. + +Invalid input values raise an error; +to return NaN instead, see "asin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" atan = _clone_signature(ln) +""" +Compute the inverse tangent of x. + +The return value is in the range [-pi/2, pi/2]; +for a full return range [-pi, pi], see "atan2". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" cos = _clone_signature(ln) +""" +Compute the cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "cos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" cos_checked = _clone_signature(ln) +""" +Compute the cosine. + +Infinite values raise an error; +to return NaN instead, see "cos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" sin = _clone_signature(ln) +""" +Compute the sine. + +NaN is returned for invalid input values; +to raise an error instead, see "sin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" sin_checked = _clone_signature(ln) +""" +Compute the sine. + +Invalid input values raise an error; +to return NaN instead, see "sin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" tan = _clone_signature(ln) +""" +Compute the tangent. + +NaN is returned for invalid input values; +to raise an error instead, see "tan_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" tan_checked = _clone_signature(ln) +""" +Compute the tangent. + +Infinite values raise an error; +to return NaN instead, see "tan". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def atan2( @@ -991,6 +2450,21 @@ def atan2( def atan2( y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... 
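As a hedged, editorial illustration of the trigonometric kernels in this section: the plain variants return NaN for invalid input while the "_checked" variants raise. The sketch assumes a standard pyarrow installation, and the outputs in comments are approximate.

import math
import pyarrow as pa
import pyarrow.compute as pc

angles = pa.array([0.0, math.pi / 2, None])
pc.sin(angles)            # ~[0.0, 1.0, null]
pc.cos(angles)            # ~[1.0, 6.1e-17, null]
pc.acos(pa.array([2.0]))  # [nan]  (invalid input yields NaN)
# pc.acos_checked(pa.array([2.0]))  # raises pyarrow.lib.ArrowInvalid instead
pc.atan2(pa.array([1.0]), pa.array([1.0]))  # [0.785398...]  (pi / 4)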
+def atan2(*args, **kwargs): + """ + Compute the inverse tangent of y/x. + + The return value is in the range [-pi, pi]. + + Parameters + ---------- + y : Array-like or scalar-like + Argument to compute function. + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.5 Comparisons functions ========================= @overload @@ -1045,12 +2519,97 @@ def equal( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def equal(*args, **kwargs): + """ + Compare values for equality (x == y). + + A null on either side emits a null comparison result. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ greater = _clone_signature(equal) +""" +Compare values for ordered inequality (x > y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" greater_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x >= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" less = _clone_signature(equal) +""" +Compare values for ordered inequality (x < y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" less_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x <= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" not_equal = _clone_signature(equal) +""" +Compare values for inequality (x != y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def max_element_wise( @@ -1066,8 +2625,45 @@ def max_element_wise( options: ElementWiseAggregateOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def max_element_wise(*args, **kwargs): + """ + Find the element-wise maximum value. + + Nulls are ignored (by default) or propagated. + NaN is preferred over null, but not over any valid value. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. 
+ skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ min_element_wise = _clone_signature(max_element_wise) +""" +Find the element-wise minimum value. + +Nulls are ignored (by default) or propagated. +NaN is preferred over null, but not over any valid value. + +Parameters +---------- +*args : Array-like or scalar-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.6 Logical functions ========================= @overload @@ -1130,13 +2726,146 @@ def and_( *, memory_pool: lib.MemoryPool | None = None, ) -> ScalarOrArray[lib.BooleanScalar]: ... +def and_(*args, **kwargs): + """ + Logical 'and' boolean values. + + When a null is encountered in either input, a null is output. + For a different null behavior, see function "and_kleene". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ and_kleene = _clone_signature(and_) +""" +Logical 'and' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and null = null +- null and true = null +- false and null = false +- null and false = false +- null and null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and' false is always false. +For a different null behavior, see function "and". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" and_not = _clone_signature(and_) +""" +Logical 'and not' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "and_not_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" and_not_kleene = _clone_signature(and_) +""" +Logical 'and not' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and not null = null +- null and not false = null +- false and not null = false +- null and not true = false +- null and not null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and not' true is always false, as is false +'and not' an unknown value. +For a different null behavior, see function "and_not". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" or_ = _clone_signature(and_) +""" +Logical 'or' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" or_kleene = _clone_signature(and_) +""" +Logical 'or' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true or null = true +- null or true = true +- false or null = null +- null or false = null +- null or null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'or' true is always true. +For a different null behavior, see function "or". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" xor = _clone_signature(and_) +""" +Logical 'xor' boolean values. + +When a null is encountered in either input, a null is output. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def invert( @@ -1156,6 +2885,17 @@ def invert( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def invert(*args, **kwargs): + """ + Invert boolean values. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.10 String predicates ========================= @overload @@ -1170,25 +2910,277 @@ def ascii_is_alnum( def ascii_is_alnum( strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def ascii_is_alnum(*args, **kwargs): + """ + Classify strings as ASCII alphanumeric. + + For each string in `strings`, emit true iff the string is non-empty + and consists only of alphanumeric ASCII characters. Null strings emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ascii_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_alnum = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphanumeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphanumeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_digit = _clone_signature(ascii_is_alnum) +""" +Classify strings as digits. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of Unicode digits. Null strings emit null. 
+ +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_numeric = _clone_signature(ascii_is_alnum) +""" +Classify strings as numeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of numeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" string_is_ascii = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII. + +For each string in `strings`, emit true iff the string consists only +of ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.11 String transforms ========================= @overload @@ -1203,12 +3195,105 @@ def ascii_capitalize( def ascii_capitalize( strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def ascii_capitalize(*args, **kwargs): + """ + Capitalize the first character of ASCII input. + + For each string in `strings`, return a capitalized version. + + This function assumes the input is fully ASCII. If it may contain + non-ASCII characters, use "utf8_capitalize" instead. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ascii_lower = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to lowercase. + +For each string in `strings`, return a lowercase version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_lower" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_reverse = _clone_signature(ascii_capitalize) +""" +Reverse ASCII input. + +For each ASCII string in `strings`, return a reversed version. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_reverse" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_swapcase = _clone_signature(ascii_capitalize) +""" +Transform ASCII input by inverting casing. + +For each string in `strings`, return a string with opposite casing. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_swapcase" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_title = _clone_signature(ascii_capitalize) +""" +Titlecase each word of ASCII input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +This function assumes the input is fully ASCII. If it may contain +non-ASCII characters, use "utf8_title" instead. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_upper = _clone_signature(ascii_capitalize) +""" +Transform ASCII input to uppercase. + +For each string in `strings`, return an uppercase version. + +This function assumes the input is fully ASCII. It it may contain +non-ASCII characters, use "utf8_upper" instead. 
+ +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def binary_length( @@ -1248,6 +3333,21 @@ def binary_length( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def binary_length(*args, **kwargs): + """ + Compute string lengths. + + For each string in `strings`, emit its length of bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def binary_repeat( strings: _StringOrBinaryScalarT, @@ -1280,6 +3380,22 @@ def binary_repeat( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def binary_repeat(*args, **kwargs): + """ + Repeat a binary string. + + For each binary string in `strings`, return a replicated version. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + num_repeats : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def binary_replace_slice( strings: _StringOrBinaryScalarT, @@ -1313,6 +3429,31 @@ def binary_replace_slice( options: ReplaceSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def binary_replace_slice(*args, **kwargs): + """ + Replace a slice of a binary string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def binary_reverse( strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None @@ -1325,6 +3466,22 @@ def binary_reverse( def binary_reverse( strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def binary_reverse(*args, **kwargs): + """ + Reverse binary input. + + For each binary string in `strings`, return a reversed version. + + This function reverses the binary data at a byte-level. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def replace_substring( strings: _StringScalarT, @@ -1358,8 +3515,59 @@ def replace_substring( options: ReplaceSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def replace_substring(*args, **kwargs): + """ + Replace matching non-overlapping substrings with replacement. + + For each string in `strings`, replace non-overlapping substrings that match + the given literal `pattern` with the given `replacement`. 
+ If `max_replacements` is given and not equal to -1, it limits the + maximum amount replacements per input, counted from the left. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ replace_substring_regex = _clone_signature(replace_substring) +""" +Replace matching non-overlapping substrings with replacement. + +For each string in `strings`, replace non-overlapping substrings that match +the given regular expression `pattern` with the given `replacement`. +If `max_replacements` is given and not equal to -1, it limits the +maximum amount replacements per input, counted from the left. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +replacement : str + What to replace the pattern with. +max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). +options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def utf8_capitalize( @@ -1373,6 +3581,21 @@ def utf8_capitalize( def utf8_capitalize( strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def utf8_capitalize(*args, **kwargs): + """ + Capitalize the first character of input. + + For each string in `strings`, return a capitalized version, + with the first character uppercased and the others lowercased. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def utf8_length( strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None @@ -1405,8 +3628,34 @@ def utf8_length( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def utf8_length(*args, **kwargs): + """ + Compute UTF8 string lengths. + + For each string in `strings`, emit its length in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ utf8_lower = _clone_signature(utf8_capitalize) +""" +Transform input to lowercase. + +For each string in `strings`, return a lowercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def utf8_replace_slice( @@ -1441,11 +3690,89 @@ def utf8_replace_slice( options: ReplaceSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... 
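A brief editorial sketch of the string-transform kernels in this section (binary_length, utf8_length, utf8_capitalize, replace_substring, utf8_replace_slice). It is not part of the patch; it assumes a standard pyarrow installation, and the outputs in comments are indicative.

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["hello", "Straße", None])
pc.utf8_length(s)      # [5, 6, null]  (counted in UTF8 characters)
pc.binary_length(s)    # [5, 7, null]  ("ß" occupies two bytes in UTF-8)
pc.utf8_capitalize(s)  # ["Hello", "Straße", null]
pc.replace_substring(s, pattern="l", replacement="L")       # ["heLLo", "Straße", null]
pc.utf8_replace_slice(s, start=0, stop=1, replacement="J")  # ["Jello", "Jtraße", null]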
+def utf8_replace_slice(*args, **kwargs): + """ + Replace a slice of a string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ utf8_reverse = _clone_signature(utf8_capitalize) +""" +Reverse input. + +For each string in `strings`, return a reversed version. + +This function operates on Unicode codepoints, not grapheme +clusters. Hence, it will not correctly reverse grapheme clusters +composed of multiple codepoints. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_swapcase = _clone_signature(utf8_capitalize) +""" +Transform input lowercase characters to uppercase and uppercase characters to lowercase. + +For each string in `strings`, return an opposite case version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_title = _clone_signature(utf8_capitalize) +""" +Titlecase each word of input. + +For each string in `strings`, return a titlecased version. +Each word in the output will start with an uppercase character and its +remaining characters will be lowercase. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_upper = _clone_signature(utf8_capitalize) +""" +Transform input to uppercase. + +For each string in `strings`, return an uppercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory poo +""" # ========================= 2.12 String padding ========================= @overload @@ -1481,12 +3808,157 @@ def ascii_center( options: PadOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ascii_center(*args, **kwargs): + """ + Center strings by padding with a given character. + + For each string in `strings`, emit a centered string by padding both sides + with the given ASCII character. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). 
+ options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ascii_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_center = _clone_signature(ascii_center) +""" +Center strings by padding with a given character. + +For each string in `strings`, emit a centered string by padding both sides +with the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. 
adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.13 String trimming ========================= @overload @@ -1516,12 +3988,127 @@ def ascii_ltrim( options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ascii_ltrim(*args, **kwargs): + """ + Trim leading characters. + + For each string in `strings`, remove any leading characters + from the `characters` option (as given in TrimOptions). + Null values emit null. + Both the `strings` and the `characters` are interpreted as + ASCII; to trim non-ASCII characters, use `utf8_ltrim`. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + characters : str + Individual characters to be trimmed from the string. + options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ascii_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_rtrim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_trim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
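+
+Examples
+--------
+A minimal illustrative sketch; the sample values and the use of
+``to_pylist()`` to display the result are editorial additions rather than
+part of the upstream pyarrow documentation:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> # trim the ASCII character "x" from both ends of each element
+>>> pc.ascii_trim(pa.array(["xxhelloxx", "worldx", None]), characters="x").to_pylist()
+['hello', 'world', None]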
+""" utf8_ltrim = _clone_signature(ascii_ltrim) +""" +Trim leading characters. + +For each string in `strings`, remove any leading characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def ascii_ltrim_whitespace( @@ -1547,12 +4134,97 @@ def ascii_ltrim_whitespace( options: TrimOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ascii_ltrim_whitespace(*args, **kwargs): + """ + Trim leading ASCII whitespace characters. + + For each string in `strings`, emit a string with leading ASCII whitespace + characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode + whitespace characters. Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with trailing ASCII whitespace +characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with leading and trailing ASCII +whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading whitespace characters. + +For each string in `strings`, emit a string with leading whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing whitespace characters. + +For each string in `strings`, emit a string with trailing whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing whitespace characters. + +For each string in `strings`, emit a string with leading and trailing +whitespace characters removed, where whitespace characters are defined +by the Unicode standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.14 String splitting ========================= @overload @@ -1585,6 +4257,32 @@ def ascii_split_whitespace( options: SplitOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ascii_split_whitespace(*args, **kwargs): + """ + Split string according to any ASCII whitespace. + + Split each string according any non-zero length sequence of ASCII + whitespace characters. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def split_pattern( strings: _StringOrBinaryScalarT, @@ -1618,9 +4316,86 @@ def split_pattern( options: SplitPatternOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def split_pattern(*args, **kwargs): + """ + Split string according to separator. + + Split each string according to the exact `pattern` defined in + SplitPatternOptions. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitPatternOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + String pattern to split on. 
+ max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ split_pattern_regex = _clone_signature(split_pattern) +""" +Split string according to regex pattern. + +Split each string according to the regex `pattern` defined in +SplitPatternOptions. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitPatternOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + String pattern to split on. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" utf8_split_whitespace = _clone_signature(ascii_split_whitespace) +""" +Split string according to any Unicode whitespace. + +Split each string according any non-zero length sequence of Unicode +whitespace characters. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.15 String component extraction ========================= @overload @@ -1650,11 +4425,50 @@ def extract_regex( options: ExtractRegexOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def extract_regex(*args, **kwargs): + """ + Extract substrings captured by a regex pattern. + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. If the input is null or the + regular expression fails matching, a null output value is emitted. + + Regular expression matching is done using the Google RE2 library. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
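+
+    Examples
+    --------
+    A minimal illustrative sketch; the sample values and the simplified
+    ``to_pylist()`` output are editorial additions, not taken from the
+    upstream pyarrow documentation:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> # each match becomes a struct with one field per named capture group
+    >>> arr = pa.array(["alpha-1", "beta-2", "gamma"])
+    >>> pc.extract_regex(arr, pattern="(?P<word>[a-z]+)-(?P<digit>[0-9]+)").to_pylist()
+    [{'word': 'alpha', 'digit': '1'}, {'word': 'beta', 'digit': '2'}, None]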
+ """ # ========================= 2.16 String join ========================= def binary_join( strings, separator, /, *, memory_pool: lib.MemoryPool | None = None -) -> StringScalar | StringArray: ... +) -> StringScalar | StringArray: + """ + Join a list of strings together with a separator. + + Concatenate the strings in `list`. The `separator` is inserted + between each given string. + Any null input and any null `list` element emits a null output. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + separator : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def binary_join_element_wise( *strings: _StringOrBinaryScalarT, @@ -1679,6 +4493,30 @@ def binary_join_element_wise( options: JoinOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def binary_join_element_wise(*args, **kwargs): + """ + Join string arguments together, with the last argument as separator. + + Concatenate the `strings` except for the last one. The last argument + in `strings` is inserted between each given string. + Any null separator element emits a null output. Null elements either + emit a null (the default), are skipped, or replaced with a given string. + + Parameters + ---------- + *strings : Array-like or scalar-like + Argument to compute function. + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + options : pyarrow.compute.JoinOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.17 String Slicing ========================= @overload @@ -1714,6 +4552,35 @@ def binary_slice( options: SliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def binary_slice(*args, **kwargs): + """ + Slice binary string. + + For each binary string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + bytes. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def utf8_slice_codeunits( strings: _StringScalarT, @@ -1747,6 +4614,34 @@ def utf8_slice_codeunits( options: SliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def utf8_slice_codeunits(*args, **kwargs): + """ + Slice string. 
+ + For each string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + UTF8 codeunits. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.18 Containment tests ========================= @overload @@ -1805,8 +4700,49 @@ def count_substring( options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def count_substring(*args, **kwargs): + """ + Count occurrences of substring. + + For each string in `strings`, emit the number of occurrences of the given + literal pattern. + Null inputs emit null. The pattern must be given in MatchSubstringOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ count_substring_regex = _clone_signature(count_substring) +""" +Count occurrences of substring. + +For each string in `strings`, emit the number of occurrences of the given +regular expression pattern. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def ends_with( @@ -1838,9 +4774,72 @@ def ends_with( options: MatchSubstringOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ends_with(*args, **kwargs): + """ + Check if strings end with a literal pattern. + + For each string in `strings`, emit true iff it ends with a given pattern. + The pattern must be given in MatchSubstringOptions. + If ignore_case is set, only simple case folding is performed. + + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
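+
+    Examples
+    --------
+    A minimal illustrative sketch; the sample values and the use of
+    ``to_pylist()`` to display the result are editorial additions, not
+    taken from the upstream pyarrow documentation:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> # null inputs propagate as None
+    >>> pc.ends_with(pa.array(["disarm", "army", None]), pattern="arm").to_pylist()
+    [True, False, None]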
+ """ find_substring = _clone_signature(count_substring) +""" +Find first occurrence of substring. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" find_substring_regex = _clone_signature(count_substring) +""" +Find location of first match of regex pattern. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def index_in( @@ -1872,6 +4871,32 @@ def index_in( options: SetLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def index_in(*args, **kwargs): + """ + Return index of each element in a set of values. + + For each element in `values`, return its index in a given set of + values, or null if it is not found there. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def is_in( values: lib.Scalar, @@ -1902,11 +4927,123 @@ def is_in( options: SetLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def is_in(*args, **kwargs): + """ + Find each element in a set of values. + + For each element in `values`, return true if it is found in a given + set of values, false otherwise. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. 
+ options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ match_like = _clone_signature(ends_with) +""" +Match strings against SQL-style LIKE pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. '%' will match any number of characters, '_' will +match exactly one character, and any other character matches itself. +To match a literal '%', '_', or '\', precede the character with a backslash. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" match_substring = _clone_signature(ends_with) +""" +Match strings against literal pattern. + +For each string in `strings`, emit true iff it contains a given pattern. +Null inputs emit null. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" match_substring_regex = _clone_signature(ends_with) +""" +Match strings against regex pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" starts_with = _clone_signature(ends_with) +""" +Check if strings start with a literal pattern. + +For each string in `strings`, emit true iff it starts with a given pattern. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
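+
+Examples
+--------
+A minimal illustrative sketch; the sample values and the use of
+``to_pylist()`` to display the result are editorial additions rather than
+part of the upstream pyarrow documentation:
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> # case-insensitive matching can be requested with ignore_case=True
+>>> pc.starts_with(pa.array(["Armchair", "charm", None]), pattern="arm", ignore_case=True).to_pylist()
+[True, False, None]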
+""" # ========================= 2.19 Categorizations ========================= @overload @@ -1921,9 +5058,47 @@ def is_finite( def is_finite( values: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def is_finite(*args, **kwargs): + """ + Return true if value is finite. + + For each input value, emit true iff the value is finite + (i.e. neither NaN, inf, nor -inf). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ is_inf = _clone_signature(is_finite) +""" +Return true if infinity. + +For each input value, emit true iff the value is infinite (inf or -inf). + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" is_nan = _clone_signature(is_finite) +""" +Return true if NaN. + +For each input value, emit true iff the value is NaN. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def is_null( @@ -1952,6 +5127,25 @@ def is_null( options: NullOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def is_null(*args, **kwargs): + """ + Return true if null (and optionally NaN). + + For each input value, emit true iff the value is null. + True may also be emitted for NaN values by setting the `nan_is_null` flag. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + options : pyarrow.compute.NullOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def is_valid( values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None @@ -1964,17 +5158,152 @@ def is_valid( def is_valid( values: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def is_valid(*args, **kwargs): + """ + Return true if non-null. + + For each input value, emit true iff the value is valid (i.e. non-null). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ true_unless_null = _clone_signature(is_valid) +""" +Return true if non-null, else return null. + +For each input value, emit true iff the value +is valid (non-null), otherwise emit null. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.20 Selecting / multiplexing ========================= -def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): ... -def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): ... +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): + """ + Choose values based on multiple conditions. + + `cond` must be a struct of Boolean values. 
`cases` can be a mix + of scalar and array arguments (of any type, but all must be the + same type or castable to a common type), with either exactly one + datum per child of `cond`, or one more `cases` than children of + `cond` (in which case we have an "else" value). + + Each row of the output will be the corresponding value of the + first datum in `cases` for which the corresponding child of `cond` + is true, or otherwise the "else" value (if given), or null. + + Essentially, this implements a switch-case or if-else, if-else... statement. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + *cases : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): + """ + Choose values from several arrays. + + For each row, the value of the first argument is used as a 0-based index + into the list of `values` arrays (i.e. index 0 selects the first of the + `values` arrays). The output value is the corresponding value of the + selected argument. + + If an index is null, the output will be null. + + Parameters + ---------- + indices : Array-like or scalar-like + Argument to compute function. + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def coalesce( *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None -) -> _ScalarOrArrayT: ... +) -> _ScalarOrArrayT: + """ + Select the first non-null value. + + Each row of the output will be the value from the first corresponding input + for which the value is not null. If all inputs are null in a row, the output + will be null. + + Parameters + ---------- + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ fill_null = coalesce +"""Replace each null element in values with a corresponding +element from fill_value. + +If fill_value is scalar-like, then every null element in values +will be replaced with fill_value. If fill_value is array-like, +then the i-th element in values will be replaced with the i-th +element in fill_value. + +The fill_value's type must be the same as that of values, or it +must be able to be implicitly casted to the array's type. + +This is an alias for :func:`coalesce`. + +Parameters +---------- +values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. +fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. + +Returns +------- +result : depends on inputs + Values with all null elements replaced + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array([1, 2, None, 3], type=pa.int8()) +>>> fill_value = pa.scalar(5, type=pa.int8()) +>>> arr.fill_null(fill_value) + +[ + 1, + 2, + 5, + 3 +] +>>> arr = pa.array([1, 2, None, 4, None]) +>>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + +[ + 1, + 2, + 30, + 4, + 50 +] +""" def if_else( cond: ArrayLike | ScalarLike, @@ -1983,7 +5312,25 @@ def if_else( /, *, memory_pool: lib.MemoryPool | None = None, -) -> ArrayLike | ScalarLike: ... +) -> ArrayLike | ScalarLike: + """ + Choose values based on a condition. 
+ + `cond` must be a Boolean scalar/ array. + `left` or `right` must be of the same type scalar/ array. + `null` values in `cond` will be promoted to the output. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + left : Array-like or scalar-like + Argument to compute function. + right : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.21 Structural transforms ========================= @@ -2015,6 +5362,22 @@ def list_value_length( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def list_value_length(*args, **kwargs): + """ + Compute list lengths. + + `lists` must have a list-like type. + For each non-null value in `lists`, its length is emitted. + Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def make_struct( *args: lib.Scalar, @@ -2042,6 +5405,29 @@ def make_struct( options: MakeStructOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def make_struct(*args, **kwargs): + """ + Wrap Arrays into a StructArray. + + Names of the StructArray's fields are + specified through MakeStructOptions. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + options : pyarrow.compute.MakeStructOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.22 Conversions ========================= @overload @@ -2119,9 +5505,163 @@ def ceil_temporal( options: RoundTemporalOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def ceil_temporal(*args, **kwargs): + """ + Round temporal values up to nearest multiple of specified time unit. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. 
By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ floor_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values down to nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" round_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values to the nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def cast( @@ -2147,6 +5687,61 @@ def cast( options: CastOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +def cast(*args, **kwargs): + """ + Cast array values to another data type. Can also be invoked as an array + instance method. + + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. 
+ + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp("ms")) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp("ms")).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast("timestamp[ms]") + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast("timestamp[ms]").type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + @overload def strftime( timestamps: TemporalScalar, @@ -2177,6 +5772,35 @@ def strftime( options: StrftimeOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def strftime(*args, **kwargs): + """ + Format temporal values according to a format string. + + For each input value, emit a formatted string. + The time format string and locale can be set using StrftimeOptions. + The output precision of the "%S" (seconds) format code depends on + the input time precision: it is an integer for timestamps with + second precision, a real number with the required number of fractional + digits for higher precisions. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database, or if the specified locale + does not exist on this system. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + options : pyarrow.compute.StrftimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def strptime( strings: StringScalar, @@ -2210,6 +5834,34 @@ def strptime( options: StrptimeOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def strptime(*args, **kwargs): + """ + Parse timestamps. + + For each string in `strings`, parse it as a timestamp. + The timestamp unit and the expected string pattern must be given + in StrptimeOptions. Null inputs emit null. If a non-null string + fails parsing, an error is returned by default. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + options : pyarrow.compute.StrptimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.23 Temporal component extraction ========================= @overload @@ -2222,6 +5874,22 @@ def day( ) -> lib.Int64Array: ... 
@overload def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def day(*args, **kwargs): + """ + Extract day number. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def day_of_week( values: TemporalScalar, @@ -2252,8 +5920,50 @@ def day_of_week( options: DayOfWeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def day_of_week(*args, **kwargs): + """ + Extract day of the week number. + + By default, the week starts on Monday represented by 0 and ends on Sunday + represented by 6. + `DayOfWeekOptions.week_start` can be used to set another starting day using + the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). + Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ day_of_year = _clone_signature(day) +""" +Extract day of year number. + +January 1st maps to day number 1, February 1st to 32, etc. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def hour( @@ -2281,6 +5991,22 @@ def hour( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def hour(*args, **kwargs): + """ + Extract hour value. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def is_dst( values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None @@ -2294,6 +6020,23 @@ def is_dst( ) -> lib.BooleanArray: ... @overload def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def is_dst(*args, **kwargs): + """ + Extracts if currently observing daylight savings. + + IsDaylightSavings returns true if a timestamp has a daylight saving + offset in the given timezone. + Null values emit null. + An error is returned if the values do not have a defined timezone. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def iso_week( values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None @@ -2309,8 +6052,41 @@ def iso_week( def iso_week( values: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def iso_week(*args, **kwargs): + """ + Extract ISO week of year number. + + First ISO week has the majority (4 or more) of its days in January. + ISO week starts on Monday. The week number starts with 1 and can run + up to 53. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ iso_year = _clone_signature(iso_week) +""" +Extract ISO year number. + +First week of an ISO year has the majority (4 or more) of its days in January. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" @overload def is_leap_year( @@ -2338,18 +6114,199 @@ def is_leap_year( *, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def is_leap_year(*args, **kwargs): + """ + Extract if year is a leap year. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ microsecond = _clone_signature(iso_week) +""" +Extract microsecond values. + +Microsecond returns number of microseconds since the last full millisecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" millisecond = _clone_signature(iso_week) +""" +Extract millisecond values. + +Millisecond returns number of milliseconds since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" minute = _clone_signature(iso_week) +""" +Extract minute values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" month = _clone_signature(day_of_week) +""" +Extract month number. + +Month is encoded as January=1, December=12. +Null values emit null. 
+An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" nanosecond = _clone_signature(hour) +""" +Extract nanosecond values. + +Nanosecond returns number of nanoseconds since the last full microsecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" quarter = _clone_signature(day_of_week) +""" +Extract quarter of year number. + +First quarter maps to 1 and fourth quarter maps to 4. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" second = _clone_signature(hour) +""" +Extract second values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" subsecond = _clone_signature(hour) +""" +Extract subsecond values. + +Subsecond returns the fraction of a second since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" us_week = _clone_signature(iso_week) +""" +Extract US week of year number. + +First US week has the majority (4 or more) of its days in January. +US week starts on Monday. The week number starts with 1 and can run +up to 53. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" us_year = _clone_signature(iso_week) +""" +Extract US epidemiological year number. + +First week of US epidemiological year has the majority (4 or more) of +its days in January. Last week of US epidemiological year has the +year's last Wednesday in it. US epidemiological week starts on Sunday. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" year = _clone_signature(iso_week) +""" +Extract year number. + +Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""

 @overload
 def week(
@@ -2384,6 +6341,37 @@ def week(
     options: WeekOptions | None = None,
     memory_pool: lib.MemoryPool | None = None,
 ) -> Expression: ...
+def week(*args, **kwargs):
+    """
+    Extract week of year number.
+
+    First week has the majority (4 or more) of its days in January.
+    Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using
+    DayOfWeekOptions.count_from_zero.
+    An error is returned if the values have a defined timezone but it
+    cannot be found in the timezone database.
+
+    Parameters
+    ----------
+    values : Array-like or scalar-like
+        Argument to compute function.
+    week_starts_monday : bool, default True
+        If True, weeks start on Monday; if False, on Sunday.
+    count_from_zero : bool, default False
+        If True, dates at the start of a year that fall into the last week
+        of the previous year emit 0.
+        If False, they emit 52 or 53 (the week number of the last week
+        of the previous year).
+    first_week_is_fully_in_year : bool, default False
+        If True, week number 0 is fully in January.
+        If False, a week that begins on December 29, 30 or 31 is considered
+        to be week number 0 of the following year.
+    options : pyarrow.compute.WeekOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
 @overload
 def year_month_day(
     values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None
@@ -2396,26 +6384,241 @@ def year_month_day(
 def year_month_day(
     values: Expression, /, *, memory_pool: lib.MemoryPool | None = None
 ) -> Expression: ...
+def year_month_day(*args, **kwargs):
+    """
+    Extract (year, month, day) struct.
+
+    Null values emit null.
+    An error is returned if the values have a defined timezone but it
+    cannot be found in the timezone database.
+
+    Parameters
+    ----------
+    values : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """

 # ========================= 2.24 Temporal difference =========================

-def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ...
+def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None):
+    """
+    Compute the number of days and milliseconds between two timestamps.
+
+    Returns the number of days and milliseconds from `start` to `end`.
+    That is, first the difference in days is computed as if both
+    timestamps were truncated to the day, then the difference between the
+    times of the two timestamps is computed as if both times were truncated
+    to the millisecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
 def days_between(
     start, end, /, *, memory_pool: lib.MemoryPool | None = None
-) -> lib.Int64Scalar | lib.Int64Array: ...
+) -> lib.Int64Scalar | lib.Int64Array:
+    """
+    Compute the number of days between two timestamps.
+
+    Returns the number of day boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the day.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """

 hours_between = _clone_signature(days_between)
+"""
+Compute the number of hours between two timestamps.
+
+Returns the number of hour boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the hour.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 microseconds_between = _clone_signature(days_between)
+"""
+Compute the number of microseconds between two timestamps.
+
+Returns the number of microsecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the microsecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 milliseconds_between = _clone_signature(days_between)
+"""
+Compute the number of millisecond boundaries between two timestamps.
+
+Returns the number of millisecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the millisecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 minutes_between = _clone_signature(days_between)
+"""
+Compute the number of minute boundaries between two timestamps.
+
+Returns the number of minute boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the minute.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""

 def month_day_nano_interval_between(
     start, end, /, *, memory_pool: lib.MemoryPool | None = None
-) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ...
-def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): ...
+) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray:
+    """
+    Compute the number of months, days and nanoseconds between two timestamps.
+ + Returns the number of months, days, and nanoseconds from `start` to `end`. + That is, first the difference in months is computed as if both timestamps + were truncated to the months, then the difference between the days + is computed, and finally the difference between the times of the two + timestamps is computed as if both times were truncated to the nanosecond. + Null values return null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of months between two timestamps. + + Returns the number of month boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the month. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ nanoseconds_between = _clone_signature(days_between) +""" +Compute the number of nanoseconds between two timestamps. + +Returns the number of nanosecond boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the nanosecond. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" quarters_between = _clone_signature(days_between) +""" +Compute the number of quarters between two timestamps. + +Returns the number of quarter start boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the quarter. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" seconds_between = _clone_signature(days_between) +""" +Compute the number of seconds between two timestamps. + +Returns the number of second boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the second. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" def weeks_between( start, @@ -2426,9 +6629,50 @@ def weeks_between( week_start: int = 1, options: DayOfWeekOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array: ... +) -> lib.Int64Scalar | lib.Int64Array: + """ + Compute the number of weeks between two timestamps. + + Returns the number of week boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the week. + Null values emit null. 
+ + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ years_between = _clone_signature(days_between) +""" +Compute the number of years between two timestamps. + +Returns the number of year boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the year. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" # ========================= 2.25 Timezone handling ========================= @overload @@ -2464,6 +6708,37 @@ def assume_timezone( options: AssumeTimezoneOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def assume_timezone(*args, **kwargs): + """ + Convert naive timestamp to timezone-aware timestamp. + + Input timestamps are assumed to be relative to the timezone given in the + `timezone` option. They are converted to UTC-relative timestamps and + the output type has its timezone set to the value of the `timezone` + option. Null values emit null. + This function is meant to be used when an external system produces + "timezone-naive" timestamps which need to be converted to + "timezone-aware" timestamps. An error is returned if the timestamps + already have a defined timezone. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + options : pyarrow.compute.AssumeTimezoneOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def local_timestamp( timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None @@ -2479,6 +6754,24 @@ def local_timestamp( def local_timestamp( timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None ) -> Expression: ... +def local_timestamp(*args, **kwargs): + """ + Convert timestamp to a timezone-naive local time timestamp. + + LocalTimestamp converts timezone-aware timestamp to local timestamp + of the given timestamp's timezone and removes timezone metadata. + Alternative name for this timestamp is also wall clock time. + If input is in UTC or without timezone, then unchanged input values + without timezone metadata are returned. + Null values emit null. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 2.26 Random number generation ========================= def random( @@ -2487,7 +6780,28 @@ def random( initializer: Literal["system"] | int = "system", options: RandomOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: ... +) -> lib.DoubleArray: + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. + + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 3. Array-wise (“vector”) functions ========================= @@ -2512,14 +6826,168 @@ def cumulative_sum( options: CumulativeSumOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def cumulative_sum(*args, **kwargs): + """ + Compute the cumulative sum over a numeric input. + + `values` must be numeric. Return an array/chunked array which is the + cumulative sum computed over `values`. Results will wrap around on + integer overflow. Use function "cumulative_sum_checked" if you want + overflow to return an error. The default start is 0. + + Parameters + ---------- + values : Array-like + Argument to compute function. + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ cumulative_sum_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative sum over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative sum computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_sum". The default start is 0. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" cumulative_prod = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. Results will wrap around on +integer overflow. 
Use function "cumulative_prod_checked" if you want
+overflow to return an error. The default start is 1.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 cumulative_prod_checked = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative product over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative product computed over `values`. This function returns an error
+on overflow. For a variant that doesn't fail on overflow, use
+function "cumulative_prod". The default start is 1.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 cumulative_max = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative max over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative max computed over `values`. The default start is the minimum
+value of input type (so that any other value will replace the
+start as the new maximum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 cumulative_min = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative min over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative min computed over `values`. The default start is the maximum
+value of input type (so that any other value will replace the
+start as the new minimum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
 cumulative_mean = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative mean over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative mean computed over `values`.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""

 # ========================= 3.2 Associative transforms =========================

 @overload
@@ -2617,6 +7085,45 @@ def drop_null(

 filter = array_filter
 take = array_take
+"""
+Select values (or records) from array- or table-like data given integer
+selection indices.
+
+The result will be of the same type(s) as the input, with elements taken
+from the input array (or record batch / table fields) at the given
+indices. If an index is null then the corresponding value in the output
+will be null.
+
+Parameters
+----------
+data : Array, ChunkedArray, RecordBatch, or Table
+indices : Array, ChunkedArray
+    Must be of integer type
+boundscheck : boolean, default True
+    Whether to boundscheck the indices. If False and there is an out of
+    bounds index, will likely cause the process to crash.
+memory_pool : MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+
+Returns
+-------
+result : depends on inputs
+    Selected values for the given indices
+
+Examples
+--------
+>>> import pyarrow as pa
+>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
+>>> indices = pa.array([0, None, 4, 3])
+>>> arr.take(indices)
+
+[
+  "a",
+  null,
+  "e",
+  null
+]
+"""

 # ========================= 3.4 Containment tests =========================

 @overload
@@ -2637,6 +7144,20 @@ def indices_nonzero(
     *,
     memory_pool: lib.MemoryPool | None = None,
 ) -> Expression: ...
+def indices_nonzero(*args, **kwargs):
+    """
+    Return the indices of the values in the array that are non-zero.
+
+    For each input value, check if it's zero, false or null. Emit the index
+    of the value in the array if it's none of those.
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """

 # ========================= 3.5 Sorts and partitions =========================

 @overload
@@ -2659,6 +7180,34 @@ def array_sort_indices(
     options: ArraySortOptions | None = None,
     memory_pool: lib.MemoryPool | None = None,
 ) -> Expression: ...
+def array_sort_indices(*args, **kwargs):
+    """
+    Return the indices that would sort an array.
+
+    This function computes an array of indices that define a stable sort
+    of the input array. By default, null values are considered greater
+    than any other value and are therefore sorted at the end of the array.
+    For floating-point types, NaNs are considered greater than any
+    other non-null value, but smaller than null values.
+
+    The handling of nulls and NaNs can be changed in ArraySortOptions.
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    order : str, default "ascending"
+        Which order to sort values in.
+        Accepted values are "ascending", "descending".
+ null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.ArraySortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def partition_nth_indices( array: lib.Array | lib.ChunkedArray, @@ -2679,6 +7228,40 @@ def partition_nth_indices( options: PartitionNthOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def partition_nth_indices(*args, **kwargs): + """ + Return the indices that would partition an array around a pivot. + + This functions computes an array of indices that define a non-stable + partial sort of the input array. + + The output is such that the `N`'th index points to the `N`'th element + of the input in sorted order, and all indices before the `N`'th point + to elements in the input less or equal to elements at or after the `N`'th. + + By default, null values are considered greater than any other value + and are therefore partitioned towards the end of the array. + For floating-point types, NaNs are considered greater than any + other non-null value, but smaller than null values. + + The pivot index `N` must be given in PartitionNthOptions. + The handling of nulls and NaNs can also be changed in PartitionNthOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.PartitionNthOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def rank( input: lib.Array | lib.ChunkedArray, /, @@ -2688,7 +7271,50 @@ def rank( tiebreaker: Literal["min", "max", "first", "dense"] = "first", options: RankOptions | None = None, memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... +) -> lib.UInt64Array: + """ + Compute ordinal ranks of an array (1-based). + + This function computes a rank of the input array. + By default, null values are considered greater than any other value and + are therefore sorted at the end of the input. For floating-point types, + NaNs are considered greater than any other non-null value, but smaller + than null values. The default tiebreaker is to assign ranks in order of + when ties appear in the input. + + The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. 
+ - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + options : pyarrow.compute.RankOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def select_k_unstable( input: lib.Array | lib.ChunkedArray, @@ -2709,6 +7335,36 @@ def select_k_unstable( options: SelectKOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def select_k_unstable(*args, **kwargs): + """ + Select the indices of the first `k` ordered elements from the input. + + This function selects an array of indices of the first `k` ordered elements + from the `input` array, record batch or table specified in the column keys + (`options.sort_keys`). Output is not guaranteed to be stable. + Null values are considered greater than any other value and are + therefore ordered at the end. For floating-point types, NaNs are considered + greater than any other non-null value, but smaller than null values. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + options : pyarrow.compute.SelectKOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def sort_indices( input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, @@ -2729,6 +7385,36 @@ def sort_indices( options: SortOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array, record batch or table. + + This function computes an array of indices that define a stable sort + of the input array, record batch or table. By default, null values are + considered greater than any other value and are therefore sorted at the + end of the input. For floating-point types, NaNs are considered greater + than any other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in SortOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.SortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
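+
+    Examples
+    --------
+    A minimal illustrative sketch (output shown via ``to_pylist`` for brevity):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> t = pa.table({"x": [3, 1, 2]})
+    >>> pc.sort_indices(t, sort_keys=[("x", "ascending")]).to_pylist()
+    [1, 2, 0]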
+ """ # ========================= 3.6 Structural transforms ========================= @overload @@ -2759,6 +7445,24 @@ def list_element( *, memory_pool: lib.MemoryPool | None = None, ) -> _DataTypeT: ... +def list_element(*args, **kwargs): + """ + Compute elements using of nested list values using an index. + + `lists` must have a list-like type. + For each value in each list of `lists`, the element at `index` + is emitted. Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + index : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def list_flatten( lists: Expression, @@ -2777,6 +7481,32 @@ def list_flatten( options: ListFlattenOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.ListArray[Any]: ... +def list_flatten(*args, **kwargs): + """ + Flatten list values. + + `lists` must have a list-like type (lists, list-views, and + fixed-size lists). + Return an array with the top list level flattened unless + `recursive` is set to true in ListFlattenOptions. When that + is that case, flattening happens recursively until a non-list + array is formed. + + Null list values do not emit anything to the output. + + Parameters + ---------- + lists : Array-like + Argument to compute function. + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + options : pyarrow.compute.ListFlattenOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def list_parent_indices( lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None @@ -2785,6 +7515,22 @@ def list_parent_indices( def list_parent_indices( lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None ) -> lib.Int64Array: ... +def list_parent_indices(*args, **kwargs): + """ + Compute parent indices of nested list values. + + `lists` must have a list-like or list-view type. + For each value in each list of `lists`, the top-level list index + is emitted. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + @overload def list_slice( lists: Expression, @@ -2809,6 +7555,36 @@ def list_slice( options: ListSliceOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> lib.ListArray[Any]: ... +def list_slice(*args, **kwargs): + """ + Compute slice of list-like array. + + `lists` must have a list-like type. + For each list element, compute a slice, returning a new list array. + A variable or fixed size list array is returned, depending on options. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. 
The default of `None` will return the same + type which was passed in. + options : pyarrow.compute.ListSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def map_lookup( container, /, @@ -2817,7 +7593,29 @@ def map_lookup( *, options: MapLookupOptions | None = None, memory_pool: lib.MemoryPool | None = None, -): ... +): + """ + Find the items corresponding to a given key in a Map. + + For a given query key (passed via MapLookupOptions), extract + either the FIRST, LAST or ALL items from a Map that have + matching keys. + + Parameters + ---------- + container : Array-like or scalar-like + Argument to compute function. + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + options : pyarrow.compute.MapLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + def struct_field( values, /, @@ -2825,9 +7623,63 @@ def struct_field( *, options: StructFieldOptions | None = None, memory_pool: lib.MemoryPool | None = None, -): ... -def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... -def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): ... +): + """ + Extract children of a struct or union by index. + + Given a list of indices (passed via StructFieldOptions), extract + the child array or scalar with the given child index, recursively. + + For union inputs, nulls are emitted for union values that reference + a different child than specified. Also, the indices are always + in physical order, not logical type codes - for example, the first + child is always index 0. + + An empty list of indices returns the argument unchanged. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + options : pyarrow.compute.StructFieldOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values backward to fill null slots. + + Given an array, propagate next valid observation backward to previous valid + or nothing if all next values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values forward to fill null slots. + + Given an array, propagate last valid observation forward to next valid + or nothing if all previous values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
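+
+    Examples
+    --------
+    A minimal illustrative sketch of the expected behaviour:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.fill_null_forward(pa.array([1, None, 3, None])).to_pylist()
+    [1, 1, 3, 3]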
+ """ + def replace_with_mask( values, mask: list[bool] | list[bool | None] | BooleanArray, @@ -2835,7 +7687,28 @@ def replace_with_mask( /, *, memory_pool: lib.MemoryPool | None = None, -): ... +): + """ + Replace items selected with a mask. + + Given an array and a boolean mask (either scalar or of equal length), + along with replacement values (either scalar or array), + each element of the array for which the corresponding mask element is + true will be replaced by the next value from the replacements, + or with null if the mask is null. + Hence, for replacement arrays, len(replacements) == sum(mask == true). + + Parameters + ---------- + values : Array-like + Argument to compute function. + mask : Array-like + Argument to compute function. + replacements : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ # ========================= 3.7 Pairwise functions ========================= @overload @@ -2856,5 +7729,51 @@ def pairwise_diff( options: PairwiseOptions | None = None, memory_pool: lib.MemoryPool | None = None, ) -> Expression: ... +def pairwise_diff(*args, **kwargs): + """ + Compute first order difference of an array. + + Computes the first order difference of an array, It internally calls + the scalar function "subtract" to compute + differences, so its + behavior and supported types are the same as + "subtract". The period can be specified in :struct:`PairwiseOptions`. + + Results will wrap around on integer overflow. Use function + "pairwise_diff_checked" if you want overflow to return an error. + + Parameters + ---------- + input : Array-like + Argument to compute function. + period : int, default 1 + Period for applying the period function. + options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ pairwise_diff_checked = _clone_signature(pairwise_diff) +""" +Compute first order difference of an array. + +Computes the first order difference of an array, It internally calls +the scalar function "subtract_checked" (or the checked variant) to compute +differences, so its behavior and supported types are the same as +"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. + +This function returns an error on overflow. For a variant that doesn't +fail on overflow, use function "pairwise_diff". + +Parameters +---------- +input : Array-like + Argument to compute function. +period : int, default 1 + Period for applying the period function. +options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" diff --git a/pyarrow-stubs/interchange/buffer.pyi b/pyarrow-stubs/interchange/buffer.pyi index 50bbd3a1238..46673961a75 100644 --- a/pyarrow-stubs/interchange/buffer.pyi +++ b/pyarrow-stubs/interchange/buffer.pyi @@ -3,6 +3,8 @@ import enum from pyarrow.lib import Buffer class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + CPU = 1 CUDA = 2 CPU_PINNED = 3 @@ -13,10 +15,44 @@ class DlpackDeviceType(enum.IntEnum): ROCM = 10 class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. 
However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... @property - def bufsize(self) -> int: ... + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ @property - def ptr(self) -> int: ... - def __dlpack__(self): ... - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ diff --git a/pyarrow-stubs/interchange/column.pyi b/pyarrow-stubs/interchange/column.pyi index fd6600a604b..e6662867b6b 100644 --- a/pyarrow-stubs/interchange/column.pyi +++ b/pyarrow-stubs/interchange/column.pyi @@ -7,6 +7,27 @@ from pyarrow.lib import Array, ChunkedArray from .buffer import _PyArrowBuffer class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + INT = 0 UINT = 1 FLOAT = 2 @@ -18,6 +39,23 @@ class DtypeKind(enum.IntEnum): Dtype: TypeAlias = tuple[DtypeKind, int, str, str] class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + NON_NULLABLE = 0 USE_NAN = 1 USE_SENTINEL = 2 @@ -40,23 +78,175 @@ class Endianness(enum.Enum): NATIVE = "=" NA = "|" -class NoBufferPresent(Exception): ... +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. 
+ Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... - def size(self) -> int: ... + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ @property - def offset(self) -> int: ... + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: ... + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. 
an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ @property - def describe_categorical(self) -> CategoricalDescription: ... + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ @property - def describe_null(self) -> tuple[ColumnNullType, Any]: ... + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ @property - def null_count(self) -> int: ... + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ @property - def metadata(self) -> dict[str, Any]: ... - def num_chunks(self) -> int: ... - def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... - def get_buffers(self) -> ColumnBuffers: ... + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. 
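+
+        Example (an illustrative sketch; it assumes all three keys are
+        always present, as described above, and a pyarrow version that
+        implements the dataframe interchange protocol):
+
+        >>> import pyarrow as pa
+        >>> col = pa.table({"a": [1, None, 3]}).__dataframe__().get_column(0)
+        >>> sorted(col.get_buffers())
+        ['data', 'offsets', 'validity']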
+ """ diff --git a/pyarrow-stubs/interchange/dataframe.pyi b/pyarrow-stubs/interchange/dataframe.pyi index 91cb6e70d7a..526a58926a9 100644 --- a/pyarrow-stubs/interchange/dataframe.pyi +++ b/pyarrow-stubs/interchange/dataframe.pyi @@ -10,21 +10,93 @@ from pyarrow.interchange.column import _PyArrowColumn from pyarrow.lib import RecordBatch, Table class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + def __init__( self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True ) -> None: ... def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: ... + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ @property - def metadata(self) -> dict[str, Any]: ... - def num_columns(self) -> int: ... - def num_rows(self) -> int: ... - def num_chunks(self) -> int: ... - def column_names(self) -> Iterable[str]: ... - def get_column(self, i: int) -> _PyArrowColumn: ... - def get_column_by_name(self, name: str) -> _PyArrowColumn: ... - def get_columns(self) -> Iterable[_PyArrowColumn]: ... - def select_columns(self, indices: Sequence[int]) -> Self: ... - def select_columns_by_name(self, names: Sequence[str]) -> Self: ... - def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. 
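+
+        Example (an illustrative sketch; assumes a pyarrow version where
+        ``Table.__dataframe__`` is available):
+
+        >>> import pyarrow as pa
+        >>> df = pa.table({"n_legs": [2, 4, 4]}).__dataframe__()
+        >>> df.get_column_by_name("n_legs").size()
+        3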
+ """ + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + def select_columns(self, indices: Sequence[int]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + def select_columns_by_name(self, names: Sequence[str]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. + """ diff --git a/pyarrow-stubs/interchange/from_dataframe.pyi b/pyarrow-stubs/interchange/from_dataframe.pyi index 352bead7f25..b04b6268975 100644 --- a/pyarrow-stubs/interchange/from_dataframe.pyi +++ b/pyarrow-stubs/interchange/from_dataframe.pyi @@ -14,15 +14,126 @@ class DataFrameObject(Protocol): ColumnObject: TypeAlias = Any -def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... -def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... -def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... -def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_attendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... } + ... ) + >>> df + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_attendees: int64 + country: large_string + ---- + n_attendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + """ + +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Array + """ + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + def categorical_column_to_dictionary( col: ColumnObject, allow_copy: bool = True -) -> DictionaryArray: ... -def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... -def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... +) -> DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.DictionaryArray + """ + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: + """Parse datetime `format_str` to interpret the `data`.""" + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: + """Map column date type to pyarrow date type.""" + def buffers_to_array( buffers: ColumnBuffers, data_type: tuple[DtypeKind, int, str, str], @@ -30,7 +141,40 @@ def buffers_to_array( describe_null: ColumnNullType, offset: int = 0, allow_copy: bool = True, -) -> Array: ... +) -> Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. + length : int + The number of values in the array. + describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. + """ + def validity_buffer_from_mask( validity_buff: Buffer, validity_dtype: Dtype, @@ -38,7 +182,33 @@ def validity_buffer_from_mask( length: int, offset: int = 0, allow_copy: bool = True, -) -> Buffer: ... +) -> Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Buffer + """ + def validity_buffer_nan_sentinel( data_pa_buffer: Buffer, data_type: Dtype, @@ -46,4 +216,29 @@ def validity_buffer_nan_sentinel( length: int, offset: int = 0, allow_copy: bool = True, -) -> Buffer: ... +) -> Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. + + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi index d6b6f7e51c7..1698b55520b 100644 --- a/pyarrow-stubs/lib.pyi +++ b/pyarrow-stubs/lib.pyi @@ -22,9 +22,45 @@ class MonthDayNano(NamedTuple): months: int nanoseconds: int -def cpu_count() -> int: ... -def set_cpu_count(count: int) -> None: ... -def is_threading_enabled() -> bool: ... +def cpu_count() -> int: + """ + Return the number of threads to use in parallel operations. + + The number of threads is determined at startup by inspecting the + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. + + See Also + -------- + set_cpu_count : Modify the size of this pool. + io_thread_count : The analogous function for the I/O thread pool. + """ + +def set_cpu_count(count: int) -> None: + """ + Set the number of threads to use in parallel operations. + + Parameters + ---------- + count : int + The number of concurrent threads that should be used. + + See Also + -------- + cpu_count : Get the size of this pool. + set_io_thread_count : The analogous function for the I/O thread pool. + """ + +def is_threading_enabled() -> bool: + """ + Returns True if threading is enabled in libarrow. + + If it isn't enabled, then python shouldn't create any + threads either, because we're probably on a system where + threading doesn't work (e.g. Emscripten). + """ Type_NA: int Type_BOOL: int diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index a4696a69297..2eba8d40a11 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -13,46 +13,149 @@ from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + """ + reader: _orc.ORCReader def __init__(self, source: StrPath | NativeFile | IO) -> None: ... @property - def metadata(self) -> KeyValueMetadata: ... + def metadata(self) -> KeyValueMetadata: + """The file metadata, as an arrow KeyValueMetadata""" @property - def schema(self) -> Schema: ... + def schema(self) -> Schema: + """The file schema, as an arrow schema""" @property - def nrows(self) -> int: ... + def nrows(self) -> int: + """The number of rows in the file""" @property - def nstripes(self) -> int: ... 
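A rough usage sketch for the ORCFile reader documented here, together with the module-level ``write_table`` that appears later in this stub; the file name and data are illustrative:

    import pyarrow as pa
    from pyarrow import orc

    table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
    orc.write_table(table, "example.orc")  # assumed scratch path

    f = orc.ORCFile("example.orc")
    print(f.nrows, f.nstripes, f.compression)  # 3 1 UNCOMPRESSED (default codec)
    print(f.read(columns=["x"]))               # an arrow Table with only column "x"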
+ def nstripes(self) -> int: + """The number of stripes in the file""" @property - def file_version(self) -> str: ... + def file_version(self) -> str: + """Format version of the ORC file, must be 0.11 or 0.12""" @property - def software_version(self) -> str: ... + def software_version(self) -> str: + """Software instance and version that wrote this file""" @property - def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: + """Compression codec of the file""" @property - def compression_size(self) -> int: ... + def compression_size(self) -> int: + """Number of bytes to buffer for the compression codec in the file""" @property - def writer(self) -> str: ... + def writer(self) -> str: + """Name of the writer that wrote this file. + If the writer is unknown then its Writer ID + (a number) is returned""" @property - def writer_version(self) -> str: ... + def writer_version(self) -> str: + """Version of the writer""" @property - def row_index_stride(self) -> int: ... + def row_index_stride(self) -> int: + """Number of rows per an entry in the row index or 0 + if there is no row index""" @property - def nstripe_statistics(self) -> int: ... + def nstripe_statistics(self) -> int: + """Number of stripe statistics""" @property - def content_length(self) -> int: ... + def content_length(self) -> int: + """Length of the data stripes in the file in bytes""" @property - def stripe_statistics_length(self) -> int: ... + def stripe_statistics_length(self) -> int: + """The number of compressed bytes in the file stripe statistics""" @property - def file_footer_length(self) -> int: ... + def file_footer_length(self) -> int: + """The number of compressed bytes in the file footer""" @property - def file_postscript_length(self) -> int: ... + def file_postscript_length(self) -> int: + """The number of bytes in the file postscript""" @property - def file_length(self) -> int: ... - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... - def read(self, columns: list[str] | None = None) -> Table: ... + def file_length(self) -> int: + """The number of bytes in the file""" + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.RecordBatch + Content of the stripe as a RecordBatch. + """ + def read(self, columns: list[str] | None = None) -> Table: + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. Output always follows the + ordering of the file and not the `columns` list. + + Returns + ------- + pyarrow.Table + Content of the file as a Table. + """ class ORCWriter: + """ + Writer interface for a single ORC file + + Parameters + ---------- + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. 
+ `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ + writer: _orc.ORCWriter is_open: bool def __init__( @@ -73,14 +176,48 @@ class ORCWriter: ): ... def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... - def write(self, table: Table) -> None: ... - def close(self) -> None: ... + def write(self, table: Table) -> None: + """ + Write the table into an ORC file. The schema of the table must + be equal to the schema used when opening the ORC file. + + Parameters + ---------- + table : pyarrow.Table + The table to be written into the ORC file + """ + def close(self) -> None: + """ + Close the ORC file + """ def read_table( source: StrPath | NativeFile | IO, columns: list[str] | None = None, filesystem: SupportedFileSystem | None = None, -) -> Table: ... +) -> Table: + """ + Read a Table from an ORC file. + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name. For file-like objects, + only read a single file. Use pyarrow.BufferReader to read a file + contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. Output always follows the ordering of the file and + not the `columns` list. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + """ + def write_table( table: Table, where: StrPath | NativeFile | IO, @@ -96,4 +233,47 @@ def write_table( dictionary_key_size_threshold: float = 0.0, bloom_filter_columns: list[int] | None = None, bloom_filter_fpp: float = 0.05, -) -> None: ... +) -> None: + """ + Write a table into an ORC file. + + Parameters + ---------- + table : pyarrow.lib.Table + The table to be written into the ORC file + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. 
+ file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 8d665416066..56b2c8447d9 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -60,13 +60,127 @@ __all__ = ( "filters_to_expression", ) -def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: + """ + Check if filters are well-formed and convert to an ``Expression``. + + Parameters + ---------- + filters : List[Tuple] or List[List[Tuple]] + + Notes + ----- + See internal ``pyarrow._DNF_filter_doc`` attribute for more details. + + Examples + -------- + + >>> filters_to_expression([("foo", "==", "bar")]) + + + Returns + ------- + pyarrow.compute.Expression + An Expression representing the filters + """ + @deprecated("use filters_to_expression") def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... _Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] class ParquetFile: + """ + Reader interface for a single Parquet file. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarrow.BufferReader. + metadata : FileMetaData, default None + Use existing metadata object, rather than reading from file. + common_metadata : FileMetaData, default None + Will be used in reads for pandas schema metadata if not found in the + main file's metadata, no other uses at the moment. + read_dictionary : list + List of column names to read directly as DictionaryArray. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + pre_buffer : bool, default False + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). 
If True, Arrow will use a + background I/O thread pool. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties, default None + File decryption properties for Parquet Modular Encryption. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Examples + -------- + + Generate an example PyArrow Table and write it to Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Create a ``ParquetFile`` object from the Parquet file: + + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the data: + + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Create a ParquetFile object with "animal" column as DictionaryArray: + + >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [ -- dictionary: + ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: + [0,1,2,3,4,5]] + """ + reader: ParquetReader common_metadata: FileMetaData @@ -90,13 +204,63 @@ class ParquetFile: def __enter__(self) -> Self: ... def __exit__(self, *args, **kwargs) -> None: ... @property - def metadata(self) -> FileMetaData: ... + def metadata(self) -> FileMetaData: + """ + Return the Parquet metadata. + """ @property - def schema(self) -> ParquetSchema: ... + def schema(self) -> ParquetSchema: + """ + Return the Parquet schema, unconverted to Arrow types + """ @property - def schema_arrow(self) -> Schema: ... + def schema_arrow(self) -> Schema: + """ + Return the inferred Arrow schema, converted from the whole Parquet + file's schema + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the Arrow schema: + + >>> parquet_file.schema_arrow + n_legs: int64 + animal: string + """ @property - def num_row_groups(self) -> int: ... 
+ def num_row_groups(self) -> int: + """ + Return the number of row groups of the Parquet file. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.num_row_groups + 1 + """ def close(self, force: bool = False) -> None: ... @property def closed(self) -> bool: ... @@ -106,14 +270,100 @@ class ParquetFile: columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: ... + ) -> Table: + """ + Read a single row group from a Parquet file. + + Parameters + ---------- + i : int + Index of the individual row group that we want to read. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row group as a table (of columns) + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_group(0) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ def read_row_groups( self, row_groups: list, columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: ... + ) -> Table: + """ + Read a multiple row groups from a Parquet file. + + Parameters + ---------- + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row groups as a table (of columns). + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_groups([0, 0]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] + """ def iter_batches( self, batch_size: int = 65536, @@ -121,16 +371,375 @@ class ParquetFile: columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Iterator[RecordBatch]: ... + ) -> Iterator[RecordBatch]: + """ + Read streaming batches from a Parquet file. + + Parameters + ---------- + batch_size : int, default 64K + Maximum number of records to yield per batch. Batches may be + smaller if there aren't enough rows in the file. + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : boolean, default True + Perform multi-threaded column reads. + use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Yields + ------ + pyarrow.RecordBatch + Contents of each batch as a record batch + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + >>> for i in parquet_file.iter_batches(): + ... print("RecordBatch") + ... print(i.to_pandas()) + RecordBatch + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ def read( self, columns: list | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: ... - def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: ... + ) -> Table: + """ + Read a Table from Parquet format. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read a Table: + + >>> parquet_file.read(columns=["animal"]) + pyarrow.Table + animal: string + ---- + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: + """ + Read contents of file for the given columns and batch size. 
+ + Notes + ----- + This function's primary purpose is benchmarking. + The scan is executed on a single thread. + + Parameters + ---------- + columns : list of integers, default None + Select columns to read, if None scan all columns. + batch_size : int, default 64K + Number of rows to read at a time internally. + + Returns + ------- + num_rows : int + Number of rows in file + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.scan_contents() + 6 + """ class ParquetWriter: + """ + Class for incrementally building a Parquet file for Arrow tables. + + Parameters + ---------- + where : path or file-like object + schema : pyarrow.Schema + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. + If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. 
+ data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_stream are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files `_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here `_, defaults to ``True``. + For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. 
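A small sketch of how a few of these writer options combine in practice; the path, the per-column codecs, and the row group size are illustrative choices rather than defaults:

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # ParquetWriter is also a context manager, so close() is called on exit.
    with pq.ParquetWriter(
        "example_options.parquet",                     # assumed scratch path
        table.schema,
        compression={"id": "zstd", "name": "snappy"},  # per-column codecs
        write_statistics=True,
    ) as writer:
        writer.write_table(table, row_group_size=2)    # two row groups: 2 + 1 rows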
+ dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + writer_engine_version : unused + **options : dict + If options contains a key `metadata_collector` then the + corresponding value is assumed to be a list (or any object with + `.append` method) that will be filled with the file metadata instance + of the written file. + + Examples + -------- + Generate an example PyArrow Table and RecordBatch: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.record_batch( + ... [ + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... ], + ... names=["n_legs", "animal"], + ... 
) + + create a ParquetWriter object: + + >>> import pyarrow.parquet as pq + >>> writer = pq.ParquetWriter("example.parquet", table.schema) + + and write the Table into the Parquet file: + + >>> writer.write_table(table) + >>> writer.close() + + >>> pq.read_table("example.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + create a ParquetWriter object for the RecordBatch: + + >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) + + and write the RecordBatch into the Parquet file: + + >>> writer2.write_batch(batch) + >>> writer2.close() + + >>> pq.read_table("example2.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + flavor: str schema_changed: bool schema: ParquetSchema @@ -170,13 +779,210 @@ class ParquetWriter: def __exit__(self, *args, **kwargs) -> Literal[False]: ... def write( self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None - ) -> None: ... - def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: ... - def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... - def close(self) -> None: ... - def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... + ) -> None: + """ + Write RecordBatch or Table to the Parquet file. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the input + table or batch length and 1024 * 1024. + """ + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: + """ + Write RecordBatch to the Parquet file. + + Parameters + ---------- + batch : RecordBatch + row_group_size : int, default None + Maximum number of rows in written row group. If None, the + row group size will be the minimum of the RecordBatch + size and 1024 * 1024. If set larger than 64Mi then 64Mi + will be used instead. + """ + def write_table(self, table: Table, row_group_size: int | None = None) -> None: + """ + Write Table to the Parquet file. + + Parameters + ---------- + table : Table + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the Table size + and 1024 * 1024. If set larger than 64Mi then 64Mi will + be used instead. + + """ + def close(self) -> None: + """ + Close the connection to the Parquet file. + """ + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: + """ + Add key-value metadata to the file. + This will overwrite any existing metadata with the same key. + + Parameters + ---------- + key_value_metadata : dict + Keys and values must be string-like / coercible to bytes. + """ class ParquetDataset: + """ + Encapsulates details of reading a complete Parquet dataset possibly + consisting of multiple files and partitions in subdirectories. + + Parameters + ---------- + path_or_paths : str or List[str] + A directory name, single file name, or list of file names. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. 
+ schema : pyarrow.parquet.Schema + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a + background I/O thread pool. 
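As a hedged illustration of the two filter notations described above, the following writes the same predicate as an ``Expression`` and in DNF form; ``dataset_v2`` refers to the partitioned dataset created in the example below, and the column names are assumptions taken from that example:

    import pyarrow.compute as pc
    import pyarrow.parquet as pq

    # Expression form: note the comparison uses ==, not assignment.
    expr = (pc.field("n_legs") == 4) & pc.field("year").isin([2021, 2022])

    # Equivalent DNF form: one inner list is one conjunction (AND).
    dnf = [[("n_legs", "==", 4), ("year", "in", [2021, 2022])]]

    t1 = pq.ParquetDataset("dataset_v2/", filters=expr).read()
    t2 = pq.ParquetDataset("dataset_v2/", filters=dnf).read()
    assert t1.num_rows == t2.num_rows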
If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. Set to False if you want to prioritize minimal memory usage + over maximum speed. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular resolution + (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 + timestamps will be inferred as timestamps in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + + Examples + -------- + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) + + create a ParquetDataset object from the dataset source: + + >>> dataset = pq.ParquetDataset("dataset_v2/") + + and read the data: + + >>> dataset.read().to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + create a ParquetDataset object with filter: + + >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) + >>> dataset.read().to_pandas() + n_legs animal year + 0 4 Dog 2021 + 1 4 Horse 2022 + """ def __init__( self, path_or_paths: SingleOrList[str] @@ -201,22 +1007,184 @@ class ParquetDataset: ): ... def equals(self, other: ParquetDataset) -> bool: ... @property - def schema(self) -> Schema: ... + def schema(self) -> Schema: + """ + Schema of the Dataset. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_schema/") + + Read the schema: + + >>> dataset.schema + n_legs: int64 + animal: string + year: dictionary + """ def read( self, columns: list[str] | None = None, use_threads: bool = True, use_pandas_metadata: bool = False, - ) -> Table: ... - def read_pandas(self, **kwargs) -> Table: ... + ) -> Table: + """ + Read (multiple) Parquet files as a single pyarrow.Table. + + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. The partition fields + are not automatically included. + use_threads : bool, default True + Perform multi-threaded column reads. 
+ use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_read/") + + Read the dataset: + + >>> dataset.read(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[5],[2],[4,100],[2,4]] + """ + def read_pandas(self, **kwargs) -> Table: + """ + Read dataset including pandas metadata, if any. Other arguments passed + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` + + Examples + -------- + Generate an example parquet file: + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "table_V2.parquet") + >>> dataset = pq.ParquetDataset("table_V2.parquet") + + Read the dataset with pandas metadata: + + >>> dataset.read_pandas(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,2,4,4,5,100]] + + >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} + """ @property - def fragments(self) -> list[ParquetFileFragment]: ... + def fragments(self) -> list[ParquetFileFragment]: + """ + A list of the Dataset source fragments or pieces with absolute + file paths. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") + + List the fragments: + + >>> dataset.fragments + [ list[str]: ... + def files(self) -> list[str]: + """ + A list of absolute Parquet file paths in the Dataset source. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_files/") + + List the files: + + >>> dataset.files + ['dataset_v2_files/year=2019/...-0.parquet', ... + """ @property - def filesystem(self) -> FileSystem: ... + def filesystem(self) -> FileSystem: + """ + The filesystem type of the Dataset source. 
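A brief sketch tying the dataset source properties together (``files``, ``filesystem`` and the ``partitioning`` documented next); the root path is an assumed scratch directory:

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"year": [2020, 2021], "n_legs": [2, 4]})
    pq.write_to_dataset(table, root_path="dataset_props", partition_cols=["year"])

    dataset = pq.ParquetDataset("dataset_props/")
    print(dataset.files)         # paths of the individual parquet data files
    print(dataset.filesystem)    # e.g. a LocalFileSystem instance
    print(dataset.partitioning)  # the discovered hive partitioning on "year"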
+ """ @property - def partitioning(self) -> Partitioning: ... + def partitioning(self) -> Partitioning: + """ + The partitioning of the Dataset source, if discovered. + """ def read_table( source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], @@ -238,10 +1206,348 @@ def read_table( thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, page_checksum_verification: bool = False, -) -> Table: ... +) -> Table: + """ + Read a Table from Parquet format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. 
The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns) + + + Examples + -------- + + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) + + Read the data: + + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + + Read only a subset of columns: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] + + Read a subset of columns and read one column as DictionaryArray: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [ -- dictionary: + ["Brittle stars"] -- indices: + [0], -- dictionary: + ["Flamingo"] -- indices: + [0], -- dictionary: + ["Dog","Centipede"] -- indices: + [0,1], -- dictionary: + ["Parrot","Horse"] -- indices: + [0,1]] + + Read the table with filter: + + >>> pq.read_table( + ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] + ... ).to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + + Read data from a single Parquet file: + + >>> pq.write_table(table, "example.parquet") + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + """ + def read_pandas( source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs -) -> Table: ... +) -> Table: + """ + + Read a Table from Parquet format, also reading DataFrame + index values if known in the file metadata + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. 
The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + **kwargs + additional options for :func:`read_table` + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. 
The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns + """ + def write_table( table: Table, where: str | Path | NativeFile | IO, @@ -270,7 +1576,222 @@ def write_table( sorting_columns: Sequence[SortingColumn] | None = None, store_decimal_as_integer: bool = False, **kwargs, -) -> None: ... +) -> None: + """ + + Write a Table to Parquet format. + + Parameters + ---------- + table : pyarrow.Table + where : string or pyarrow.NativeFile + row_group_size : int + Maximum number of rows in each written row group. If None, the + row group size will be the minimum of the Table size and + 1024 * 1024. + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. + If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. 
Passing ``allow_truncated_timestamp=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_stream are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files `_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here `_, defaults to ``True``. + For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. 
If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + + **kwargs : optional + Additional options for ParquetWriter + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write the Table into Parquet file: + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Defining row group size for the Parquet file: + + >>> pq.write_table(table, "example.parquet", row_group_size=3) + + Defining row group compression (default is Snappy): + + >>> pq.write_table(table, "example.parquet", compression="none") + + Defining row group compression and encoding per-column: + + >>> pq.write_table( + ... table, + ... "example.parquet", + ... compression={"n_legs": "snappy", "animal": "gzip"}, + ... use_dictionary=["n_legs", "animal"], + ... 
) + + Defining column encoding per-column: + + >>> pq.write_table( + ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False + ... ) + """ + def write_to_dataset( table: Table, root_path: str | Path, @@ -284,23 +1805,257 @@ def write_to_dataset( existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] | None = None, **kwargs, -) -> None: ... +) -> None: + """ + Wrapper around dataset.write_dataset for writing a Table to + Parquet format by partitions. + For each combination of partition columns and values, + a subdirectories are created in the following + manner: + + root_dir/ + group1=value1 + group2=value1 + .parquet + group2=value2 + .parquet + group1=valueN + group2=value1 + .parquet + group2=valueN + .parquet + + Parameters + ---------- + table : pyarrow.Table + root_path : str, pathlib.Path + The root directory of the dataset. + partition_cols : list, + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : Schema, optional + This Schema of the dataset. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the + ``pyarrow.dataset.partitioning()`` function or a list of field names. + When providing a list of field names, you can use + ``partitioning_flavor`` to drive which partitioning type should be + used. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to "guid-{i}.parquet". + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used determined by the number of available CPU cores. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute. + + The path attribute will be a string containing the path to + the created file. + + The metadata attribute will be the parquet metadata of the file. + This metadata will have the file path attribute set and can be used + to build a _metadata file. The metadata attribute will be None if + the format is not parquet. + + Example visitor which simple collects the filenames created:: + + visited_paths = [] + + def file_visitor(written_file): + visited_paths.append(written_file.path) + + existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' + Controls how the dataset will handle data that already exists in + the destination. The default behaviour is 'overwrite_or_ignore'. + + 'overwrite_or_ignore' will ignore any existing data and will + overwrite files with the same name as an output file. Other + existing files will be ignored. This behavior, in combination + with a unique basename_template for each write, will allow for + an append workflow. + + 'error' will raise an error if any data exists in the destination. + + 'delete_matching' is useful when you are writing a partitioned + dataset. The first time each partition directory is encountered + the entire directory will be deleted. This allows you to overwrite + old partitions completely. 
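        As a rough sketch of the 'delete_matching' behaviour described above (the
        root path and column values below are invented for illustration, not taken
        from the examples further down):

        .. code-block:: python

            import pyarrow as pa
            import pyarrow.parquet as pq

            table = pa.table({"year": [2021, 2022], "n_legs": [4, 100]})
            # The first write lays out year=2021/ and year=2022/ partition dirs.
            pq.write_to_dataset(table, root_path="animals", partition_cols=["year"])

            # Rewriting with 'delete_matching' clears each partition directory it
            # touches before writing, so the touched partitions are replaced
            # rather than appended to.
            pq.write_to_dataset(
                pa.table({"year": [2022], "n_legs": [5]}),
                root_path="animals",
                partition_cols=["year"],
                existing_data_behavior="delete_matching",
            )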
+ **kwargs : dict, + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. + Using `metadata_collector` in kwargs allows one to collect the + file metadata instances of dataset pieces. The file paths in the + ColumnChunkMetaData will be set relative to `root_path`. + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write it to a partitioned dataset: + + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) + >>> pq.ParquetDataset("dataset_name_3").files + ['dataset_name_3/year=2019/...-0.parquet', ... + + Write a single Parquet file into the root folder: + + >>> pq.write_to_dataset(table, root_path="dataset_name_4") + >>> pq.ParquetDataset("dataset_name_4/").files + ['dataset_name_4/...-0.parquet'] + """ + def write_metadata( schema: Schema, where: str | NativeFile, metadata_collector: list[FileMetaData] | None = None, filesystem: SupportedFileSystem | None = None, **kwargs, -) -> None: ... +) -> None: + """ + Write metadata-only Parquet file from schema. This can be used with + `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar + files. + + Parameters + ---------- + schema : pyarrow.Schema + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + **kwargs : dict, + Additional kwargs for ParquetWriter class. See docstring for + `ParquetWriter` for more information. + + Examples + -------- + Generate example data: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Write a dataset and collect metadata information. + + >>> metadata_collector = [] + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) + + Write the `_common_metadata` parquet file without row groups statistics. + + >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") + + Write the `_metadata` parquet file with row groups statistics. + + >>> pq.write_metadata( + ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector + ... ) + """ + def read_metadata( where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, -) -> FileMetaData: ... +) -> FileMetaData: + """ + Read FileMetaData from footer of a single Parquet file. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. 
+ filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + metadata : FileMetaData + The metadata of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_metadata("example.parquet") + + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + """ + def read_schema( where: str | Path | IO | NativeFile, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, filesystem: SupportedFileSystem | None = None, -) -> Schema: ... +) -> Schema: + """ + Read effective Arrow schema from Parquet file metadata. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + schema : pyarrow.Schema + The schema of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_schema("example.parquet") + n_legs: int64 + animal: string + """ diff --git a/pyproject.toml b/pyproject.toml index e88896cffcc..4ec173558b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ extend-select = [ ] ignore = [ "PYI011", # typed-argument-default-in-stub + "PYI021", # docstring-in-stub "PYI015", # assignment-default-in-stub "PYI063", # pep484-style-positional-only-parameter "N818", # error-suffix-on-exception-name From 42090e0a44c76a14463f67cccef2271ab0a42ecf Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 18 Jun 2025 10:27:23 +0800 Subject: [PATCH 206/231] release 20.0.0.20250618 (#243) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 8413cfd18f8..def9519d0a6 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1281,8 +1281,8 @@ packages: requires_python: '>=3.9' - pypi: ./ name: pyarrow-stubs - version: 20.0.0b0 - sha256: 4ae4d2484afd306b8d131ce4bed1faa48493d8ace8b43b731b811b4e4e6bd2e2 + version: 20.0.0.20250618 + sha256: d932d63160504f6b843d822a80464b720f3d39660e8167d1e074bc4a58f71f9d requires_dist: - pyarrow>=20 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 4ec173558b6..0756d8c4121 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20.0.0b0" +version = "20.0.0.20250618" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From d7041d39688c79cd26d85fd887e4f171d74d2d4f Mon Sep 17 00:00:00 2001 From: Tom McTiernan Date: Fri, 27 Jun 2025 04:10:12 +0100 Subject: [PATCH 207/231] fix: make 
ParquetFileFormat constructor args optional (#244) --- pyarrow-stubs/_dataset_parquet.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index 2e9edda57f7..cbcc17235f1 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -36,8 +36,8 @@ class ParquetFileFormat(FileFormat): """ def __init__( self, - read_options: ParquetReadOptions, - default_fragment_scan_options: ParquetFragmentScanOptions, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, **kwargs, ) -> None: ... @property From 8d0ebd8e411362b029347b46b726b14f1f80c533 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 27 Jun 2025 11:10:29 +0800 Subject: [PATCH 208/231] fix: Field.remove_metadata should return Self (#246) --- pyarrow-stubs/__lib_pxi/types.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi index 414e9ff71c4..7fe6c36e332 100644 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ b/pyarrow-stubs/__lib_pxi/types.pyi @@ -1674,7 +1674,7 @@ class Field(_Weakrefable, Generic[_DataTypeT]): >>> field_new.metadata {b'key': b'Something important'} """ - def remove_metadata(self) -> None: + def remove_metadata(self) -> Self: """ Create new field without metadata, if any From dd933eb7166ff9ebe8cb6672aef1602e2061b480 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:10:42 +0800 Subject: [PATCH 209/231] [pre-commit.ci] pre-commit autoupdate (#245) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.13 → v0.12.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.13...v0.12.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f0b1130198d..c8bc4aca5ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.13 + rev: v0.12.0 hooks: - id: ruff args: [--fix] From d4fbd25850207531ff26902d8d4bc134ee5ccd79 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Fri, 27 Jun 2025 11:14:35 +0800 Subject: [PATCH 210/231] release 20.0.0.20250627 (#247) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index def9519d0a6..f04a6778c28 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1281,8 +1281,8 @@ packages: requires_python: '>=3.9' - pypi: ./ name: pyarrow-stubs - version: 20.0.0.20250618 - sha256: d932d63160504f6b843d822a80464b720f3d39660e8167d1e074bc4a58f71f9d + version: 20.0.0.20250627 + sha256: 4bd75da39a925aa9803fd71a32e306874c27960a69a2b8ccf5681835f916b2af requires_dist: - pyarrow>=20 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 0756d8c4121..da56b5e5600 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20.0.0.20250618" +version = "20.0.0.20250627" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 
b8ecaea009fe2a77235a74410979cc4c0a00f417 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 15 Jul 2025 18:01:51 +0800 Subject: [PATCH 211/231] fix: chunked_array with type should be specified (#250) --- pyarrow-stubs/__lib_pxi/table.pyi | 83 +++++++++++++++---------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 97a6ede39d9..1ccbf67dab8 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -1519,185 +1519,180 @@ def chunked_array( type: None = None, ) -> ChunkedArray[scalar.ListScalar[Any]]: ... @overload -def chunked_array( - values: Iterable[Array[_ScalarT]], - type: None = None, -) -> ChunkedArray[_ScalarT]: ... -@overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: _DataTypeT, -) -> ChunkedArray[Scalar[_DataTypeT]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["null"], + type: Literal["null"] | types.NullType, ) -> ChunkedArray[scalar.NullScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["bool", "boolean"], + type: Literal["bool", "boolean"] | types.BoolType, ) -> ChunkedArray[scalar.BooleanScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i1", "int8"], + type: Literal["i1", "int8"] | types.Int8Type, ) -> ChunkedArray[scalar.Int8Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i2", "int16"], + type: Literal["i2", "int16"] | types.Int16Type, ) -> ChunkedArray[scalar.Int16Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i4", "int32"], + type: Literal["i4", "int32"] | types.Int32Type, ) -> ChunkedArray[scalar.Int32Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i8", "int64"], + type: Literal["i8", "int64"] | types.Int64Type, ) -> ChunkedArray[scalar.Int64Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u1", "uint8"], + type: Literal["u1", "uint8"] | types.UInt8Type, ) -> ChunkedArray[scalar.UInt8Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u2", "uint16"], + type: Literal["u2", "uint16"] | types.UInt16Type, ) -> ChunkedArray[scalar.UInt16Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u4", "uint32"], + type: Literal["u4", "uint32"] | types.Uint32Type, ) -> ChunkedArray[scalar.UInt32Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u8", "uint64"], + type: Literal["u8", "uint64"] | types.UInt64Type, ) -> ChunkedArray[scalar.UInt64Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f2", "halffloat", "float16"], + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, ) -> ChunkedArray[scalar.HalfFloatScalar]: ... 
@overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f4", "float", "float32"], + type: Literal["f4", "float", "float32"] | types.Float32Type, ) -> ChunkedArray[scalar.FloatScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f8", "double", "float64"], + type: Literal["f8", "double", "float64"] | types.Float64Type, ) -> ChunkedArray[scalar.DoubleScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string", "str", "utf8"], + type: Literal["string", "str", "utf8"] | types.StringType, ) -> ChunkedArray[scalar.StringScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary"], + type: Literal["binary"] | types.BinaryType, ) -> ChunkedArray[scalar.BinaryScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_string", "large_str", "large_utf8"], + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, ) -> ChunkedArray[scalar.LargeStringScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_binary"], + type: Literal["large_binary"] | types.LargeBinaryType, ) -> ChunkedArray[scalar.LargeBinaryScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary_view"], + type: Literal["binary_view"] | types.BinaryViewType, ) -> ChunkedArray[scalar.BinaryViewScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string_view"], + type: Literal["string_view"] | types.StringViewType, ) -> ChunkedArray[scalar.StringViewScalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date32", "date32[day]"], + type: Literal["date32", "date32[day]"] | types.Date32Type, ) -> ChunkedArray[scalar.Date32Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date64", "date64[ms]"], + type: Literal["date64", "date64[ms]"] | types.Date64Type, ) -> ChunkedArray[scalar.Date64Scalar]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[s]"], + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], ) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[ms]"], + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], ) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[us]"], + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], ) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[ns]"], + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], ) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... 
@overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[s]"], + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], ) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ms]"], + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], ) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[us]"], + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], ) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ns]"], + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], ) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[s]"], + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], ) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ms]"], + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], ) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[us]"], + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], ) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ns]"], + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], ) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... @overload def chunked_array( values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, - type: Literal["month_day_nano_interval"], + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, ) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[Array[_ScalarT]], + type: None = None, +) -> ChunkedArray[_ScalarT]: ... 
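# A rough sketch of what the widened overloads above accept (the values are
# invented for illustration); a type checker should infer the same scalar
# parameter from either the string alias or the DataType instance:
#
#   pa.chunked_array([[1, 2], [3]], type="int64")     # ChunkedArray[Int64Scalar]
#   pa.chunked_array([[1, 2], [3]], type=pa.int64())  # ChunkedArray[Int64Scalar]
#
# and, per the final overload, passing already-built arrays with no ``type``
# keeps their scalar parameter:
#
#   pa.chunked_array([pa.array(["a", "b"])])          # ChunkedArray[StringScalar]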
def chunked_array(value, type=None): """ Construct chunked array from list of array-like objects From 4f61d867fc705632486462c072aebe6c69271882 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:02:07 +0800 Subject: [PATCH 212/231] [pre-commit.ci] pre-commit autoupdate (#248) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.0 → v0.12.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.0...v0.12.3) - [github.com/RobertCraigie/pyright-python: v1.1.402 → v1.1.403](https://github.com/RobertCraigie/pyright-python/compare/v1.1.402...v1.1.403) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8bc4aca5ec..fa58a732dfd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,13 +21,13 @@ repos: - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.0 + rev: v0.12.3 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.402 + rev: v1.1.403 hooks: - id: pyright From 6e23b7eedf23becd7cb7baa6732b10da1427ee2c Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Tue, 15 Jul 2025 18:04:31 +0800 Subject: [PATCH 213/231] release 20.0.0.20250715 (#251) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index f04a6778c28..d4db2971501 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1281,8 +1281,8 @@ packages: requires_python: '>=3.9' - pypi: ./ name: pyarrow-stubs - version: 20.0.0.20250627 - sha256: 4bd75da39a925aa9803fd71a32e306874c27960a69a2b8ccf5681835f916b2af + version: 20.0.0.20250715 + sha256: 61d7f3aca105acaa3d003ccae176470daa6fe9fd13f84aa99381aaa9762df3f1 requires_dist: - pyarrow>=20 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index da56b5e5600..284debd7251 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20.0.0.20250627" +version = "20.0.0.20250715" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 1830b4a292700ea61a8bc07eeecd87b3ea478938 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 16 Jul 2025 10:25:06 +0800 Subject: [PATCH 214/231] fix: The type parameter of array should be covariant (#253) --- pyarrow-stubs/__lib_pxi/array.pyi | 9 +++++---- pyarrow-stubs/__lib_pxi/scalar.pyi | 5 +++-- pyarrow-stubs/__lib_pxi/table.pyi | 15 ++++++++------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi index a6152610241..ec1cda30a88 100644 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ b/pyarrow-stubs/__lib_pxi/array.pyi @@ -1656,9 +1656,10 @@ class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): """ _CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) _ScalarT = TypeVar("_ScalarT", bound=Scalar) -class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): """ The base class for all Arrow arrays. 
""" @@ -1735,7 +1736,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): ------- view : Array """ - def sum(self, **kwargs) -> _ScalarT: + def sum(self, **kwargs) -> _Scalar_co: """ Sum the values in a numerical array. @@ -1906,7 +1907,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): only be counted once. """ def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_ScalarT]: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... def to_string( self, *, @@ -1987,7 +1988,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_ScalarT]): A new array with nulls replaced by the given value. """ @overload - def __getitem__(self, key: int) -> _ScalarT: ... + def __getitem__(self, key: int) -> _Scalar_co: ... @overload def __getitem__(self, key: slice) -> Self: ... def __getitem__(self, key): diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi index 38d13679dec..81ab5012067 100644 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ b/pyarrow-stubs/__lib_pxi/scalar.pyi @@ -32,13 +32,14 @@ from .types import ( _AsPyTypeK = TypeVar("_AsPyTypeK") _AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) -class Scalar(_Weakrefable, Generic[_DataTypeT]): +class Scalar(_Weakrefable, Generic[_DataType_co]): """ The base class for scalars. """ @property - def type(self) -> _DataTypeT: + def type(self) -> _DataType_co: """ Data type of the Scalar object. """ diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi index 1ccbf67dab8..ad9d0392137 100644 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ b/pyarrow-stubs/__lib_pxi/table.pyi @@ -61,6 +61,7 @@ from .tensor import Tensor from .types import _AsPyType, _BasicDataType, _DataTypeT _ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) _Aggregation: TypeAlias = Literal[ "all", @@ -118,7 +119,7 @@ NullarySelector: TypeAlias = tuple[()] NarySelector: TypeAlias = list[str] | tuple[str, ...] ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector -class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): """ An array-like composed from a (possibly empty) collection of pyarrow.Arrays @@ -295,7 +296,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): @overload def __getitem__(self, key: slice) -> Self: ... @overload - def __getitem__(self, key: int) -> _ScalarT: ... + def __getitem__(self, key: int) -> _Scalar_co: ... def __getitem__(self, key): """ Slice or return value at given index @@ -630,7 +631,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): >>> n_legs.type DataType(int64) """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_ScalarT]: + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: """ Flatten this ChunkedArray into a single non-chunked array. @@ -672,7 +673,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): 100 ] """ - def unique(self) -> ChunkedArray[_ScalarT]: + def unique(self) -> ChunkedArray[_Scalar_co]: """ Compute distinct elements in array @@ -1128,7 +1129,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): >>> n_legs.num_chunks 2 """ - def chunk(self, i: int) -> ChunkedArray[_ScalarT]: + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: """ Select a chunk by its index. 
@@ -1153,7 +1154,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): ] """ @property - def chunks(self) -> list[Array[_ScalarT]]: + def chunks(self) -> list[Array[_Scalar_co]]: """ Convert to a list of single-chunked arrays. @@ -1387,7 +1388,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_ScalarT]): 1 """ - def __iter__(self) -> Iterator[_ScalarT]: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... def to_pylist( self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], *, From 6bedee748bc74feb8513b24bf43d64b24c7fddc8 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Wed, 16 Jul 2025 10:27:43 +0800 Subject: [PATCH 215/231] release 20.0.0.20250716 (#254) --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index d4db2971501..033527552a9 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1281,8 +1281,8 @@ packages: requires_python: '>=3.9' - pypi: ./ name: pyarrow-stubs - version: 20.0.0.20250715 - sha256: 61d7f3aca105acaa3d003ccae176470daa6fe9fd13f84aa99381aaa9762df3f1 + version: 20.0.0.20250716 + sha256: a69c85a5072346ec9e350e151f522b6b522b1083b6e85c5adb3fb51975ac8c56 requires_dist: - pyarrow>=20 requires_python: '>=3.9,<4' diff --git a/pyproject.toml b/pyproject.toml index 284debd7251..db614454ac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ [project] name = "pyarrow-stubs" -version = "20.0.0.20250715" +version = "20.0.0.20250716" description = "Type annotations for pyarrow" authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] license = "BSD-2-Clause" From 048a7e80c7cf7c59656a0e9768d6acf3791a9aa0 Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sat, 19 Jul 2025 12:07:54 +0200 Subject: [PATCH 216/231] Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 --- python/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/py.typed diff --git a/python/py.typed b/python/py.typed new file mode 100644 index 00000000000..e69de29bb2d From fa37f9871888f3abfa2c154607e282e34265f2ab Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sat, 19 Jul 2025 12:15:07 +0200 Subject: [PATCH 217/231] Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu --- python/stubs/__init__.pyi | 656 ++ python/stubs/__lib_pxi/__init__.pyi | 0 python/stubs/__lib_pxi/array.pyi | 4274 ++++++++++ python/stubs/__lib_pxi/benchmark.pyi | 1 + python/stubs/__lib_pxi/builder.pyi | 89 + python/stubs/__lib_pxi/compat.pyi | 5 + python/stubs/__lib_pxi/config.pyi | 41 + python/stubs/__lib_pxi/device.pyi | 88 + python/stubs/__lib_pxi/error.pyi | 53 + python/stubs/__lib_pxi/io.pyi | 1474 ++++ python/stubs/__lib_pxi/ipc.pyi | 705 ++ python/stubs/__lib_pxi/memory.pyi | 174 + python/stubs/__lib_pxi/pandas_shim.pyi | 51 + python/stubs/__lib_pxi/scalar.pyi | 1017 +++ python/stubs/__lib_pxi/table.pyi | 5609 +++++++++++++ python/stubs/__lib_pxi/tensor.pyi | 688 ++ python/stubs/__lib_pxi/types.pyi | 4413 ++++++++++ python/stubs/_azurefs.pyi | 74 + python/stubs/_compute.pyi | 1721 ++++ python/stubs/_csv.pyi | 641 ++ python/stubs/_cuda.pyi | 556 ++ python/stubs/_dataset.pyi | 2299 ++++++ python/stubs/_dataset_orc.pyi | 6 + python/stubs/_dataset_parquet.pyi | 314 + python/stubs/_dataset_parquet_encryption.pyi | 85 + python/stubs/_feather.pyi | 29 + python/stubs/_flight.pyi | 1380 ++++ python/stubs/_fs.pyi | 1005 +++ python/stubs/_gcsfs.pyi | 83 + python/stubs/_hdfs.pyi | 75 + python/stubs/_json.pyi | 169 + python/stubs/_orc.pyi | 56 + python/stubs/_parquet.pyi | 445 + python/stubs/_parquet_encryption.pyi | 67 + python/stubs/_s3fs.pyi | 74 + python/stubs/_stubs_typing.pyi | 80 + python/stubs/_substrait.pyi | 39 + python/stubs/acero.pyi | 85 + python/stubs/benchmark.pyi | 3 + python/stubs/cffi.pyi | 4 + python/stubs/compute.pyi | 7779 ++++++++++++++++++ python/stubs/csv.pyi | 27 + python/stubs/cuda.pyi | 25 + python/stubs/dataset.pyi | 229 + python/stubs/feather.pyi | 50 + python/stubs/flight.pyi | 95 + python/stubs/fs.pyi | 77 + python/stubs/gandiva.pyi | 65 + python/stubs/interchange/__init__.pyi | 0 python/stubs/interchange/buffer.pyi | 58 + python/stubs/interchange/column.pyi | 252 + python/stubs/interchange/dataframe.pyi | 102 + python/stubs/interchange/from_dataframe.pyi | 244 + python/stubs/ipc.pyi | 123 + python/stubs/json.pyi | 3 + python/stubs/lib.pyi | 106 + python/stubs/orc.pyi | 279 + python/stubs/pandas_compat.pyi | 54 + python/stubs/parquet/__init__.pyi | 1 + python/stubs/parquet/core.pyi | 2061 +++++ python/stubs/parquet/encryption.pyi | 15 + python/stubs/substrait.pyi | 21 + python/stubs/types.pyi | 194 + python/stubs/util.pyi | 27 + 64 files changed, 40515 insertions(+) create mode 100644 python/stubs/__init__.pyi create mode 100644 python/stubs/__lib_pxi/__init__.pyi create mode 100644 python/stubs/__lib_pxi/array.pyi create mode 100644 python/stubs/__lib_pxi/benchmark.pyi create mode 100644 python/stubs/__lib_pxi/builder.pyi create mode 100644 python/stubs/__lib_pxi/compat.pyi create mode 100644 python/stubs/__lib_pxi/config.pyi create mode 100644 python/stubs/__lib_pxi/device.pyi create mode 100644 python/stubs/__lib_pxi/error.pyi create mode 100644 python/stubs/__lib_pxi/io.pyi create mode 100644 python/stubs/__lib_pxi/ipc.pyi create mode 100644 python/stubs/__lib_pxi/memory.pyi create mode 100644 python/stubs/__lib_pxi/pandas_shim.pyi create mode 100644 python/stubs/__lib_pxi/scalar.pyi create mode 100644 python/stubs/__lib_pxi/table.pyi create mode 100644 python/stubs/__lib_pxi/tensor.pyi create mode 100644 python/stubs/__lib_pxi/types.pyi create mode 100644 
python/stubs/_azurefs.pyi create mode 100644 python/stubs/_compute.pyi create mode 100644 python/stubs/_csv.pyi create mode 100644 python/stubs/_cuda.pyi create mode 100644 python/stubs/_dataset.pyi create mode 100644 python/stubs/_dataset_orc.pyi create mode 100644 python/stubs/_dataset_parquet.pyi create mode 100644 python/stubs/_dataset_parquet_encryption.pyi create mode 100644 python/stubs/_feather.pyi create mode 100644 python/stubs/_flight.pyi create mode 100644 python/stubs/_fs.pyi create mode 100644 python/stubs/_gcsfs.pyi create mode 100644 python/stubs/_hdfs.pyi create mode 100644 python/stubs/_json.pyi create mode 100644 python/stubs/_orc.pyi create mode 100644 python/stubs/_parquet.pyi create mode 100644 python/stubs/_parquet_encryption.pyi create mode 100644 python/stubs/_s3fs.pyi create mode 100644 python/stubs/_stubs_typing.pyi create mode 100644 python/stubs/_substrait.pyi create mode 100644 python/stubs/acero.pyi create mode 100644 python/stubs/benchmark.pyi create mode 100644 python/stubs/cffi.pyi create mode 100644 python/stubs/compute.pyi create mode 100644 python/stubs/csv.pyi create mode 100644 python/stubs/cuda.pyi create mode 100644 python/stubs/dataset.pyi create mode 100644 python/stubs/feather.pyi create mode 100644 python/stubs/flight.pyi create mode 100644 python/stubs/fs.pyi create mode 100644 python/stubs/gandiva.pyi create mode 100644 python/stubs/interchange/__init__.pyi create mode 100644 python/stubs/interchange/buffer.pyi create mode 100644 python/stubs/interchange/column.pyi create mode 100644 python/stubs/interchange/dataframe.pyi create mode 100644 python/stubs/interchange/from_dataframe.pyi create mode 100644 python/stubs/ipc.pyi create mode 100644 python/stubs/json.pyi create mode 100644 python/stubs/lib.pyi create mode 100644 python/stubs/orc.pyi create mode 100644 python/stubs/pandas_compat.pyi create mode 100644 python/stubs/parquet/__init__.pyi create mode 100644 python/stubs/parquet/core.pyi create mode 100644 python/stubs/parquet/encryption.pyi create mode 100644 python/stubs/substrait.pyi create mode 100644 python/stubs/types.pyi create mode 100644 python/stubs/util.pyi diff --git a/python/stubs/__init__.pyi b/python/stubs/__init__.pyi new file mode 100644 index 00000000000..8a0d1e870c5 --- /dev/null +++ b/python/stubs/__init__.pyi @@ -0,0 +1,656 @@ +# ruff: noqa: F401, I001, E402 +__version__: str + +import pyarrow.lib as _lib + +_gc_enabled: bool + +from pyarrow.lib import ( + BuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... 
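# Illustrative usage of the runtime helpers re-exported above (a sketch, not part
# of the stub itself):
#
#   import pyarrow as pa
#   pa.set_cpu_count(4)        # cap Arrow's CPU thread pool
#   pa.set_io_thread_count(8)  # size the I/O thread pool
#   pa.show_versions()         # print build and version information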
+ +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + PyExtensionType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + +# Buffers, allocation +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + 
total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
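+# The build-support helpers above mirror pyarrow's public API for compiling
+# native extensions; a hedged sketch of typical use in a setup.py (the module
+# name and source file below are hypothetical placeholders):
+#
+#   import pyarrow as pa
+#   from setuptools import Extension
+#
+#   ext = Extension(
+#       "my_arrow_ext",                      # hypothetical extension name
+#       sources=["my_arrow_ext.cpp"],        # hypothetical source file
+#       include_dirs=[pa.get_include()],     # stub types this as str
+#       library_dirs=pa.get_library_dirs(),  # list[str]
+#       libraries=pa.get_libraries(),        # list[str]
+#   )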
+ +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + 
"Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "get_library_dirs", +] diff --git a/python/stubs/__lib_pxi/__init__.pyi b/python/stubs/__lib_pxi/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi new file mode 100644 index 00000000000..ec1cda30a88 --- /dev/null +++ b/python/stubs/__lib_pxi/array.pyi @@ -0,0 +1,4274 @@ +import datetime as dt +import sys + +from collections.abc import Callable +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + Any, + Generic, + Iterable, + Iterator, + Literal, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from pandas.core.dtypes.base import ExtensionDtype +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + MonthDayNano, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated + +from . 
import scalar, types +from .device import DeviceAllocationType +from .scalar import NullableCollection, Scalar +from .types import ( + DataType, + Field, + MapType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, +) + +@overload +def array( + values: NullableCollection[bool], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: NullableCollection[int], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: NullableCollection[float], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: NullableCollection[Decimal], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Decimal128Array: ... +@overload +def array( + values: NullableCollection[dict[str, Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def array( + values: NullableCollection[dt.date], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... +@overload +def array( + values: NullableCollection[dt.time], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[dt.timedelta], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[MonthDayNano], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: NullableCollection[str], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: NullableCollection[bytes], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: NullableCollection[list[Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ListArray[Any]: ... 
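+# A short sketch of what the value-based overloads above are intended to infer
+# (assuming pyarrow and a type checker; the reveal_type() notes show the
+# expected stub results, not runtime output):
+#
+#   import datetime as dt
+#   import pyarrow as pa
+#
+#   a = pa.array([1, 2, None])           # reveal_type(a) -> Int64Array
+#   b = pa.array([1.5, 2.0])             # reveal_type(b) -> DoubleArray
+#   c = pa.array(["x", "y"])             # reveal_type(c) -> StringArray
+#   d = pa.array([dt.date(2024, 1, 1)])  # reveal_type(d) -> Date32Array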
+@overload +def array( + values: NullableCollection[_ScalarT], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[_ScalarT]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> NullArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt64Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> HalfFloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> FloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ms"]]: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[Scalar[_DataTypeT]]: ... +def array(*args, **kawrgs): + """ + Create pyarrow.Array instance from a Python object. + + Parameters + ---------- + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask tasks precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type. 
+ + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + +@overload +def asarray(values: NullableCollection[bool]) -> BooleanArray: ... +@overload +def asarray(values: NullableCollection[int]) -> Int64Array: ... +@overload +def asarray(values: NullableCollection[float]) -> DoubleArray: ... +@overload +def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... +@overload +def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... +@overload +def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... +@overload +def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... +@overload +def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... +@overload +def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... +@overload +def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, +) -> NullArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, +) -> BooleanArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, +) -> Int8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, +) -> Int16Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, +) -> Int32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, +) -> Int64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> UInt8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> UInt16Array: ... 
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> UInt32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> UInt64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> HalfFloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> FloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> DoubleArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, +) -> StringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, +) -> BinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> LargeStringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, +) -> LargeBinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, +) -> BinaryViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, +) -> StringViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> Date32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> Date64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> Time32Array[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> Time32Array[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> Time64Array[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> Time64Array[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> TimestampArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> TimestampArray[Literal["ms"]]: ... 
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> TimestampArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> TimestampArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> DurationArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> DurationArray[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> DurationArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> DurationArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> MonthDayNanoIntervalArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, +) -> Array[Scalar[_DataTypeT]]: ... +def asarray(*args, **kwargs): + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + +@overload +def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... +@overload +def nulls( + size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def nulls( + size: int, type: types.BoolType, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... +@overload +def nulls( + size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def nulls( + size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def nulls( + size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def nulls( + size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def nulls( + size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def nulls( + size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def nulls( + size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... +@overload +def nulls( + size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... 
+@overload +def nulls( + size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def nulls( + size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def nulls( + size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def nulls( + size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def nulls( + size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def nulls( + size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def nulls( + size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None +) -> Time64Array[types._Time64Unit]: ... +@overload +def nulls( + size: int, + type: types.TimestampType[types._Unit, types._Tz], + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def nulls( + size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None +) -> DurationArray[types._Unit]: ... +@overload +def nulls( + size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None +) -> MonthDayNanoIntervalArray: ... +@overload +def nulls( + size: int, + type: types.BinaryType, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def nulls( + size: int, + type: types.LargeBinaryType, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.FixedSizeBinaryType, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.StringType, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def nulls( + size: int, + type: types.LargeStringType, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def nulls( + size: int, + type: types.BinaryViewType, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def nulls( + size: int, + type: types.StringViewType, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def nulls( + size: int, + type: types.LargeListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.ListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.LargeListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.FixedSizeListType[_DataTypeT, _Size], + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... +@overload +def nulls( + size: int, + type: types.ListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... 
+@overload +def nulls( + size: int, + type: types.StructType, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def nulls( + size: int, + type: types.MapType[_MapKeyT, _MapItemT], + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def nulls( + size: int, + type: types.DictionaryType[_IndexT, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.RunEndEncodedType[_RunEndType, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.UnionType, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def nulls( + size: int, + type: types.FixedShapeTensorType[types._ValueT], + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def nulls( + size: int, + type: types.Bool8Type, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def nulls( + size: int, + type: types.UuidType, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def nulls( + size: int, + type: types.JsonType, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def nulls( + size: int, + type: types.OpaqueType, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def nulls( + size: int, + type: types.ExtensionType, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def nulls(*args, **kwargs): + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + +@overload +def repeat( + value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def repeat( # type: ignore[overload-overlap] + value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def repeat( + value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int8Array: ... +@overload +def repeat( + value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def repeat( + value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def repeat( + value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def repeat( + value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def repeat( + value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def repeat( + value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def repeat( + value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... 
+@overload +def repeat( + value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... +@overload +def repeat( + value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def repeat( + value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def repeat( + value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal32Array: ... +@overload +def repeat( + value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal64Array: ... +@overload +def repeat( + value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def repeat( + value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def repeat( + value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def repeat( + value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def repeat( + value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def repeat( + value: dt.time | scalar.Time64Scalar[types._Time64Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> Time64Array[types._Time64Unit]: ... +@overload +def repeat( + value: scalar.TimestampScalar[types._Unit, types._Tz], + size: int, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def repeat( + value: dt.timedelta | scalar.DurationScalar[types._Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> DurationArray[types._Unit]: ... +@overload +def repeat( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def repeat( + value: bytes | scalar.BinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def repeat( + value: scalar.LargeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def repeat( + value: scalar.FixedSizeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def repeat( + value: str | scalar.StringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def repeat( + value: scalar.LargeStringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def repeat( + value: scalar.BinaryViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def repeat( + value: scalar.StringViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def repeat( + value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +@overload +def repeat( + value: scalar.FixedSizeListScalar[_DataTypeT, _Size], + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... 
+@overload +def repeat( + value: scalar.LargeListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.ListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.LargeListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: dict[str, Any] | scalar.StructScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def repeat( + value: scalar.MapScalar[_MapKeyT, _MapItemT], + size: int, + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def repeat( + value: scalar.DictionaryScalar[_IndexT, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def repeat( + value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def repeat( + value: scalar.UnionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def repeat( + value: scalar.FixedShapeTensorScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def repeat( + value: scalar.Bool8Scalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def repeat( + value: scalar.UuidScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def repeat( + value: scalar.JsonScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def repeat( + value: scalar.OpaqueScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def repeat( + value: scalar.ExtensionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def repeat(*args, **kwargs): + """ + Create an Array instance whose slots are the given scalar. + + Parameters + ---------- + value : Scalar-like object + Either a pyarrow.Scalar or any python object coercible to a Scalar. + size : int + Number of times to repeat the scalar in the output Array. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.repeat(10, 3) + + [ + 10, + 10, + 10 + ] + + >>> pa.repeat([1, 2], 2) + + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + + >>> pa.repeat("string", 3) + + [ + "string", + "string", + "string" + ] + + >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 1 + ] + -- child 1 type: list + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + """ + +def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: + """ + Attempt to infer Arrow data type that can hold the passed Python + sequence type in an Array object + + Parameters + ---------- + values : array-like + Sequence to infer type from. + mask : ndarray (bool type), optional + Optional exclusion mask where True marks null, False non-null. + from_pandas : bool, default False + Use pandas's NA/null sentinel values for type inference. 
+ + Returns + ------- + type : DataType + """ + +class ArrayStatistics(_Weakrefable): + """ + The class for statistics of an array. + """ + @property + def null_count(self) -> int: + """ + The number of nulls. + """ + @property + def distinct_count(self) -> int: + """ + The number of distinct values. + """ + @property + def min(self) -> Any: + """ + The minimum value. + """ + @property + def is_min_exact(self) -> bool: + """ + Whether the minimum value is an exact value or not. + """ + @property + def max(self) -> Any: + """ + The maximum value. + """ + + @property + def is_max_exact(self) -> bool: + """ + Whether the maximum value is an exact value or not. + """ + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + categories : list, default empty + List of fields that should be returned as pandas.Categorical. Only + applies to table-like data structures. + strings_to_categorical : bool, default False + Encode string (UTF8) and binary types to pandas.Categorical. + zero_copy_only : bool, default False + Raise an ArrowException if this function call would require copying + the underlying data. + integer_object_nulls : bool, default False + Cast integers with nulls to objects + date_as_object : bool, default True + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. + timestamp_as_object : bool, default False + Cast non-nanosecond timestamps (np.datetime64) to objects. This is + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. + use_threads : bool, default True + Whether to parallelize the conversion using multiple threads. + deduplicate_objects : bool, default True + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata : bool, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + safe : bool, default True + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. 
+ split_blocks : bool, default False + If True, generate one internal "block" for each column when + creating a pandas.DataFrame from a RecordBatch or Table. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct : bool, default False + EXPERIMENTAL: If True, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling to_pandas with this option it will crash your + program. + + Note that you may not see always memory usage improvements. For + example, if multiple columns share an underlying allocation, + memory can't be freed until all columns are converted. + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. + types_mapper : function, default None + A function mapping a pyarrow DataType to a pandas ExtensionDtype. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of pandas_metadata in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas ExtensionDtype or ``None`` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass ``dict.get`` as function. + coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). + + Returns + ------- + pandas.Series or pandas.DataFrame depending on type of object + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + + Convert a Table to pandas DataFrame: + + >>> table = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... 
) + >>> table.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(table.to_pandas(), pd.DataFrame) + True + + Convert a RecordBatch to pandas DataFrame: + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(batch.to_pandas(), pd.DataFrame) + True + + Convert a Chunked Array to pandas Series: + + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_pandas() + 0 2 + 1 2 + 2 4 + 3 4 + 4 5 + 5 100 + dtype: int64 + >>> isinstance(n_legs.to_pandas(), pd.Series) + True + """ + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + The base class for all Arrow arrays. + """ + + def diff(self, other: Self) -> str: + """ + Compare contents of this array against another one. + + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. + + Examples + -------- + >>> import pyarrow as pa + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) # doctest: +SKIP + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + """ + def cast( + self, + target_type: _CastAs, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + cast : Array + """ + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: + """ + Return zero-copy "view" of array as another data type. + + The data types must have compatible columnar buffer layouts + + Parameters + ---------- + target_type : DataType + Type to construct view as. + + Returns + ------- + view : Array + """ + def sum(self, **kwargs) -> _Scalar_co: + """ + Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. + """ + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: + """ + Compute distinct elements in array. 
+ + Returns + ------- + unique : Array + An array of the same data type, with deduplicated elements. + """ + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : DictionaryArray + A dictionary-encoded version of this array. + """ + def value_count(self) -> StructArray: + """ + Compute counts of unique elements in array. + + Returns + ------- + StructArray + An array of structs + """ + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + @overload + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar]: ... + @staticmethod + def from_pandas(*args, **kwargs): + """ + Convert pandas.Series to an Arrow Array. + + This method uses Pandas semantics about what values indicate + nulls. See pyarrow.array for more general conversion from arrays or + sequences to Arrow arrays. + + Parameters + ---------- + obj : ndarray, pandas.Series, array-like + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred + from the data. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + ChunkedArray is returned if object data overflows binary buffer. + """ + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: + """ + Construct an Array from a sequence of buffers. + + The concrete type returned depends on the datatype. + + Parameters + ---------- + type : DataType + The value type of the array. + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing this array. + null_count : int, default -1 + The number of null entries in the array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array], default None + Nested type children with length matching type.num_fields. + + Returns + ------- + array : Array + """ + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the array. + + In other words, the sum of bytes from all buffer + ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will be counted multiple times. 
+ + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + """ + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the Array. + + Note: for data on a non-CPU device, the full array is copied to CPU + memory. + + Parameters + ---------- + indent : int, default 2 + How much to indent the internal items in the string to + the right, by default ``2``. + top_level_indent : int, default 0 + How much to indent right the entire content of the array, + by default ``0``. + window : int + How many primitive items to preview at the begin and end + of the array when the array is bigger than the window. + The other items will be ellipsed. + container_window : int + How many container items (such as a list in a list array) + to preview at the begin and end of the array when the array + is bigger than the window. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + """ + format = to_string + def equals(self, other: Self) -> bool: ... + def __len__(self) -> int: ... + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: + """ + Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array + """ + def is_nan(self) -> BooleanArray: + """ + Return BooleanArray indicating the NaN values. + + Returns + ------- + array : boolean Array + """ + def is_valid(self) -> BooleanArray: + """ + Return BooleanArray indicating the non-null values. + """ + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: + """ + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. + """ + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or Array (slice) + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this array. + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice. + length : int, default None + Length of slice (default is until end of Array starting from + offset). + + Returns + ------- + sliced : Array + An array with the same datatype, containing the sliced values. 
+ """ + def take(self, indices: Indices) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. + """ + def drop_null(self) -> Self: + """ + Remove missing values from an array. + """ + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. + """ + @overload + def index( + self: Array[_ScalarT], + value: _ScalarT, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + @overload + def index( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + value: _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the Array + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : Array + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: + """ + Return a NumPy view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for primitive arrays with the same memory layout as NumPy + (i.e. integers, floating point, ..) and without any nulls. + + For the extension arrays, this method simply delegates to the + underlying storage array. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. 
+ + Returns + ------- + array : numpy.ndarray + """ + def to_pylist( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + *, + map_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + lst : list + """ + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + @property + def offset(self) -> int: + """ + A relative position into another array's data. + + The purpose is to enable zero-copy slicing. This value defaults to zero + but must be applied on all operations with the physical storage + buffers. + """ + def buffers(self) -> list[Buffer | None]: + """ + Return a list of Buffer objects pointing to this array's physical + storage. + + To correctly interpret these buffers, you need to also apply the offset + multiplied with the size of the stored data type. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Construct a copy of the array with all buffers on destination + device. + + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + Array + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: + """ + Import Array from a C ArrowArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. 
+ + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: + """ + Import Array from a C ArrowDeviceArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + def __dlpack_device__(self) -> tuple[int, int]: + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the array resides. 
+ + Returns + ------- + DeviceAllocationType + """ + + @property + def is_cpu(self) -> bool: + """ + Whether the array is CPU-accessible. + """ + @property + def statistics(self) -> ArrayStatistics | None: + """ + Statistics of the array. + """ + +class NullArray(Array[scalar.NullScalar]): ... + +class BooleanArray(Array[scalar.BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + +class NumericArray(Array[_ScalarT]): ... +class IntegerArray(NumericArray[_ScalarT]): ... +class FloatingPointArray(NumericArray[_ScalarT]): ... +class Int8Array(IntegerArray[scalar.Int8Scalar]): ... +class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... +class Int16Array(IntegerArray[scalar.Int16Scalar]): ... +class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... +class Int32Array(IntegerArray[scalar.Int32Scalar]): ... +class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... +class Int64Array(IntegerArray[scalar.Int64Scalar]): ... +class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... +class Date32Array(NumericArray[scalar.Date32Scalar]): ... +class Date64Array(NumericArray[scalar.Date64Scalar]): ... +class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... +class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... +class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... +class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... +class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... +class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... +class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... +class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... +class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... +class Decimal32Array(FixedSizeBinaryArray): ... +class Decimal64Array(FixedSizeBinaryArray): ... +class Decimal128Array(FixedSizeBinaryArray): ... +class Decimal256Array(FixedSizeBinaryArray): ... + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + def value_parent_indices(self) -> Int64Array: ... + def value_lengths(self) -> Int32Array: ... + +class ListArray(BaseListArray[_ScalarT]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[int], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[float], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[str], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.StringType]]: ... 
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[bytes], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list, + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array | list, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListArray from arrays of int32 offsets and values. + + Parameters + ---------- + offsets : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : ListArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + ListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, 5]]) + >>> array.offsets + + [ + 0, + 2, + 2, + 5 + ] + """ + +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... 
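+    # Illustrative sketch (assumes a pyarrow runtime matching these stubs):
+    # LargeListArray uses 64-bit offsets, so these overloads take an Int64Array
+    # where ListArray accepts Int32Array | list[int]:
+    #
+    #   >>> import pyarrow as pa
+    #   >>> offsets = pa.array([0, 2, 4], type=pa.int64())
+    #   >>> values = pa.array([1, 2, 3, 4])
+    #   >>> pa.LargeListArray.from_arrays(offsets, values).type   # -> large_list<item: int64>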
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : LargeListArray + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from the sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, 4, None, 6]], + ... type=pa.large_list(pa.int32()), + ... ) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + """ + +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int32Array: + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... 
+ @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int64Array: + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. 
+ + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, None]: ... + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + limit_size: _Size, + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct FixedSizeListArray from array of values and a list length. + + Parameters + ---------- + values : Array (any type) + list_size : int + The fixed length of the lists. + type : DataType, optional + If not specified, a default ListType with the values' type and + `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + + Returns + ------- + FixedSizeListArray + + Examples + -------- + + Create from a values array and a list size: + + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + + Or create from a values array, list size and matching type: + + >>> typ = pa.list_(pa.field("values", pa.int64()), 2) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: + """ + Return the underlying array of values which backs the + FixedSizeListArray. + + Note even null elements are included. + + Compare with :meth:`flatten`, which returns only the non-null + sub-list values. + + Returns + ------- + values : Array + + See Also + -------- + FixedSizeListArray.flatten : ... + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) + >>> array.values + + [ + 1, + 2, + null, + null, + 3, + null + ] + + """ + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + +class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + keys: Array[Scalar[_MapKeyT]], + items: Array[Scalar[_MapItemT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @overload + @classmethod + def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] + cls, + offsets: Int64Array, + values: Array, + *, + type: MapType[_MapKeyT, _MapItemT], + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """ + Construct MapArray from arrays of int32 offsets and key, item arrays. 
+ + Parameters + ---------- + offsets : array-like or sequence (int32 type) + keys : array-like or sequence (any type) + items : array-like or sequence (any type) + type : DataType, optional + If not specified, a default MapArray with the keys' and items' type is used. + pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + map_array : MapArray + + Examples + -------- + First, let's understand the structure of our dataset when viewed in a rectangular data model. + The total of 5 respondents answered the question "How much did you like the movie x?". + The value -1 in the integer array means that the value is missing. The boolean array + represents the null bitmask corresponding to the missing values in the integer array. + + >>> import pyarrow as pa + >>> movies_rectangular = np.ma.masked_array( + ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ], + ... ) + + To represent the same data with the MapArray and from_arrays, the data is + formed like this: + + >>> offsets = [ + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end + ... ] + >>> movies = [ + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", + ... "Meet the Parents", + ... "Superman", # -- row 2 + ... "Meet the Parents", + ... "Superman", # ----------------- row 3 + ... ] + >>> likings = [ + ... 10, # -------- row 1 + ... 8, + ... 4, + ... 5, # --- row 2 + ... 10, + ... 3, # ------ row 3 + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 [] + 4 [] + dtype: object + + If the data in the empty rows needs to be marked as missing, it's possible + to do so by modifying the offsets argument, so that we specify `None` as + the starting positions of the rows we want marked as missing. The end row + offset still has to refer to the existing value from keys (and values): + + >>> offsets = [ + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 None + 4 None + dtype: object + """ + @property + def keys(self) -> Array: + """Flattened array of keys across all maps in array""" + @property + def items(self) -> Array: + """Flattened array of items across all maps in array""" + +class UnionArray(Array[scalar.UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: + """ + DEPRECATED, use field() instead. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : pyarrow.Field + The given child field. + """ + def field(self, pos: int) -> Array: + """ + Return the given child field as an individual array. + + For sparse unions, the returned array has its offset, length, + and null count adjusted. + + For dense unions, the returned array is unchanged. 
+ + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : Array + The given child field. + """ + @property + def type_codes(self) -> Int8Array: + """Get the type codes array.""" + @property + def offsets(self) -> Int32Array: + """ + Get the value offsets array (dense arrays only). + + Does not account for any slice offset. + """ + @staticmethod + def from_dense( + type: Int8Array, + value_offsets: Int32Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + @staticmethod + def from_sparse( + types: Int8Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | None = None, + ) -> UnionArray: + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + field_names : list + type_codes : list + + Returns + ------- + union_array : UnionArray + """ + +class StringArray(Array[scalar.StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: + """ + Construct a StringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + +class LargeStringArray(Array[scalar.LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: + """ + Construct a LargeStringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + +class StringViewArray(Array[scalar.StringViewScalar]): ... + +class BinaryArray(Array[scalar.BinaryScalar]): + @property + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this BinaryArray. + """ + +class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): + @property + def total_values_length(self) -> int: + """ + The number of bytes from beginning to end of the data buffer addressed + by the offsets of this LargeBinaryArray. + """ + +class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... + +class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... 
# type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: + """ + Decodes the DictionaryArray to an Array. + """ + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: + """ + Construct a DictionaryArray from buffers. + + Parameters + ---------- + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing the indices array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the indices array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + + Returns + ------- + dict_array : DictionaryArray + """ + @staticmethod + def from_arrays( + indices: Indices, + dictionary: Array | np.ndarray | pd.Series, + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: + """ + Construct a DictionaryArray from indices and values. + + Parameters + ---------- + indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type + Non-negative integers referencing the dictionary values by zero + based index. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + mask : ndarray or pandas.Series, bool type + True values indicate that indices are actually null. + ordered : bool, default False + Set to True if the category values are ordered. + from_pandas : bool, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1). + safe : bool, default True + If True, check that the dictionary indices are in range. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray + """ + +class StructArray(Array[scalar.StructScalar]): + def field(self, index: int | str) -> Array: + """ + Retrieves the child array belonging to field. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Array + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: + """ + Return one individual array for each field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : List[Array] + """ + @staticmethod + def from_arrays( + arrays: Iterable[Array], + names: list[str] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: types.StructType | None = None, + ) -> StructArray: + """ + Construct StructArray from collection of arrays representing + each field in the struct. + + Either field names, field instances or a struct type must be passed. 
+ + Parameters + ---------- + arrays : sequence of Array + names : List[str] (optional) + Field names for each struct child. + fields : List[Field] (optional) + Field instances for each struct child. + mask : pyarrow.Array[bool] (optional) + Indicate which values are null (True) or not null (False). + memory_pool : MemoryPool (optional) + For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. + + Returns + ------- + result : StructArray + """ + def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: + """ + Sort the StructArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + by : str or None, default None + If to sort the array by one of its fields + or by the whole array. + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : StructArray + """ + +class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @overload + @staticmethod + def from_arrays( + run_ends: Int16Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int32Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int64Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... + @staticmethod + def from_arrays(*args, **kwargs): + """ + Construct RunEndEncodedArray from run_ends and values arrays. + + Parameters + ---------- + run_ends : Array (int16, int32, or int64 type) + The run_ends array. + values : Array (any type) + The values array. + type : pyarrow.DataType, optional + The run_end_encoded(run_end_type, value_type) array type. + + Returns + ------- + RunEndEncodedArray + """ + @staticmethod + def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] + type: DataType, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: + """ + Construct a RunEndEncodedArray from all the parameters that make up an + Array. + + RunEndEncodedArrays do not have buffers, only children arrays, but this + implementation is needed to satisfy the Array interface. + + Parameters + ---------- + type : DataType + The run_end_encoded(run_end_type, value_type) type. + length : int + The logical length of the run-end encoded array. Expected to match + the last value of the run_ends array (children[0]) minus the offset. + buffers : List[Buffer] + Empty List or [None]. + null_count : int, default -1 + The number of null entries in the array. Run-end encoded arrays + are specified to not have valid bits and null_count always equals 0. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array] + Nested type children containing the run_ends and values arrays. + + Returns + ------- + RunEndEncodedArray + """ + @property + def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: + """ + An array holding the logical indexes of each run-end. + + The physical offset to the array is applied. 
+ """ + @property + def values(self) -> Array[scalar.Scalar[_BasicValueT]]: + """ + An array holding the values of each run. + + The physical offset to the array is applied. + """ + def find_physical_offset(self) -> int: + """ + Find the physical offset of this REE array. + + This is the offset of the run that contains the value of the first + logical element of this array considering its offset. + + This function uses binary-search, so it has a O(log N) cost. + """ + def find_physical_length(self) -> int: + """ + Find the physical length of this REE array. + + The physical length of an REE is the number of physical values (and + run-ends) necessary to represent the logical range of values from offset + to length. + + This function uses binary-search, so it has a O(log N) cost. + """ + +_ArrayT = TypeVar("_ArrayT", bound=Array) + +class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: + """ + Construct ExtensionArray from type and storage array. + + Parameters + ---------- + typ : DataType + The extension type for the result array. + storage : Array + The underlying storage for the result array. + + Returns + ------- + ext_array : ExtensionArray + """ + +class JsonArray(ExtensionArray[_ArrayT]): + """ + Concrete class for Arrow arrays of JSON data type. + + This does not guarantee that the JSON data actually + is valid JSON. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class UuidArray(ExtensionArray[_ArrayT]): ... + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + """ + Concrete class for fixed shape tensor extension arrays. + + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + + def to_numpy_ndarray(self) -> np.ndarray: + """ + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. 
+ + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + @classmethod + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + If input array data is not contiguous a copy will be made. + + Parameters + ---------- + obj : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] + """ + +class OpaqueArray(ExtensionArray[_ArrayT]): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... ) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + +class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. + + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + +def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: + """ + Concatenate the given arrays. + + The contents of the input arrays are copied into the returned array. 
+ + Raises + ------ + ArrowInvalid + If not all of the arrays have the same type. + + Parameters + ---------- + arrays : iterable of pyarrow.Array + Arrays to concatenate, must be identically typed. + memory_pool : MemoryPool, default None + For memory allocations. If None, the default pool is used. + + Examples + -------- + >>> import pyarrow as pa + >>> arr1 = pa.array([2, 4, 5, 100]) + >>> arr2 = pa.array([2, 4]) + >>> pa.concat_arrays([arr1, arr2]) + + [ + 2, + 4, + 5, + 100, + 2, + 4 + ] + + """ + +def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: + """ + Create empty array of the given type. + """ + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", +] diff --git a/python/stubs/__lib_pxi/benchmark.pyi b/python/stubs/__lib_pxi/benchmark.pyi new file mode 100644 index 00000000000..66981bf0f51 --- /dev/null +++ b/python/stubs/__lib_pxi/benchmark.pyi @@ -0,0 +1 @@ +def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/stubs/__lib_pxi/builder.pyi b/python/stubs/__lib_pxi/builder.pyi new file mode 100644 index 00000000000..4a0e9ca4708 --- /dev/null +++ b/python/stubs/__lib_pxi/builder.pyi @@ -0,0 +1,89 @@ +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + +class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... 
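+
+# Hedged usage sketch (comment only, not part of the stub API): assuming
+# StringBuilder is exposed via ``pyarrow.lib`` as annotated above, values can be
+# appended incrementally and the array finalized with ``finish()``, which also
+# resets the builder:
+#
+#     from pyarrow.lib import StringBuilder
+#
+#     builder = StringBuilder()
+#     builder.append("foo")
+#     builder.append_values(["bar", None])  # None appends a null
+#     arr = builder.finish()                # StringArray of length 3 with 1 null
+#     assert len(builder) == 0              # builder was reset by finish()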
+ +class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringViewArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/stubs/__lib_pxi/compat.pyi b/python/stubs/__lib_pxi/compat.pyi new file mode 100644 index 00000000000..ae667be453e --- /dev/null +++ b/python/stubs/__lib_pxi/compat.pyi @@ -0,0 +1,5 @@ +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/stubs/__lib_pxi/config.pyi b/python/stubs/__lib_pxi/config.pyi new file mode 100644 index 00000000000..166e10c9734 --- /dev/null +++ b/python/stubs/__lib_pxi/config.pyi @@ -0,0 +1,41 @@ +from typing import NamedTuple + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/stubs/__lib_pxi/device.pyi b/python/stubs/__lib_pxi/device.pyi new file mode 100644 index 00000000000..d1b9f39eedd --- /dev/null +++ b/python/stubs/__lib_pxi/device.pyi @@ -0,0 +1,88 @@ +import enum + +from pyarrow.lib import _Weakrefable + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + +class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. CPU-accessible memory, or embedded memory on some particular GPU). 
+ """ + + @property + def type_name(self) -> str: + """ + A shorthand for this device's type. + """ + @property + def device_id(self) -> int: + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + Return the DeviceAllocationType of this device. + """ + +class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. + + A MemoryManager is always tied to a particular Device instance. + It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ + @property + def device(self) -> Device: + """ + The device this MemoryManager is tied to. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + +def default_cpu_memory_manager() -> MemoryManager: + """ + Return the default CPU MemoryManager instance. + + The returned singleton instance uses the default MemoryPool. + """ + +__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/stubs/__lib_pxi/error.pyi b/python/stubs/__lib_pxi/error.pyi new file mode 100644 index 00000000000..981ed51e680 --- /dev/null +++ b/python/stubs/__lib_pxi/error.pyi @@ -0,0 +1,53 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
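+
+# Hedged illustration (comment only): because ``ArrowInvalid`` also derives from the
+# builtin ``ValueError`` (see the class definitions above), Arrow errors can be
+# handled through either the Arrow-specific class or its builtin base, e.g.
+#
+#     import pyarrow as pa
+#
+#     try:
+#         pa.concat_arrays([pa.array([1]), pa.array(["a"])])  # mismatched types
+#     except pa.ArrowInvalid:  # an ``except ValueError`` clause would match as well
+#         pass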
+ +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi new file mode 100644 index 00000000000..d882fd79d57 --- /dev/null +++ b/python/stubs/__lib_pxi/io.pyi @@ -0,0 +1,1474 @@ +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex, overload + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from .types import KeyValueMetadata + +def have_libhdfs() -> bool: + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ + +def io_thread_count() -> int: + """ + Return the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. The number of threads is set to a fixed value at + startup. It can be modified at runtime by calling + :func:`set_io_thread_count()`. + + See Also + -------- + set_io_thread_count : Modify the size of this pool. + cpu_count : The analogous function for the CPU thread pool. + """ + +def set_io_thread_count(count: int) -> None: + """ + Set the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. + + Parameters + ---------- + count : int + The max number of threads that may be used for I/O. + Must be positive. + + See Also + -------- + io_thread_count : Get the size of this pool. + set_cpu_count : The analogous function for the CPU thread pool. + """ + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): + """ + The base class for all Arrow streams. + + Streams are either readable, writable, or both. + They optionally support seeking. + + While this class exposes methods to read or write data from Python, the + primary intent of using a Arrow stream is to pass it to other Arrow + facilities that will make use of it, such as Arrow IPC routines. + + Be aware that there are subtle differences with regular Python files, + e.g. destroying a writable Arrow stream without closing it explicitly + will not flush any pending data. + """ + + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: + """ + The file mode. Currently instances of NativeFile may support: + + * rb: binary read + * wb: binary write + * rb+: binary read and write + * ab: binary append + """ + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + @property + def closed(self) -> bool: ... + def close(self) -> None: ... 
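+    # Hedged illustration (comment only): concrete NativeFile subclasses (e.g. OSFile,
+    # BufferReader) are usually used as context managers so the stream is closed and
+    # pending writes are flushed when the block exits, e.g.
+    #
+    #     import pyarrow as pa
+    #
+    #     with pa.OSFile("example.bin", "wb") as f:  # hypothetical path
+    #         f.write(b"abc")  # returns 3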
+ def size(self) -> int: + """ + Return file size + """ + def metadata(self) -> KeyValueMetadata: + """ + Return file metadata + """ + def tell(self) -> int: + """ + Return current stream position + """ + def seek(self, position: int, whence: int = 0) -> int: + """ + Change current file stream position + + Parameters + ---------- + position : int + Byte offset, interpreted relative to value of whence argument + whence : int, default 0 + Point of reference for seek offset + + Notes + ----- + Values of whence: + * 0 -- start of stream (the default); offset should be zero or positive + * 1 -- current stream position; offset may be negative + * 2 -- end of stream; offset is usually negative + + Returns + ------- + int + The new absolute stream position. + """ + def flush(self) -> None: + """ + Flush the stream, if applicable. + + An error is raised if stream is not writable. + """ + def write(self, data: bytes | SupportPyBuffer) -> int: + """ + Write data to the file. + + Parameters + ---------- + data : bytes-like object or exporter of buffer protocol + + Returns + ------- + int + nbytes: number of bytes written + """ + def read(self, nbytes: int | None = None) -> bytes: + """ + Read and return up to n bytes. + + If *nbytes* is None, then the entire remaining file contents are read. + + Parameters + ---------- + nbytes : int, default None + + Returns + ------- + data : bytes + """ + def get_stream(self, file_offset: int, nbytes: int) -> Self: + """ + Return an input stream that reads a file segment independent of the + state of the file. + + Allows reading portions of a random access file as an input stream + without interfering with each other. + + Parameters + ---------- + file_offset : int + nbytes : int + + Returns + ------- + stream : NativeFile + """ + def read_at(self) -> bytes: + """ + Read indicated number of bytes at offset from the file + + Parameters + ---------- + nbytes : int + offset : int + + Returns + ------- + data : bytes + """ + def read1(self) -> bytes: + """Read and return up to n bytes. + + Unlike read(), if *nbytes* is None then a chunk is read, not the + entire file. + + Parameters + ---------- + nbytes : int, default None + The maximum number of bytes to read. + + Returns + ------- + data : bytes + """ + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: + """ + Read into the supplied buffer + + Parameters + ---------- + b : buffer-like object + A writable buffer object (such as a bytearray). + + Returns + ------- + written : int + number of bytes written + """ + + def readline(self, size: int | None = None) -> bytes: + """Read and return a line of bytes from the file. + + If size is specified, read at most size bytes. + + Line terminator is always b"\\n". + + Parameters + ---------- + size : int + maximum number of bytes read + """ + def readlines(self, hint: int | None = None) -> list[bytes]: + """Read lines of the file + + Parameters + ---------- + hint : int + maximum number of bytes read until we stop + """ + def __iter__(self) -> Self: ... + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: + """ + Read from buffer. + + Parameters + ---------- + nbytes : int, optional + maximum number of bytes read + """ + def truncate(self) -> None: ... + def writelines(self, lines: list[bytes]): + """ + Write lines to the file. 
+ + Parameters + ---------- + lines : iterable + Iterable of bytes-like objects or exporters of buffer protocol + """ + def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: + """ + Read this file completely to a local path or destination stream. + + This method first seeks to the beginning of the file. + + Parameters + ---------- + stream_or_path : str or file-like object + If a string, a local file path to write to; otherwise, + should be a writable stream. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + def upload(self, stream: IOBase, buffer_size: int | None) -> None: + """ + Write from a source stream to this file. + + Parameters + ---------- + stream : file-like object + Source stream to pipe to this file. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + +# ---------------------------------------------------------------------- +# Python file-like objects + +class PythonFile(NativeFile): + """ + A stream backed by a Python file object. + + This class allows using Python file objects with arbitrary Arrow + functions, including functions written in another language than Python. + + As a downside, there is a non-zero redirection cost in translating + Arrow stream calls to Python method calls. Furthermore, Python's + Global Interpreter Lock may limit parallelism in some situations. + + Examples + -------- + >>> import io + >>> import pyarrow as pa + >>> pa.PythonFile(io.BytesIO()) + + + Create a stream for writing: + + >>> buf = io.BytesIO() + >>> f = pa.PythonFile(buf, mode="w") + >>> f.writable() + True + >>> f.write(b"PythonFile") + 10 + >>> buf.getvalue() + b'PythonFile' + >>> f.close() + >>> f + + + Create a stream for reading: + + >>> buf = io.BytesIO(b"PythonFile") + >>> f = pa.PythonFile(buf, mode="r") + >>> f.mode + 'rb' + >>> f.read() + b'PythonFile' + >>> f + + >>> f.close() + >>> f + + """ + def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... + def truncate(self, pos: int | None = None) -> None: + """ + Parameters + ---------- + pos : int, optional + """ + +class MemoryMappedFile(NativeFile): + """ + A stream that represents a memory-mapped file. + + Supports 'r', 'r+', 'w' modes. + + Examples + -------- + Create a new file with memory map: + + >>> import pyarrow as pa + >>> mmap = pa.create_memory_map("example_mmap.dat", 10) + >>> mmap + + >>> mmap.close() + + Open an existing file with memory map: + + >>> with pa.memory_map("example_mmap.dat") as mmap: + ... mmap + + """ + @classmethod + def create(cls, path: str, size: int) -> Self: + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ + def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + def resize(self, new_size: int) -> None: + """ + Resize the map and underlying file. + + Parameters + ---------- + new_size : new size in bytes + """ + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: + """ + Open memory map at file path. Size of the memory map cannot change. + + Parameters + ---------- + path : str + mode : {'r', 'r+', 'w'}, default 'r' + Whether the file is opened for reading ('r'), writing ('w') + or both ('r+'). 
+ + Returns + ------- + mmap : MemoryMappedFile + + Examples + -------- + Reading from a memory map without any memory allocation or copying: + + >>> import pyarrow as pa + >>> with pa.output_stream("example_mmap.txt") as stream: + ... stream.write(b"Constructing a buffer referencing the mapped memory") + 51 + >>> with pa.memory_map("example_mmap.txt") as mmap: + ... mmap.read_at(6, 45) + b'memory' + """ + +create_memory_map = MemoryMappedFile.create + +class OSFile(NativeFile): + """ + A stream backed by a regular file descriptor. + + Examples + -------- + Create a new file to write to: + + >>> import pyarrow as pa + >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: + ... f.writable() + ... f.write(b"OSFile") + ... f.seekable() + True + 6 + False + + Open the file to read: + + >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: + ... f.mode + ... f.read() + 'rb' + b'OSFile' + + Open the file to append: + + >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: + ... f.mode + ... f.write(b" is super!") + 'ab' + 10 + >>> with pa.OSFile("example_osfile.arrow") as f: + ... f.read() + b'OSFile is super!' + + Inspect created OSFile: + + >>> pa.OSFile("example_osfile.arrow") + + """ + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"], + memory_pool: MemoryPool | None = None, + ) -> None: ... + +class FixedSizeBufferWriter(NativeFile): + """ + A stream writing to a Arrow buffer. + + Examples + -------- + Create a stream to write to ``pyarrow.Buffer``: + + >>> import pyarrow as pa + >>> buf = pa.allocate_buffer(5) + >>> with pa.output_stream(buf) as stream: + ... stream.write(b"abcde") + ... stream + 5 + + + Inspect the buffer: + + >>> buf.to_pybytes() + b'abcde' + >>> buf + + """ + def __init__(self, buffer: Buffer) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + def set_memcopy_threshold(self, threshold: int) -> None: ... + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + """ + The base class for all Arrow buffers. + + A buffer represents a contiguous memory area. Many buffers will own + their memory, though not all of them do. + """ + def __len__(self) -> int: ... + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: + """ + The buffer size in bytes. + """ + @property + def address(self) -> int: + """ + The buffer's address, as an integer. + + The returned address may point to CPU or device memory. + Use `is_cpu()` to disambiguate. + """ + def hex(self) -> bytes: + """ + Compute hexadecimal representation of the buffer. + + Returns + ------- + : bytes + """ + @property + def is_mutable(self) -> bool: + """ + Whether the buffer is mutable. + """ + @property + def is_cpu(self) -> bool: + """ + Whether the buffer is CPU-accessible. + """ + @property + def device(self) -> Device: + """ + The device where the buffer resides. + + Returns + ------- + Device + """ + @property + def memory_manager(self) -> MemoryManager: + """ + The memory manager associated with the buffer. + + Returns + ------- + MemoryManager + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the buffer resides. + + Returns + ------- + DeviceAllocationType + """ + @property + def parent(self) -> Buffer | None: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> int: ... 
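+    # Hedged illustration (comment only): per the overloads above, integer indexing
+    # returns the byte value and slice indexing returns a zero-copy Buffer view, e.g.
+    #
+    #     import pyarrow as pa
+    #
+    #     buf = pa.py_buffer(b"abcdef")
+    #     buf[0]                 # 97 (ord("a"))
+    #     buf[2:4].to_pybytes()  # b'cd'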
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Slice this buffer. Memory is not copied. + + You can also use the Python slice notation ``buffer[start:stop]``. + + Parameters + ---------- + offset : int, default 0 + Offset from start of buffer to slice. + length : int, default None + Length of slice (default is until end of Buffer starting from + offset). + + Returns + ------- + sliced : Buffer + A logical view over this buffer. + """ + def equals(self, other: Self) -> bool: + """ + Determine if two buffers contain exactly the same data. + + Parameters + ---------- + other : Buffer + + Returns + ------- + are_equal : bool + True if buffer contents and size are equal + """ + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: + """ + Return this buffer as a Python bytes object. Memory is copied. + """ + def __buffer__(self, flags: int, /) -> memoryview: ... + +class ResizableBuffer(Buffer): + """ + A base class for buffers that can be resized. + """ + + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: + """ + Resize buffer to indicated size. + + Parameters + ---------- + new_size : int + New size of buffer (padding may be added internally). + shrink_to_fit : bool, default False + If this is true, the buffer is shrunk when new_size is less + than the current size. + If this is false, the buffer is never shrunk. + """ + +@overload +def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[False] +) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[True] +) -> ResizableBuffer: ... +def allocate_buffer(*args, **kwargs): + """ + Allocate a mutable buffer. + + Parameters + ---------- + size : int + Number of bytes to allocate (plus internal padding) + memory_pool : MemoryPool, optional + The pool to allocate memory from. + If not given, the default memory pool is used. + resizable : bool, default False + If true, the returned buffer is resizable. + + Returns + ------- + buffer : Buffer or ResizableBuffer + """ + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + """ + An output stream that writes to a resizable buffer. + + The buffer is produced as a result when ``getvalue()`` is called. + + Examples + -------- + Create an output stream, write data to it and finalize it with + ``getvalue()``: + + >>> import pyarrow as pa + >>> f = pa.BufferOutputStream() + >>> f.write(b"pyarrow.Buffer") + 14 + >>> f.closed + False + >>> f.getvalue() + + >>> f.closed + True + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: + """ + Finalize output stream and return result as pyarrow.Buffer. + + Returns + ------- + value : Buffer + """ + +class MockOutputStream(NativeFile): ... + +class BufferReader(NativeFile): + """ + Zero-copy reader from objects convertible to Arrow buffer. + + Parameters + ---------- + obj : Python bytes or pyarrow.Buffer + + Examples + -------- + Create an Arrow input stream and inspect it: + + >>> import pyarrow as pa + >>> data = b"reader data" + >>> buf = memoryview(data) + >>> with pa.input_stream(buf) as stream: + ... stream.size() + ... stream.read(6) + ... stream.seek(7) + ... 
stream.read(15) + 11 + b'reader' + 7 + b'data' + """ + def __init__(self, obj) -> None: ... + +class CompressedInputStream(NativeFile): + """ + An input stream wrapper which decompresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + + Create an input stream with decompression referencing the + buffer with compressed data: + + >>> cdata = raw.getvalue() + >>> with pa.input_stream(cdata, compression="gzip") as compressed: + ... compressed.read() + b'Compressed stream' + + which actually translates to the use of ``BufferReader``and + ``CompressedInputStream``: + + >>> raw = pa.BufferReader(cdata) + >>> with pa.CompressedInputStream(raw, "gzip") as compressed: + ... compressed.read() + b'Compressed stream' + """ + + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class CompressedOutputStream(NativeFile): + """ + An output stream wrapper which compresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + """ + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class BufferedInputStream(NativeFile): + """ + An input stream that performs buffered reads from + an unbuffered input stream, which can mitigate the overhead + of many small reads in some cases. + + Parameters + ---------- + stream : NativeFile + The input stream to wrap with the buffer + buffer_size : int + Size of the temporary read buffer. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: + """ + Release the raw InputStream. + Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw input stream + """ + +class BufferedOutputStream(NativeFile): + """ + An output stream that performs buffered reads from + an unbuffered output stream, which can mitigate the overhead + of many small writes in some cases. + + Parameters + ---------- + stream : NativeFile + The writable output stream to wrap with the buffer + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: + """ + Flush any buffered writes and release the raw OutputStream. 
+ Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw output stream. + """ + +class TransformInputStream(NativeFile): + """ + Transform an input stream. + + Parameters + ---------- + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + then re-encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data. + dest_encoding : str + The codec to use for emitted data. + """ + +def py_buffer(obj: SupportPyBuffer) -> Buffer: + """ + Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + the object from which the buffer should be constructed. + """ + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: + """ + Construct an Arrow buffer with the given *address* and *size*. + + The buffer will be optionally backed by the Python *base* object, if given. + The *base* object will be kept alive as long as this buffer is alive, + including across language boundaries (for example if the buffer is + referenced by C++ code). + + Parameters + ---------- + address : int + The starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + The size of device buffer in bytes. + base : {None, object} + Object that owns the referenced memory. + """ + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + +class CacheOptions(_Weakrefable): + """ + Cache options for a pre-buffered fragment scan. + + Parameters + ---------- + hole_size_limit : int, default 8KiB + The maximum distance in bytes between two consecutive ranges; beyond + this value, ranges are not combined. + range_size_limit : int, default 32MiB + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, + they are not combined + lazy : bool, default True + lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + range that is currently being read. + prefetch_limit : int, default 0 + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target + range. + """ + + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... 
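+    # Hedged illustration (comment only): the keyword options above can be combined,
+    # e.g. a lazy cache that prefetches up to 4 merged ranges ahead of the range
+    # currently being read:
+    #
+    #     import pyarrow as pa
+    #
+    #     opts = pa.CacheOptions(lazy=True, prefetch_limit=4)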
+ @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: + """ + Create suitable CacheOptions based on provided network metrics. + + Typically this will be used with object storage solutions like Amazon S3, + Google Cloud Storage and Azure Blob Storage. + + Parameters + ---------- + time_to_first_byte_millis : int + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. + transfer_bandwidth_mib_per_sec : int + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + integer. + ideal_bandwidth_utilization_frac : int, default 0.9 + Transfer bandwidth utilization fraction (per connection) to maximize the net + data load. The value is a positive float less than 1. + max_ideal_request_size_mib : int, default 64 + The maximum single data request size (in MiB) to maximize the net data load. + + Returns + ------- + CacheOptions + """ + +class Codec(_Weakrefable): + """ + Compression codec. + + Parameters + ---------- + compression : str + Type of compression codec to initialize, valid values are: 'gzip', + 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and + 'snappy'. + compression_level : int, None + Optional parameter specifying how aggressively to compress. The + possible ranges and effect of this parameter depend on the specific + codec chosen. Higher values compress more but typically use more + resources (CPU/RAM). Some codecs support negative values. + + gzip + The compression_level maps to the memlevel parameter of + deflateInit2. Higher levels use more RAM but are faster + and should have higher compression ratios. + + bz2 + The compression level maps to the blockSize100k parameter of + the BZ2_bzCompressInit function. Higher levels use more RAM + but are faster and should have higher compression ratios. + + brotli + The compression level maps to the BROTLI_PARAM_QUALITY + parameter. Higher values are slower and should have higher + compression ratios. + + lz4/lz4_frame/lz4_raw + The compression level parameter is not supported and must + be None + + zstd + The compression level maps to the compressionLevel parameter + of ZSTD_initCStream. Negative values are supported. Higher + values are slower and should have higher compression ratios. + + snappy + The compression level parameter is not supported and must + be None + + + Raises + ------ + ValueError + If invalid compression value is passed. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.Codec.is_available("gzip") + True + >>> codec = pa.Codec("gzip") + >>> codec.name + 'gzip' + >>> codec.compression_level + 9 + """ + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + @classmethod + def detect(cls, path: StrPath) -> Self: + """ + Detect and instantiate compression codec based on file extension. + + Parameters + ---------- + path : str, path-like + File-path to detect compression from. + + Raises + ------ + TypeError + If the passed value is not path-like. + ValueError + If the compression can't be detected from the path. + + Returns + ------- + Codec + """ + @staticmethod + def is_available(compression: Compression) -> bool: + """ + Returns whether the compression support has been built and enabled. 
+ + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + + Returns + ------- + bool + """ + @staticmethod + def supports_compression_level(compression: Compression) -> int: + """ + Returns true if the compression level parameter is supported + for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def default_compression_level(compression: Compression) -> int: + """ + Returns the compression level that Arrow will use for the codec if + None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def minimum_compression_level(compression: Compression) -> int: + """ + Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def maximum_compression_level(compression: Compression) -> int: + """ + Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @property + def name(self) -> Compression: + """Returns the name of the codec""" + @property + def compression_level(self) -> int: + """Returns the compression level parameter of the codec""" + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def compress(self, *args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def decompress(self, *args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + decompressed_size : int, default None + Size of the decompressed result + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. 
+ + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def compress(*args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def decompress(*args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + Input object to decompress data from. + decompressed_size : int, default None + Size of the decompressed result + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +def input_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> BufferReader: + """ + Create an Arrow input stream. + + Parameters + ---------- + source : str, Path, buffer, or file-like object + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Examples + -------- + Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: + + >>> import pyarrow as pa + >>> buf = memoryview(b"some data") + >>> with pa.input_stream(buf) as stream: + ... stream.read(4) + b'some' + + Create a readable OSFile (NativeFile) from a string or file path: + + >>> import gzip + >>> with gzip.open("example.gz", "wb") as f: + ... f.write(b"some data") + 9 + >>> with pa.input_stream("example.gz") as stream: + ... stream.read() + b'some data' + + Create a readable PythonFile (NativeFile) from a a Python file object: + + >>> with open("example.txt", mode="w") as f: + ... f.write("some text") + 9 + >>> with pa.input_stream("example.txt") as stream: + ... stream.read(6) + b'some t' + """ + +def output_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> NativeFile: + """ + Create an Arrow output stream. + + Parameters + ---------- + source : str, Path, buffer, file-like object + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Examples + -------- + Create a writable NativeFile from a pyarrow Buffer: + + >>> import pyarrow as pa + >>> data = b"buffer data" + >>> empty_obj = bytearray(11) + >>> buf = pa.py_buffer(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read(6) + b'buffer' + + or from a memoryview object: + + >>> buf = memoryview(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read() + b'buffer data' + + Create a writable NativeFile from a string or file path: + + >>> with pa.output_stream("example_second.txt") as stream: + ... stream.write(b"Write some data") + 15 + >>> with pa.input_stream("example_second.txt") as stream: + ... 
stream.read() + b'Write some data' + """ + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/python/stubs/__lib_pxi/ipc.pyi b/python/stubs/__lib_pxi/ipc.pyi new file mode 100644 index 00000000000..3d72892061e --- /dev/null +++ b/python/stubs/__lib_pxi/ipc.pyi @@ -0,0 +1,705 @@ +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile +from .types import DictionaryMemo, KeyValueMetadata + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + +class WriteStats(NamedTuple): + """IPC write statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class ReadStats(NamedTuple): + """IPC read statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class IpcReadOptions(_Weakrefable): + """ + Serialization options for reading IPC format. + + Parameters + ---------- + ensure_native_endian : bool, default True + Whether to convert incoming data to platform-native endianness. + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like decompression + included_fields : list + If empty (the default), return all deserialized fields. + If non-empty, the values are the indices of fields to read on + the top-level schema + """ + + ensure_native_endian: bool + use_threads: bool + included_fields: list[int] + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + included_fields: list[int] | None = None, + ) -> None: ... + +class IpcWriteOptions(_Weakrefable): + """ + Serialization options for the IPC format. + + Parameters + ---------- + metadata_version : MetadataVersion, default MetadataVersion.V5 + The metadata version to write. 
V5 is the current and latest, + V4 is the pre-1.0 metadata version (with incompatible Union layout). + allow_64bit : bool, default False + If true, allow field lengths that don't fit in a signed 32-bit int. + use_legacy_format : bool, default False + Whether to use the pre-Arrow 0.15 IPC format. + compression : str, Codec, or None + compression codec to use for record batch buffers. + If None then batch buffers will be uncompressed. + Must be "lz4", "zstd" or None. + To specify a compression_level use `pyarrow.Codec` + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like compression. + emit_dictionary_deltas : bool + Whether to emit dictionary deltas. Default is false for maximum + stream compatibility. + unify_dictionaries : bool + If true then calls to write_table will attempt to unify dictionaries + across all batches in the table. This can help avoid the need for + replacement dictionaries (which the file format does not support) + but requires computing the unified dictionary and then remapping + the indices arrays. + + This parameter is ignored when writing to the IPC stream format as + the IPC stream format can support replacement dictionaries. + """ + + metadata_version: MetadataVersion + allow_64bit: bool + use_legacy_format: bool + compression: Codec | Literal["lz4", "zstd"] | None + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + +class Message(_Weakrefable): + """ + Container for an Arrow IPC message with metadata and optional body + """ + + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + def serialize_to( + self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None + ): + """ + Write message to generic OutputStream + + Parameters + ---------- + sink : NativeFile + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + """ + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write message as encapsulated IPC message + + Parameters + ---------- + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + """ + +class MessageReader(_Weakrefable): + """ + Interface for reading Message objects from some source (like an + InputStream) + """ + @classmethod + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: + """ + Open stream from source, if you want to use memory map use + MemoryMappedFile as source. + + Parameters + ---------- + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object + A readable source, like an InputStream + """ + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: + """ + Read next Message from the stream. 
+ + Raises + ------ + StopIteration + At end of stream + """ + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + +class _CRecordBatchWriter(_Weakrefable): + """The base RecordBatchWriter wrapper. + + Provides common implementations of convenience methods. Should not + be instantiated directly by user code. + """ + def write(self, table_or_batch: Table | RecordBatch): + """ + Write RecordBatch or Table to stream. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + """ + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + custom_metadata : mapping or KeyValueMetadata + Keys and values must be string-like / coercible to bytes + """ + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: + """ + Current IPC write statistics. + """ + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + def __dealloc__(self) -> None: ... + def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: + """ + Read contents of stream to a pandas.DataFrame. + + Read all record batches as a pyarrow.Table then convert it to a + pandas.DataFrame using Table.to_pandas. + + Parameters + ---------- + **options + Arguments to forward to :meth:`Table.to_pandas`. + + Returns + ------- + df : pandas.DataFrame + """ + +class RecordBatchReader(_Weakrefable): + """Base class for reading stream of record batches. + + Record batch readers function as iterators of record batches that also + provide the schema (without the need to get any batches). + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatchReader.from_*`` functions instead. + + Notes + ----- + To import and export using the Arrow C stream interface, use the + ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this + interface is intended for expert users. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([("x", pa.int64())]) + >>> def iter_record_batches(): + ... for i in range(2): + ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) + >>> print(reader.schema) + x: int64 + >>> for batch in reader: + ... print(batch) + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + """ + + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: + """ + Read next RecordBatch from the stream. + + Raises + ------ + StopIteration: + At end of stream. 
+ + Returns + ------- + RecordBatch + """ + __next__ = read_next_batch + @property + def schema(self) -> Schema: + """ + Shared schema of the record batches in the stream. + + Returns + ------- + Schema + """ + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: + """ + Read next RecordBatch from the stream along with its custom metadata. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: + """ + Iterate over record batches from the stream along with their custom + metadata. + + Yields + ------ + RecordBatchWithMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table. + + Returns + ------- + Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def close(self) -> None: + """ + Release any resources associated with the reader. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: + """ + Wrap this reader with one that casts each batch lazily as it is pulled. + Currently only a safe cast to target_schema is implemented. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + + Returns + ------- + RecordBatchReader + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowArrayStream struct, given its pointer. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + Be careful: if you don't pass the ArrowArrayStream struct to a + consumer, array memory will leak. This is a low-level function + intended for expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream struct, + given its pointer. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: + """ + Create RecordBatchReader from a Arrow-compatible stream object. + + This accepts objects implementing the Arrow PyCapsule Protocol for + streams, i.e. objects that have a ``__arrow_c_stream__`` method. + + Parameters + ---------- + data : Arrow-compatible stream object + Any object that implements the Arrow PyCapsule Protocol for + streams. + schema : Schema, default None + The schema to which the stream should be casted, if supported + by the stream object. 
+ + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: + """ + Create RecordBatchReader from an iterable of batches. + + Parameters + ---------- + schema : Schema + The shared schema of the record batches + batches : Iterable[RecordBatch] + The batches that this reader will return. + + Returns + ------- + reader : RecordBatchReader + """ + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: + """ + Current IPC read statistics. + """ + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... + +class RecordBatchWithMetadata(NamedTuple): + """RecordBatch with its custom metadata + + Parameters + ---------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + + batch: RecordBatch + custom_metadata: KeyValueMetadata + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: + """ + The number of record batches in the IPC file. + """ + def get_batch(self, i: int) -> RecordBatch: + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + """ + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: + """ + Read the record batch with the given index along with + its custom metadata + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + +def get_tensor_size(tensor: Tensor) -> int: + """ + Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. + """ + +def get_record_batch_size(batch: RecordBatch) -> int: + """ + Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. + """ + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: + """ + Write pyarrow.Tensor to pyarrow.NativeFile object its current position. + + Parameters + ---------- + tensor : pyarrow.Tensor + dest : pyarrow.NativeFile + + Returns + ------- + bytes_written : int + Total number of bytes written to the file + """ + +def read_tensor(source: NativeFile) -> Tensor: + """Read pyarrow.Tensor from pyarrow.NativeFile object from current + position. If the file source supports zero copy (e.g. a memory map), then + this operation does not allocate any memory. 
This function not assume that + the stream is aligned + + Parameters + ---------- + source : pyarrow.NativeFile + + Returns + ------- + tensor : Tensor + + """ + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: + """ + Read length-prefixed message from file or buffer-like object + + Parameters + ---------- + source : pyarrow.NativeFile, file-like object, or buffer-like object + + Returns + ------- + message : Message + """ + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: + """ + Read Schema from message or buffer + + Parameters + ---------- + obj : buffer or Message + dictionary_memo : DictionaryMemo, optional + Needed to be able to reconstruct dictionary-encoded fields + with read_record_batch + + Returns + ------- + schema : Schema + """ + +def read_record_batch( + obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None +) -> RecordBatch: + """ + Read RecordBatch from message, given a known schema. If reading data from a + complete IPC stream, use ipc.open_stream instead + + Parameters + ---------- + obj : Message or Buffer-like + schema : Schema + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + + Returns + ------- + batch : RecordBatch + """ + +__all__ = [ + "MetadataVersion", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/stubs/__lib_pxi/memory.pyi b/python/stubs/__lib_pxi/memory.pyi new file mode 100644 index 00000000000..57a3bb4f1b3 --- /dev/null +++ b/python/stubs/__lib_pxi/memory.pyi @@ -0,0 +1,174 @@ +from pyarrow.lib import _Weakrefable + +class MemoryPool(_Weakrefable): + """ + Base class for memory allocation. + + Besides tracking its number of allocated bytes, a memory pool also + takes care of the required 64-byte alignment for Arrow data. + """ + + def release_unused(self) -> None: + """ + Attempt to return to the OS any memory being held onto by the pool. + + This function should not be called except potentially for + benchmarking or debugging as it could be expensive and detrimental to + performance. + + This is best effort and may not have any effect on some memory pools + or in some situations (e.g. fragmentation). + """ + def bytes_allocated(self) -> int: + """ + Return the number of bytes that are currently allocated from this + memory pool. + """ + def total_bytes_allocated(self) -> int: + """ + Return the total number of bytes that have been allocated from this + memory pool. + """ + def max_memory(self) -> int | None: + """ + Return the peak memory allocation in this memory pool. + This can be an approximate number in multi-threaded applications. + + None is returned if the pool implementation doesn't know how to + compute this number. + """ + def num_allocations(self) -> int: + """ + Return the number of allocations or reallocations that were made + using this memory pool. + """ + def print_stats(self) -> None: + """ + Print statistics about this memory pool. + + The output format is implementation-specific. 
Not all memory pools + implement this method. + """ + @property + def backend_name(self) -> str: + """ + The name of the backend used by this MemoryPool (e.g. "jemalloc"). + """ + +class LoggingMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): ... + +def default_memory_pool() -> MemoryPool: + """ + Return the process-global memory pool. + + Examples + -------- + >>> default_memory_pool() + + """ + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but with separate allocation statistics. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def system_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the C malloc heap. + """ + +def jemalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the jemalloc heap. + + NotImplementedError is raised if jemalloc support is not enabled. + """ + +def mimalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the mimalloc heap. + + NotImplementedError is raised if mimalloc support is not enabled. + """ + +def set_memory_pool(pool: MemoryPool) -> None: + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ + +def log_memory_allocations(enable: bool = True) -> None: + """ + Enable or disable memory allocator logging for debugging purposes + + Parameters + ---------- + enable : bool, default True + Pass False to disable logging + """ + +def total_allocated_bytes() -> int: + """ + Return the currently allocated bytes from the default memory pool. + Other memory pools may not be accounted for. + """ + +def jemalloc_set_decay_ms(decay_ms: int) -> None: + """ + Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of + milliseconds. A value of 0 (the default) results in dirty / muzzy memory + pages being released right away to the OS, while a higher value will result + in a time-based decay. See the jemalloc docs for more information + + It's best to set this at the start of your application. + + Parameters + ---------- + decay_ms : int + Number of milliseconds to set for jemalloc decay conf parameters. 
Note + that this change will only affect future memory arenas + """ + +def supported_memory_backends() -> list[str]: + """ + Return a list of available memory pool backends + """ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi new file mode 100644 index 00000000000..0e80fae4ebf --- /dev/null +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -0,0 +1,51 @@ +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +import pandas as pd + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> pd.Series: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[pd.Categorical]: ... + @property + def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + def is_array_like( + self, obj: Any + ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/stubs/__lib_pxi/scalar.pyi b/python/stubs/__lib_pxi/scalar.pyi new file mode 100644 index 00000000000..81ab5012067 --- /dev/null +++ b/python/stubs/__lib_pxi/scalar.pyi @@ -0,0 +1,1017 @@ +import collections.abc +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Generic, Iterator, Literal, Mapping, overload + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from typing_extensions import Protocol, TypeVar + +from . 
import types +from .types import ( + _AsPyType, + _DataTypeT, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) + +class Scalar(_Weakrefable, Generic[_DataType_co]): + """ + The base class for scalars. + """ + @property + def type(self) -> _DataType_co: + """ + Data type of the Scalar object. + """ + @property + def is_valid(self) -> bool: + """ + Holds a valid (non-null) value. + """ + @overload + def cast( + self, + target_type: None, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + @overload + def cast( + self, + target_type: _DataTypeT, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Scalar[_DataTypeT]: ... + def cast(self, *args, **kwargs): + """ + Cast scalar value to another data type. + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast scalar to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + scalar : A Scalar of the given target data type. + """ + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def equals(self, other: Scalar) -> bool: ... + def __hash__(self) -> int: ... + @overload + def as_py( + self: Scalar[types._BasicDataType[_AsPyType]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> _AsPyType: ... + @overload + def as_py( + self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[ + types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] + ] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, Any]]: ... + @overload + def as_py( + self: Scalar[types.StructType], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[str, Any]]: ... + @overload + def as_py( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... 
+ @overload + def as_py( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, Any]]: ... + @overload + def as_py( + self: Scalar[Any], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> Any: ... + def as_py(self, *args, **kwargs): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + """ + +_NULL: TypeAlias = None +NA = _NULL + +class NullScalar(Scalar[types.NullType]): ... +class BooleanScalar(Scalar[types.BoolType]): ... +class UInt8Scalar(Scalar[types.UInt8Type]): ... +class Int8Scalar(Scalar[types.Int8Type]): ... +class UInt16Scalar(Scalar[types.UInt16Type]): ... +class Int16Scalar(Scalar[types.Int16Type]): ... +class UInt32Scalar(Scalar[types.Uint32Type]): ... +class Int32Scalar(Scalar[types.Int32Type]): ... +class UInt64Scalar(Scalar[types.UInt64Type]): ... +class Int64Scalar(Scalar[types.Int64Type]): ... +class HalfFloatScalar(Scalar[types.Float16Type]): ... +class FloatScalar(Scalar[types.Float32Type]): ... +class DoubleScalar(Scalar[types.Float64Type]): ... +class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... +class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... +class Date32Scalar(Scalar[types.Date32Type]): ... + +class Date64Scalar(Scalar[types.Date64Type]): + @property + def value(self) -> dt.date | None: ... + +class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): + @property + def value(self) -> dt.time | None: ... + +class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): + @property + def value(self) -> dt.time | None: ... + +class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): + @property + def value(self) -> int | None: ... + +class DurationScalar(Scalar[types.DurationType[_Unit]]): + @property + def value(self) -> dt.timedelta | None: ... + +class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): + @property + def value(self) -> MonthDayNano | None: ... + +class BinaryScalar(Scalar[types.BinaryType]): + def as_buffer(self) -> Buffer: ... + +class LargeBinaryScalar(Scalar[types.LargeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class StringScalar(Scalar[types.StringType]): + def as_buffer(self) -> Buffer: ... + +class LargeStringScalar(Scalar[types.LargeStringType]): + def as_buffer(self) -> Buffer: ... 
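+
+# A minimal usage sketch for the scalar stubs above, assuming only the public
+# ``pyarrow.scalar`` factory and the ``as_py`` / ``as_buffer`` methods declared
+# in this file; shown here purely to illustrate the intended type-checker
+# behaviour, not as additional API surface.
+#
+#     import pyarrow as pa
+#
+#     s = pa.scalar("hello")          # inferred as StringScalar
+#     buf = s.as_buffer()             # -> pyarrow.Buffer
+#     text: str = s.as_py()           # as_py() narrows to ``str``
+#     n: int = pa.scalar(1).as_py()   # Int64Scalar narrows to ``int``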
+ +class BinaryViewScalar(Scalar[types.BinaryViewType]): + def as_buffer(self) -> Buffer: ... + +class StringViewScalar(Scalar[types.StringViewType]): + def as_buffer(self) -> Buffer: ... + +class ListScalar(Scalar[types.ListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[str]: ... + def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + +class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], + ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], + ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... + +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): + @property + def index(self) -> Scalar[types._IndexT]: ... + @property + def value(self) -> Scalar[types._BasicValueT]: ... + @property + def dictionary(self) -> Array: ... + +class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): + @property + def value(self) -> tuple[int, types._BasicValueT] | None: ... + +class UnionScalar(Scalar[types.UnionType]): + @property + def value(self) -> Any | None: ... + @property + def type_code(self) -> str: ... + +class ExtensionScalar(Scalar[types.ExtensionType]): + @property + def value(self) -> Any | None: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: + """ + Construct ExtensionScalar from type and storage value. + + Parameters + ---------- + typ : DataType + The extension type for the result scalar. + value : object + The storage value for the result scalar. 
+ + Returns + ------- + ext_scalar : ExtensionScalar + """ + +class Bool8Scalar(Scalar[types.Bool8Type]): ... +class UuidScalar(Scalar[types.UuidType]): ... +class JsonScalar(Scalar[types.JsonType]): ... +class OpaqueScalar(Scalar[types.OpaqueType]): ... + +class FixedShapeTensorScalar(ExtensionScalar): + def to_numpy(self) -> np.ndarray: + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + +_V = TypeVar("_V") + +class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... + +@overload +def scalar( + value: str, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: bytes, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: bool, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: int, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: float, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Decimal, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: dt.datetime, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.date, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: dt.time, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.timedelta, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[Literal["us"]]: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Mapping[str, Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: NullableCollection[str], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.StringType]]: ... +@overload +def scalar( + value: NullableCollection[bytes], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BinaryType]]: ... 
+@overload +def scalar( + value: NullableCollection[bool], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BoolType]]: ... +@overload +def scalar( + value: NullableCollection[int], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Int64Type]]: ... +@overload +def scalar( + value: NullableCollection[float], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Float64Type]]: ... +@overload +def scalar( + value: NullableCollection[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.datetime], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.date], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Date32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.time], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.timedelta], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[MonthDayNano], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... +@overload +def scalar( + value: NullableCollection[Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[Any]: ... +@overload +def scalar( + value: Any, + type: types.NullType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> NullScalar: ... +@overload +def scalar( + value: Any, + type: types.BoolType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: Any, + type: types.UInt8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt8Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Uint32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int32Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt64Scalar: ... 
+@overload +def scalar( + value: Any, + type: types.Int64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: Any, + type: types.Float16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> HalfFloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Any, + type: types.Date32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Date64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date64Scalar: ... +@overload +def scalar( + value: Any, + type: types.MonthDayNanoIntervalType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Any, + type: types.StringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeStringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeStringScalar: ... +@overload +def scalar( + value: Any, + type: types.StringViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringViewScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeBinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryViewScalar: ... +@overload +def scalar( + value: Any, + type: types.TimestampType[types._Unit, types._Tz], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[types._Unit, types._Tz]: ... +@overload +def scalar( + value: Any, + type: types.Time32Type[types._Time32Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time32Scalar[types._Time32Unit]: ... +@overload +def scalar( + value: Any, + type: types.Time64Type[types._Time64Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[types._Time64Unit]: ... +@overload +def scalar( + value: Any, + type: types.DurationType[types._Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[types._Unit]: ... +@overload +def scalar( + value: Any, + type: types.Decimal32Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal32Scalar[types._Precision, types._Scale]: ... 
+@overload +def scalar( + value: Any, + type: types.Decimal64Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal64Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal128Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal256Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal256Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.ListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.ListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.FixedSizeListType[_DataTypeT, types._Size], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... +@overload +def scalar( + value: Any, + type: types.DictionaryType[types._IndexT, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.MapType[types._K, types._ValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MapScalar[types._K, types._ValueT]: ... +@overload +def scalar( + value: Any, + type: types.StructType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: Any, + type: types.UnionType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UnionScalar: ... +@overload +def scalar( + value: Any, + type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.Bool8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Bool8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UuidType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UuidScalar: ... +@overload +def scalar( + value: Any, + type: types.JsonType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> JsonScalar: ... +@overload +def scalar( + value: Any, + type: types.OpaqueType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> OpaqueScalar: ... 
+@overload +def scalar( + value: Any, + type: _DataTypeT, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT]: ... +def scalar(*args, **kwargs): + """ + Create a pyarrow.Scalar instance from a Python object. + + Parameters + ---------- + value : Any + Python object coercible to arrow's type system. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the value. + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. Defaults to False if not passed explicitly by user, + or True if a pandas object is passed in. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + scalar : pyarrow.Scalar + + Examples + -------- + >>> import pyarrow as pa + + >>> pa.scalar(42) + + + >>> pa.scalar("string") + + + >>> pa.scalar([1, 2]) + + + >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) + + """ + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi new file mode 100644 index 00000000000..ad9d0392137 --- /dev/null +++ b/python/stubs/__lib_pxi/table.pyi @@ -0,0 +1,5609 @@ +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import ( + Any, + Collection, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema + +from . 
import array, scalar, types +from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from .ipc import RecordBatchReader +from .scalar import Int64Scalar, Scalar +from .tensor import Tensor +from .types import _AsPyType, _BasicDataType, _DataTypeT + +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + An array-like composed from a (possibly empty) collection of pyarrow.Arrays + + Warnings + -------- + Do not call this class's constructor directly. + + Examples + -------- + To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: + + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) + True + """ + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: + """ + Return data type of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + """ + def length(self) -> int: + """ + Return length of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.length() + 6 + """ + __len__ = length + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the ChunkedArray + + Parameters + ---------- + indent : int + How much to indent right the content of the array, + by default ``0``. + window : int + How many items to preview within each chunk at the begin and end + of the chunk when the chunk is bigger than the window. + The other elements will be ellipsed. + container_window : int + How many chunks to preview at the begin and end + of the array when the array is bigger than the window. + The other elements will be ellipsed. + This setting also applies to list columns. 
+ skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_string(skip_new_lines=True) + '[[2,2,4],[4,5,100]]' + """ + format = to_string + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + @property + def null_count(self) -> int: + """ + Number of null entries + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.null_count + 1 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the chunked array. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.nbytes + 49 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the chunked array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.get_total_buffer_size() + 49 + """ + def __sizeof__(self) -> int: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or ChunkedArray (slice) + """ + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_null() + + [ + [ + false, + false, + false, + false, + true, + false + ] + ] + """ + def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the NaN values. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) + >>> arr.is_nan() + + [ + [ + false, + true, + false, + false, + null, + false + ] + ] + """ + def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the non-null values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_valid() + + [ + [ + true, + true, + true + ], + [ + true, + false, + true + ] + ] + """ + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + """ + Replace each null element in values with fill_value. + + See :func:`pyarrow.compute.fill_null` for full usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. + + Examples + -------- + >>> import pyarrow as pa + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.fill_null(fill_value) + + [ + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + ] + """ + def equals(self, other: Self) -> bool: + """ + Return whether the contents of two chunked arrays are equal. + + Parameters + ---------- + other : pyarrow.ChunkedArray + Chunked array to compare against. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> n_legs.equals(n_legs) + True + >>> n_legs.equals(animals) + False + """ + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + """ + Return a NumPy copy of this array (experimental). + + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. + + Returns + ------- + array : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_numpy() + array([ 2, 2, 4, 4, 5, 100]) + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + @overload + def cast( + self, + target_type: None = None, + safe: bool | None = None, + options: CastOptions | None = None, + ) -> Self: ... + @overload + def cast( + self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None + ) -> ChunkedArray[Scalar[_CastAs]]: ... + def cast(self, *args, **kwargs): + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
+ options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + + Change the data type of an array: + + >>> n_legs_seconds = n_legs.cast(pa.duration("s")) + >>> n_legs_seconds.type + DurationType(duration[s]) + """ + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : ChunkedArray + A dictionary-encoded version of this array. + + Examples + -------- + >>> import pyarrow as pa + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> animals.dictionary_encode() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : list of ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> c_arr = pa.chunked_array(n_legs.value_counts()) + >>> c_arr + + [ + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + ] + >>> c_arr.flatten() + [ + [ + [ + 2, + 4, + 5, + 100 + ] + ], + [ + [ + 2, + 2, + 1, + 1 + ] + ]] + >>> c_arr.type + StructType(struct) + >>> n_legs.type + DataType(int64) + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: + """ + Flatten this ChunkedArray into a single non-chunked array. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.combine_chunks() + + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + """ + def unique(self) -> ChunkedArray[_Scalar_co]: + """ + Compute distinct elements in array + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.unique() + + [ + 2, + 4, + 5, + 100 + ] + """ + def value_counts(self) -> StructArray: + """ + Compute counts of unique elements in array. 
+ + Returns + ------- + An array of structs + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.value_counts() + + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this ChunkedArray + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.slice(2, 2) + + [ + [ + 4 + ], + [ + 4 + ] + ] + """ + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> mask = pa.array([True, False, None, True, False, True]) + >>> n_legs.filter(mask) + + [ + [ + 2 + ], + [ + 4, + 100 + ] + ] + >>> n_legs.filter(mask, null_selection_behavior="emit_null") + + [ + [ + 2, + null + ], + [ + 4, + 100 + ] + ] + """ + @overload + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + @overload + def index( + self, + value: Scalar[_DataTypeT], + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.index(4) + + >>> n_legs.index(4, start=3) + + """ + def take(self, indices: Indices) -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. 
+ + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.take([1, 4, 5]) + + [ + [ + 2, + 5, + 100 + ] + ] + """ + def drop_null(self) -> Self: + """ + Remove missing values from a chunked array. + See :func:`pyarrow.compute.drop_null` for full description. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.drop_null() + + [ + [ + 2, + 2 + ], + [ + 4, + 5, + 100 + ] + ] + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the ChunkedArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : ChunkedArray + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent chunked array, but where all + chunks share the same dictionary values. Dictionary indices are + transposed accordingly. + + If there are no dictionaries in the chunked array, it is returned + unchanged. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> c_arr + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ] + ] + >>> c_arr.unify_dictionaries() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + @property + def num_chunks(self) -> int: + """ + Number of underlying chunks. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.num_chunks + 2 + """ + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: + """ + Select a chunk by its index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.chunk(1) + + [ + 4, + 5, + 100 + ] + """ + @property + def chunks(self) -> list[Array[_Scalar_co]]: + """ + Convert to a list of single-chunked arrays. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.chunks + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ]] + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.NullScalar], + ) -> Generator[array.NullArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BooleanScalar], + ) -> Generator[array.BooleanArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt8Scalar], + ) -> Generator[array.UInt8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int8Scalar], + ) -> Generator[array.Int8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt16Scalar], + ) -> Generator[array.UInt16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int16Scalar], + ) -> Generator[array.Int16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt32Scalar], + ) -> Generator[array.UInt32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int32Scalar], + ) -> Generator[array.Int32Array, None, None]: + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt64Scalar], + ) -> Generator[array.UInt64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int64Scalar], + ) -> Generator[array.Int64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.HalfFloatScalar], + ) -> Generator[array.HalfFloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FloatScalar], + ) -> Generator[array.FloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DoubleScalar], + ) -> Generator[array.DoubleArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal32Scalar], + ) -> Generator[array.Decimal32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal64Scalar], + ) -> Generator[array.Decimal64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal128Scalar], + ) -> Generator[array.Decimal128Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal256Scalar], + ) -> Generator[array.Decimal256Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date32Scalar], + ) -> Generator[array.Date32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date64Scalar], + ) -> Generator[array.Date64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], + ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], + ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DurationScalar[types._Unit]], + ) -> Generator[array.DurationArray[types._Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], + ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... 
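+    # Note: the remaining iterchunks overloads follow the same pattern for the
+    # binary, string, view, nested, dictionary, and extension scalar types,
+    # each yielding the corresponding concrete Array subclass.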
+ @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryScalar], + ) -> Generator[array.BinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeBinaryScalar], + ) -> Generator[array.LargeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeBinaryScalar], + ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringScalar], + ) -> Generator[array.StringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeStringScalar], + ) -> Generator[array.LargeStringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryViewScalar], + ) -> Generator[array.BinaryViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringViewScalar], + ) -> Generator[array.StringViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.ListScalar[_DataTypeT]], + ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], + ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], + ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], + ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StructScalar], + ) -> Generator[array.StructArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], + ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], + ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.RunEndEncodedScalar], + ) -> Generator[array.RunEndEncodedArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UnionScalar], + ) -> Generator[array.UnionArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Bool8Scalar], + ) -> Generator[array.Bool8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UuidScalar], + ) -> Generator[array.UuidArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.JsonScalar], + ) -> Generator[array.JsonArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.OpaqueScalar], + ) -> Generator[array.OpaqueArray, None, None]: ... + def iterchunks(self): + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. 
+ The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.to_pylist() + [2, 2, 4, 4, None, 100] + """ + def __arrow_c_stream__(self, requested_schema=None) -> Any: + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ + @property + def is_cpu(self) -> bool: + """ + Whether all chunks in the ChunkedArray are CPU-accessible. + """ + +@overload +def chunked_array( + values: Iterable[NullableCollection[bool]], + type: None = None, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[int]], + type: None = None, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[float]], + type: None = None, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[Decimal]], + type: None = None, +) -> ChunkedArray[scalar.Decimal128Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dict[str, Any]]], + type: None = None, +) -> ChunkedArray[scalar.StructScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.datetime]], + type: None = None, +) -> ChunkedArray[scalar.TimestampScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.date]], + type: None = None, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.time]], + type: None = None, +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.timedelta]], + type: None = None, +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[MonthDayNano]], + type: None = None, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[str]], + type: None = None, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[bytes]], + type: None = None, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[list[Any]]], + type: None = None, +) -> ChunkedArray[scalar.ListScalar[Any]]: ... 
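+# The overloads above infer the element type from the Python values when ``type``
+# is omitted; the overloads below resolve an explicit ``type`` given as a string
+# alias or a DataType instance. Illustrative sketch (not part of the stub):
+#   pa.chunked_array([[1, 2], [3]])            -> ChunkedArray[Int64Scalar]
+#   pa.chunked_array([[1.5, 2.5]], type="f4")  -> ChunkedArray[FloatScalar]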
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["null"] | types.NullType, +) -> ChunkedArray[scalar.NullScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["bool", "boolean"] | types.BoolType, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i1", "int8"] | types.Int8Type, +) -> ChunkedArray[scalar.Int8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i2", "int16"] | types.Int16Type, +) -> ChunkedArray[scalar.Int16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i4", "int32"] | types.Int32Type, +) -> ChunkedArray[scalar.Int32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i8", "int64"] | types.Int64Type, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> ChunkedArray[scalar.UInt8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> ChunkedArray[scalar.UInt16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> ChunkedArray[scalar.UInt32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> ChunkedArray[scalar.UInt64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> ChunkedArray[scalar.HalfFloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> ChunkedArray[scalar.FloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string", "str", "utf8"] | types.StringType, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary"] | types.BinaryType, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> ChunkedArray[scalar.LargeStringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_binary"] | types.LargeBinaryType, +) -> ChunkedArray[scalar.LargeBinaryScalar]: ... 
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary_view"] | types.BinaryViewType, +) -> ChunkedArray[scalar.BinaryViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string_view"] | types.StringViewType, +) -> ChunkedArray[scalar.StringViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> ChunkedArray[scalar.Date64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... 
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[Array[_ScalarT]], + type: None = None, +) -> ChunkedArray[_ScalarT]: ... +def chunked_array(value, type=None): + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : Array, list of Array, or array-like + Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + """ + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + @overload + def __getitem__(self, key: int | str) -> _ColumnT: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return column at given index or column name + + Parameters + ---------- + key : integer, str, or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + Array (from RecordBatch) or ChunkedArray (from Table) for column input. + RecordBatch or Table for slice input. + """ + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: + """ + Select single column from Table or RecordBatch. + + Parameters + ---------- + i : int or string + The index or name of the column to retrieve. + + Returns + ------- + column : Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... 
} + ... ) + >>> table = pa.Table.from_pandas(df) + + Select a column by numeric index: + + >>> table.column(0) + + [ + [ + 2, + 4, + 5, + 100 + ] + ] + + Select a column by its name: + + >>> table.column("animals") + + [ + [ + "Flamingo", + "Horse", + "Brittle stars", + "Centipede" + ] + ] + """ + @property + def column_names(self) -> list[str]: + """ + Names of the Table or RecordBatch columns. + + Returns + ------- + list of str + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=["n_legs", "animals"], + ... ) + >>> table.column_names + ['n_legs', 'animals'] + """ + @property + def columns(self) -> list[_ColumnT]: + """ + List of all columns in numerical order. + + Returns + ------- + columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.columns + [ + [ + [ + null, + 4, + 5, + null + ] + ], + [ + [ + "Flamingo", + "Horse", + null, + "Centipede" + ] + ]] + """ + def drop_null(self) -> Self: + """ + Remove rows that contain missing values from a Table or RecordBatch. + + See :func:`pyarrow.compute.drop_null` for full usage. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, with rows containing + no missing values. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [None, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", None, "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.drop_null() + pyarrow.Table + year: double + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def field(self, i: int | str) -> Field: + """ + Select a schema field by its column name or numeric index. + + Parameters + ---------- + i : int or string + The index or name of the field to retrieve. + + Returns + ------- + Field + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.field(0) + pyarrow.Field + >>> table.field(1) + pyarrow.Field + """ + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list | np.ndarray], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from Arrow arrays or columns. + + Parameters + ---------- + mapping : dict or Mapping + A mapping of strings to Arrays or Python lists. + schema : Schema, default None + If not passed, will be inferred from the Mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> pydict = {"n_legs": n_legs, "animals": animals} + + Construct a Table from a dictionary of arrays: + + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a dictionary of arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pydict(pydict, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from list of rows / dictionaries. + + Parameters + ---------- + mapping : list of dicts of rows + A mapping of strings to row values. + schema : Schema, default None + If not passed, will be inferred from the first row of the + mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + + Construct a Table from a list of rows: + + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4]] + animals: [["Flamingo","Dog"]] + + Construct a Table from a list of rows with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def itercolumns(self) -> Generator[_ColumnT, None, None]: + """ + Iterator over all columns in their numerical order. + + Yields + ------ + Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> for i in table.itercolumns(): + ... print(i.null_count) + 2 + 1 + """ + @property + def num_columns(self) -> int: ... 
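+    # num_columns and num_rows carry no docstring on this shared base class;
+    # the concrete subclasses document them fully (see RecordBatch below).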
+ @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: + """ + Dimensions of the table or record batch: (#rows, #columns). + + Returns + ------- + (int, int) + Number of rows and number of columns. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table.shape + (4, 2) + """ + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: + """ + Sort the Table or RecordBatch by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + Table or RecordBatch + A new tabular object sorted according to the sort keys. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.sort_by("animal") + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,2021,2021,2020,2022,2022]] + n_legs: [[5,100,4,2,4,2]] + animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] + """ + def take(self, indices: Indices) -> Self: + """ + Select rows from a Table or RecordBatch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the tabular object whose rows will be returned. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, containing the taken rows. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.take([1, 3]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def filter( + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: + """ + Select rows from the table or record batch based on a boolean mask. + + The Table can be filtered based on a mask, which will be passed to + :func:`pyarrow.compute.filter` to perform the filtering, or it can + be filtered through a boolean :class:`.Expression` + + Parameters + ---------- + mask : Array or array-like or .Expression + The boolean mask or the :class:`.Expression` to filter the table with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled, does nothing if + an :class:`.Expression` is used. 
+ + Returns + ------- + filtered : Table or RecordBatch + A tabular object of the same schema, with only the rows selected + by applied filtering + + Examples + -------- + Using a Table (works similarly for RecordBatch): + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Define an expression and select rows: + + >>> import pyarrow.compute as pc + >>> expr = pc.field("year") <= 2020 + >>> table.filter(expr) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2019]] + n_legs: [[2,5]] + animals: [["Flamingo","Brittle stars"]] + + Define a mask and select rows: + + >>> mask = [True, True, False, None] + >>> table.filter(mask) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022]] + n_legs: [[2,4]] + animals: [["Flamingo","Horse"]] + >>> table.filter(mask, null_selection_behavior="emit_null") + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,null]] + n_legs: [[2,4,null]] + animals: [["Flamingo","Horse",null]] + """ + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list]: + """ + Convert the Table or RecordBatch to a dict or OrderedDict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + dict + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> table.to_pydict() + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + """ + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: + """ + Convert the Table or RecordBatch to a list of rows / dictionaries. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. 
+ + Returns + ------- + list + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] + >>> table = pa.table(data, names=["n_legs", "animals"]) + >>> table.to_pylist() + [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... + """ + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: + """ + Return human-readable string representation of Table or RecordBatch. + + Parameters + ---------- + show_metadata : bool, default False + Display Field-level and Schema-level KeyValueMetadata. + preview_cols : int, default 0 + Display values of the columns for the first N columns. + + Returns + ------- + str + """ + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new Table or RecordBatch. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Raises + ------ + KeyError + If any of the passed column names do not exist. + + Returns + ------- + Table or RecordBatch + A tabular object without the column(s). + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Drop one column: + + >>> table.drop_columns("animals") + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + + Drop one or more columns: + + >>> table.drop_columns(["n_legs", "animals"]) + pyarrow.Table + ... + ---- + """ + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + def append_column(self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list) -> Self: + """ + Append column at end of columns. + + Parameters + ---------- + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + Table or RecordBatch + New table or record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Append column at the end: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.append_column("year", [year]) + pyarrow.Table + n_legs: int64 + animals: string + year: int64 + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + year: [[2021,2022,2019,2021]] + """ + +class RecordBatch(_Tabular[Array]): + """ + Batch of rows of columns of equal length + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatch.from_*`` functions instead. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Constructing a RecordBatch from arrays: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Constructing a RecordBatch from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_pandas(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Constructing a RecordBatch from pylist: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + >>> pa.RecordBatch.from_pylist(pylist).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Dog + + You can also construct a RecordBatch using :func:`pyarrow.record_batch`: + + >>> pa.record_batch([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of record batch by replacing schema + key-value metadata with the indicated new metadata (which may be None, + which deletes any existing metadata + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + shallow_copy : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + + Constructing a RecordBatch with schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) + >>> batch.schema + n_legs: int64 + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Shallow copy of a RecordBatch with deleted schema metadata: + + >>> batch.replace_schema_metadata().schema + n_legs: int64 + """ + @property + def num_columns(self) -> int: + """ + Number of columns + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_columns + 2 + """ + + @property + def num_rows(self) -> int: + """ + Number of rows + + Due to the definition of a RecordBatch, all columns have the same + number of rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_rows + 6 + """ + @property + def schema(self) -> Schema: + """ + Schema of the RecordBatch and its columns + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.schema + n_legs: int64 + animals: string + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the record batch. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.nbytes + 116 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the record batch + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.get_total_buffer_size() + 120 + """ + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to RecordBatch at position i. 
+ + A new record batch is returned with the column added, the original record batch + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.add_column(0, "year", year) + pyarrow.RecordBatch + year: int64 + n_legs: int64 + animals: string + ---- + year: [2021,2022,2019,2021] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Original record batch is left unchanged: + + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def remove_column(self, i: int) -> Self: + """ + Create new RecordBatch with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New record batch without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.remove_column(1) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: + """ + Replace column in RecordBatch at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.set_column(1, "year", year) + pyarrow.RecordBatch + n_legs: int64 + year: int64 + ---- + n_legs: [2,4,5,100] + year: [2021,2022,2019,2021] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new record batch with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. 
+ + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> new_names = ["n", "name"] + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write RecordBatch to Buffer as encapsulated IPC message, which does not + include a Schema. + + To reconstruct a RecordBatch from the encapsulated IPC message Buffer + returned by this function, a Schema must be passed separately. See + Examples. + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> buf = batch.serialize() + >>> buf + + + Reconstruct RecordBatch from IPC message Buffer and original Schema + + >>> pa.ipc.read_record_batch(buf, batch.schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of record batch to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> batch.slice(offset=3).to_pandas() + n_legs animals + 0 4 Horse + 1 5 Brittle stars + 2 100 Centipede + >>> batch.slice(length=2).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + >>> batch.slice(offset=3, length=1).to_pandas() + n_legs animals + 0 4 Horse + """ + def equals(self, other: Self, check_metadata: bool = False) -> bool: + """ + Check if contents of two record batches are equal. + + Parameters + ---------- + other : pyarrow.RecordBatch + RecordBatch to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... 
)
+        >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"])
+        >>> batch_0 = pa.record_batch([])
+        >>> batch_1 = pa.RecordBatch.from_arrays(
+        ...     [n_legs, animals],
+        ...     names=["n_legs", "animals"],
+        ...     metadata={"n_legs": "Number of legs per animal"},
+        ... )
+        >>> batch.equals(batch)
+        True
+        >>> batch.equals(batch_0)
+        False
+        >>> batch.equals(batch_1)
+        True
+        >>> batch.equals(batch_1, check_metadata=True)
+        False
+        """
+    def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self:
+        """
+        Select columns of the RecordBatch.
+
+        Returns a new RecordBatch with the specified columns, and metadata
+        preserved.
+
+        Parameters
+        ----------
+        columns : list-like
+            The column names or integer indices to select.
+
+        Returns
+        -------
+        RecordBatch
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
+        >>> animals = pa.array(
+        ...     ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
+        ... )
+        >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"])
+
+        Select columns by indices:
+
+        >>> batch.select([1])
+        pyarrow.RecordBatch
+        animals: string
+        ----
+        animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]
+
+        Select columns by names:
+
+        >>> batch.select(["n_legs"])
+        pyarrow.RecordBatch
+        n_legs: int64
+        ----
+        n_legs: [2,2,4,4,5,100]
+        """
+    def cast(
+        self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None
+    ) -> Self:
+        """
+        Cast record batch values to another schema.
+
+        Parameters
+        ----------
+        target_schema : Schema
+            Schema to cast to, the names and order of fields must match.
+        safe : bool, default True
+            Check for overflows or other unsafe conversions.
+        options : CastOptions, default None
+            Additional checks passed by CastOptions
+
+        Returns
+        -------
+        RecordBatch
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "n_legs": [2, 4, 5, 100],
+        ...         "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"],
+        ...     }
+        ... )
+        >>> batch = pa.RecordBatch.from_pandas(df)
+        >>> batch.schema
+        n_legs: int64
+        animals: string
+        -- schema metadata --
+        pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ...
+
+        Define new schema and cast batch values:
+
+        >>> my_schema = pa.schema(
+        ...     [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())]
+        ... )
+        >>> batch.cast(target_schema=my_schema)
+        pyarrow.RecordBatch
+        n_legs: duration[s]
+        animals: string
+        ----
+        n_legs: [2,4,5,100]
+        animals: ["Flamingo","Horse","Brittle stars","Centipede"]
+        """
+    @classmethod
+    def from_arrays(
+        cls,
+        arrays: Collection[Array],
+        names: list[str] | None = None,
+        schema: Schema | None = None,
+        metadata: Mapping | None = None,
+    ) -> Self:
+        """
+        Construct a RecordBatch from multiple pyarrow.Arrays
+
+        Parameters
+        ----------
+        arrays : list of pyarrow.Array
+            One for each field in RecordBatch
+        names : list of str, optional
+            Names for the batch fields. If not passed, schema must be passed
+        schema : Schema, default None
+            Schema for the created batch. If not passed, names must be passed
+        metadata : dict or Mapping, default None
+            Optional metadata for the schema (if inferred).
+
+        Returns
+        -------
+        pyarrow.RecordBatch
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
+        >>> animals = pa.array(
+        ...     
["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from pyarrow Arrays using names: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Construct a RecordBatch from pyarrow Arrays using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the RecordBatch. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``RecordBatch``. The default of None will store the index as a + column, except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + + Returns + ------- + pyarrow.RecordBatch + + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Convert pandas DataFrame to RecordBatch: + + >>> import pyarrow as pa + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> pa.RecordBatch.from_pandas(df, schema=my_schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch specifying columns: + + >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a RecordBatch from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``RecordBatch``. + + Parameters + ---------- + struct_array : StructArray + Array to construct the record batch from. + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.RecordBatch.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array(self) -> StructArray: + """ + Convert to a struct array. + """ + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: + """ + Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` + set to ``True``. In this case null values are converted to ``NaN`` and integer type + arrays are promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], + ... names=["a", "b"], + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + Convert a RecordBatch to row-major Tensor with null values + written as ``NaN``s + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a RecordBatch to column-major Tensor + + >>> batch.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. 
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the batch to this schema. + If None, the batch will be returned as-is, with a schema matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the batch as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema + and ArrowArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowDeviceArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. 
PyArrow will attempt to cast the batch to this data type. + If None, the batch will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a + C ArrowSchema and ArrowDeviceArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowDeviceArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the arrays in the RecordBatch reside. + + Returns + ------- + DeviceAllocationType + """ + @property + def is_cpu(self) -> bool: + """ + Whether the RecordBatch's arrays are CPU-accessible. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Copy the entire RecordBatch to destination device. + + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + +class Table(_Tabular[ChunkedArray[Any]]): + """ + A collection of top-level named, equal length Arrow arrays. + + Warnings + -------- + Do not call this class's constructor directly, use one of the ``from_*`` + methods instead. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a RecordBatch: + + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> pa.Table.from_pandas(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a dictionary of arrays: + + >>> pydict = {"n_legs": n_legs, "animals": animals} + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,null]] + animals: [["Flamingo","Centipede"]] + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [ + ... pa.field("year", pa.int64()), + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... ], + ... metadata={"year": "Year of entry"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + year: int64 + n_legs: int64 + animals: string + -- schema metadata -- + year: 'Year of entry' + + Construct a Table with :func:`pyarrow.table`: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + + def validate(self, *, full=False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def slice(self, offset=0, length=None) -> Self: + """ + Compute zero-copy slice of this Table. + + Parameters + ---------- + offset : int, default 0 + Offset from start of table to slice. + length : int, default None + Length of slice (default is until end of table starting from + offset). + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> table.slice(length=3) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019]] + n_legs: [[2,4,5]] + animals: [["Flamingo","Horse","Brittle stars"]] + >>> table.slice(offset=2) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019,2021]] + n_legs: [[5,100]] + animals: [["Brittle stars","Centipede"]] + >>> table.slice(offset=2, length=1) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019]] + n_legs: [[5]] + animals: [["Brittle stars"]] + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the Table. + + Returns a new Table with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.select([0, 1]) + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + >>> table.select(["year"]) + pyarrow.Table + year: int64 + ---- + year: [[2020,2022,2019,2021]] + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be None), + which deletes any existing metadata. + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Constructing a Table with pyarrow schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> table = pa.table(df, my_schema) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + + Create a shallow copy of a Table with deleted schema metadata: + + >>> table.replace_schema_metadata().schema + n_legs: int64 + animals: string + + Create a shallow copy of a Table with new schema metadata: + + >>> metadata = {"animals": "Which animal"} + >>> table.replace_schema_metadata(metadata=metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Which animal' + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Flatten this Table. + + Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> month = pa.array([4, 6]) + >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) + >>> table + pyarrow.Table + a: struct + child 0, animals: string + child 1, n_legs: int64 + child 2, year: int64 + month: int64 + ---- + a: [ + -- is_valid: all not null + -- child 0 type: string + ["Parrot",null] + -- child 1 type: int64 + [2,4] + -- child 2 type: int64 + [null,2022]] + month: [[4,6]] + + Flatten the columns with struct field: + + >>> table.flatten() + pyarrow.Table + a.animals: string + a.n_legs: int64 + a.year: int64 + month: int64 + ---- + a.animals: [["Parrot",null]] + a.n_legs: [[2,4]] + a.year: [[null,2022]] + month: [[4,6]] + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + >>> table.combine_chunks() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4,4,5,100]] + animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent table, but where all chunks of + each column share the same dictionary values. Dictionary indices + are transposed accordingly. + + Columns without dictionaries are returned unchanged. 
+
+        Parameters
+        ----------
+        memory_pool : MemoryPool, default None
+            For memory allocations, if required, otherwise use default pool
+
+        Returns
+        -------
+        Table
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode()
+        >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode()
+        >>> c_arr = pa.chunked_array([arr_1, arr_2])
+        >>> table = pa.table([c_arr], names=["animals"])
+        >>> table
+        pyarrow.Table
+        animals: dictionary
+        ----
+        animals: [ -- dictionary:
+        ["Flamingo","Parrot","Dog"] -- indices:
+        [0,1,2], -- dictionary:
+        ["Horse","Brittle stars","Centipede"] -- indices:
+        [0,1,2]]
+
+        Unify dictionaries across both chunks:
+
+        >>> table.unify_dictionaries()
+        pyarrow.Table
+        animals: dictionary
+        ----
+        animals: [ -- dictionary:
+        ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices:
+        [0,1,2], -- dictionary:
+        ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices:
+        [3,4,5]]
+        """
+    def equals(self, other: Self, check_metadata: bool = False) -> bool:
+        """
+        Check if contents of two tables are equal.
+
+        Parameters
+        ----------
+        other : pyarrow.Table
+            Table to compare against.
+        check_metadata : bool, default False
+            Whether schema metadata equality should be checked as well.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
+        >>> animals = pa.array(
+        ...     ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
+        ... )
+        >>> names = ["n_legs", "animals"]
+        >>> table = pa.Table.from_arrays([n_legs, animals], names=names)
+        >>> table_0 = pa.Table.from_arrays([])
+        >>> table_1 = pa.Table.from_arrays(
+        ...     [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"}
+        ... )
+        >>> table.equals(table)
+        True
+        >>> table.equals(table_0)
+        False
+        >>> table.equals(table_1)
+        True
+        >>> table.equals(table_1, check_metadata=True)
+        False
+        """
+    def cast(
+        self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None
+    ) -> Self:
+        """
+        Cast table values to another schema.
+
+        Parameters
+        ----------
+        target_schema : Schema
+            Schema to cast to, the names and order of fields must match.
+        safe : bool, default True
+            Check for overflows or other unsafe conversions.
+        options : CastOptions, default None
+            Additional checks passed by CastOptions
+
+        Returns
+        -------
+        Table
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "n_legs": [2, 4, 5, 100],
+        ...         "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"],
+        ...     }
+        ... )
+        >>> table = pa.Table.from_pandas(df)
+        >>> table.schema
+        n_legs: int64
+        animals: string
+        -- schema metadata --
+        pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ...
+
+        Define new schema and cast table values:
+
+        >>> my_schema = pa.schema(
+        ...     [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())]
+        ... )
+        >>> table.cast(target_schema=my_schema)
+        pyarrow.Table
+        n_legs: duration[s]
+        animals: string
+        ----
+        n_legs: [[2,4,5,100]]
+        animals: [["Flamingo","Horse","Brittle stars","Centipede"]]
+        """
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        schema: Schema | None = None,
+        preserve_index: bool | None = None,
+        nthreads: int | None = None,
+        columns: list[str] | None = None,
+        safe: bool = True,
+    ) -> Self:
+        """
+        Convert pandas.DataFrame to an Arrow Table.
+ + The column types in the resulting Arrow Table are inferred from the + dtypes of the pandas.Series in the DataFrame. In the case of non-object + Series, the NumPy dtype is translated to its Arrow equivalent. In the + case of `object`, we need to guess the datatype by looking at the + Python objects in this Series. + + Be aware that Series of the `object` dtype don't carry enough + information to always lead to a meaningful Arrow type. In the case that + we cannot infer a type, e.g. because the DataFrame is of length 0 or + the Series only contains None/nan objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``Table``. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + safe : bool, default True + Check for overflows or other unsafe conversions. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[ArrayOrChunkedArray[Any]], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table from Arrow arrays. + + Parameters + ---------- + arrays : list of pyarrow.Array or pyarrow.ChunkedArray + Equal-length arrays that should form the table. + names : list of str, optional + Names for the table columns. If not passed, schema must be passed. + schema : Schema, default None + Schema for the created table. If not passed, names must be passed. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"animals": "Name of the animal species"}, + ... ) + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Name of the animal species' + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a Table from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``Table``. + + Parameters + ---------- + struct_array : StructArray or ChunkedArray + Array to construct the table from. + + Returns + ------- + pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.Table.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[scalar.StructScalar]: + """ + Convert to a chunked array of struct type. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for ChunkedArray chunks. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + ChunkedArray + """ + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: + """ + Construct a Table from a sequence or iterator of Arrow RecordBatches. + + Parameters + ---------- + batches : sequence or iterator of RecordBatch + Sequence of RecordBatch to be converted, all schemas must be equal. + schema : Schema, default None + If not passed, will be inferred from the first RecordBatch. 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Construct a Table from a RecordBatch: + + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a sequence of RecordBatches: + + >>> pa.Table.from_batches([batch, batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: + """ + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + list[RecordBatch] + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatch: + + >>> table.to_batches()[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Convert a Table to a list of RecordBatches: + + >>> table.to_batches(max_chunksize=2)[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + >>> table.to_batches(max_chunksize=2)[1].to_pandas() + n_legs animals + 0 5 Brittle stars + 1 100 Centipede + """ + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + RecordBatchReader + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatchReader: + + >>> table.to_reader() + + + >>> reader = table.to_reader() + >>> reader.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + >>> reader.read_all() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @property + def schema(self) -> Schema: + """ + Schema of the table and its columns. + + Returns + ------- + Schema + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... + """ + @property + def num_columns(self) -> int: + """ + Number of columns in this table. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_columns + 2 + """ + @property + def num_rows(self) -> int: + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_rows + 4 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the table. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.nbytes + 72 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the table. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.get_total_buffer_size() + 76 + """ + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.add_column(0, "year", [year]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2021,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Original table is left unchanged: + + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def remove_column(self, i: int) -> Self: + """ + Create new Table with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New table without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.remove_column(1) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + """ + def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Replace column in Table at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.set_column(1, "year", [year]) + pyarrow.Table + n_legs: int64 + year: int64 + ---- + n_legs: [[2,4,5,100]] + year: [[2021,2022,2019,2021]] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new table with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> new_names = ["n", "name"] + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def drop(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new table. + + Alias of Table.drop_columns, but kept for backwards compatibility. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Returns + ------- + Table + New table without the column(s). + """ + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: + """ + Declare a grouping over the columns of the table. + + Resulting grouping can then be used to perform aggregations + with a subsequent ``aggregate()`` method. + + Parameters + ---------- + keys : str or list[str] + Name of the columns that should be used as the grouping key. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the + default), no stable ordering of the output is guaranteed. + + Returns + ------- + TableGroupBy + + See Also + -------- + TableGroupBy.aggregate + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.group_by("year").aggregate([("n_legs", "sum")]) + pyarrow.Table + year: int64 + n_legs_sum: int64 + ---- + year: [[2020,2022,2021,2019]] + n_legs_sum: [[2,6,104,5]] + """ + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: + """ + Perform a join between this table and another one. + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + keys : str or list[str] + The columns from current table that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right tables have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right tables have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. 
+ use_threads : bool, default True + Whether to use multithreading or not. + + Returns + ------- + Table + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) + >>> df2 = pd.DataFrame( + ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} + ... ) + >>> t1 = pa.Table.from_pandas(df1) + >>> t2 = pa.Table.from_pandas(df2) + + Left outer join: + + >>> t1.join(t2, "id").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2]] + year: [[2019,2020,2022]] + n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] + + Full outer join: + + >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2,4]] + year: [[2019,2020,2022,null]] + n_legs: [[5,null,null,100]] + animal: [["Brittle stars",null,null,"Centipede"]] + + Right outer join: + + >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") + pyarrow.Table + year: int64 + id: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,null]] + id: [[3,4]] + n_legs: [[5,100]] + animal: [["Brittle stars","Centipede"]] + + Right anti join + + >>> t1.join(t2, "id", join_type="right anti") + pyarrow.Table + id: int64 + n_legs: int64 + animal: string + ---- + id: [[4]] + n_legs: [[100]] + animal: [["Centipede"]] + """ + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: + """ + Perform an asof join between this table and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both tables must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + on : str + The column from current table that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current table that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row ``right.on - left.on <= tolerance``. The + ``tolerance`` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_table that should be used as the on key + on the join operation right side. 
+ When ``None`` use the same key name as the left table. + right_by : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + + Returns + ------- + Table + + Example + -------- + >>> import pyarrow as pa + >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) + >>> t2 = pa.table( + ... { + ... "id": [3, 4], + ... "year": [2020, 2021], + ... "n_legs": [5, 100], + ... "animal": ["Brittle stars", "Centipede"], + ... } + ... ) + + >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[1,3,2,3,3]] + year: [[2020,2021,2022,2022,2023]] + n_legs: [[null,5,null,5,null]] + animal: [[null,"Brittle stars",null,"Brittle stars",null]] + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the table as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @property + def is_cpu(self) -> bool: + """ + Whether all ChunkedArrays are CPU-accessible. + """ + +def record_batch( + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, +) -> RecordBatch: + """ + Create a pyarrow.RecordBatch from another Python data structure or sequence + of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of Arrays, + a pandas DataFame, or any tabular object implementing the + Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or + ``__arrow_c_device_array__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the RecordBatch. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). 
+ + Returns + ------- + RecordBatch + + See Also + -------- + RecordBatch.from_arrays, RecordBatch.from_pandas, table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from a python dictionary: + + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Creating a RecordBatch from a list of arrays with names: + + >>> pa.record_batch([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Creating a RecordBatch from a list of arrays with names and metadata: + + >>> my_metadata = {"n_legs": "How many legs does an animal have?"} + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'How many legs does an animal have?' + + Creating a RecordBatch from a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + >>> pa.record_batch(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Creating a RecordBatch from a pandas DataFrame with schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.record_batch(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + >>> pa.record_batch(df, my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + """ + +@overload +def table( + data: dict[str, list[Any] | Array[Any]], + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, + nthreads: int | None = None, +) -> Table: ... +@overload +def table( + data: Collection[ArrayOrChunkedArray[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, + nthreads: int | None = None, +) -> Table: ... 
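+# NOTE: a minimal, hypothetical usage sketch for the two overloads above (not part of
+# the upstream pyarrow docstrings); it uses only public pyarrow calls known to exist
+# (pa.table, pa.array). The dict overload maps column names to values, while the
+# sequence overload takes arrays plus explicit names:
+#
+#   import pyarrow as pa
+#   t1 = pa.table({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})  # dict overload
+#   t2 = pa.table(
+#       [pa.array([2, 4]), pa.array(["Flamingo", "Horse"])],
+#       names=["n_legs", "animals"],
+#   )  # sequence-of-arrays overload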
+def table(*args, **kwargs): + """ + Create a pyarrow.Table from a Python data structure or sequence of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of arrays or + chunked arrays, a pandas DataFame, or any tabular object implementing + the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, + ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the Arrow Table. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + If passed, the output will have exactly this schema (raising an error + when columns are not found in the data and ignoring additional data not + specified in the schema, when data is a dict or DataFrame). + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). + nthreads : int, default None + For pandas.DataFrame inputs: if greater than 1, convert columns to + Arrow in parallel using indicated number of threads. By default, + this follows :func:`pyarrow.cpu_count` (may use up to system CPU count + threads). + + Returns + ------- + Table + + See Also + -------- + Table.from_arrays, Table.from_pandas, Table.from_pydict + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from a python dictionary: + + >>> pa.table({"n_legs": n_legs, "animals": animals}) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.table(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.table(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... + + Construct a Table from chunked arrays: + + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... 
) + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + """ + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: + """ + Concatenate pyarrow.Table objects. + + If promote_options="none", a zero-copy concatenation will be performed. The schemas + of all the Tables must be the same (except the metadata), otherwise an + exception will be raised. The result Table will share the metadata with the + first table. + + If promote_options="default", any null type arrays will be casted to the type of other + arrays in the column of the same name. If a table is missing a particular + field, null values of the appropriate type will be generated to take the + place of the missing field. The new schema will share the metadata with the + first table. Each field in the new schema will share the metadata with the + first table which has the field defined. Note that type promotions may + involve additional allocations on the given ``memory_pool``. + + If promote_options="permissive", the behavior of default plus types will be promoted + to the common denominator that fits all the fields. + + Parameters + ---------- + tables : iterable of pyarrow.Table objects + Pyarrow tables to concatenate into a single Table. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + promote_options : str, default none + Accepts strings "none", "default" and "permissive". + **kwargs : dict, optional + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) + >>> pa.concat_tables([t1, t2]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] + + """ + +class TableGroupBy: + """ + A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the default), + no stable ordering of the output is guaranteed. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table( + ... [ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], + ... names=["keys", "values"], + ... ) + + Grouping of columns: + + >>> pa.TableGroupBy(t, "keys") + + + Perform aggregations: + + >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + """ + + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: + """ + Perform an aggregation over the grouped columns of the table. 
+ + Parameters + ---------- + aggregations : list[tuple(str, str)] or \ +list[tuple(str, str, FunctionOptions)] + List of tuples, where each tuple is one aggregation specification + and consists of: aggregation column name followed + by function name and optionally aggregation function option. + Pass empty list to get a single row for each group. + The column name can be a string, an empty list or a list of + column names, for unary, nullary and n-ary aggregation functions + respectively. + + For the list of function names and respective aggregation + function options see :ref:`py-grouped-aggrs`. + + Returns + ------- + Table + Results of the aggregation functions. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + + Sum the column "values" over the grouped column "keys": + + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + + Count the rows over the grouped column "keys": + + >>> t.group_by("keys").aggregate([([], "count_all")]) + pyarrow.Table + keys: string + count_all: int64 + ---- + keys: [["a","b","c"]] + count_all: [[2,2,1]] + + Do multiple aggregations: + + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... ]) + pyarrow.Table + keys: string + values_sum: int64 + keys_count: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + + Count the number of non-null values for column "values" + over the grouped column "keys": + + >>> import pyarrow.compute as pc + >>> t.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + keys: string + values_count: int64 + ---- + keys: [["a","b","c"]] + values_count: [[2,2,1]] + + Get a single row for each group in column "keys": + + >>> t.group_by("keys").aggregate([]) + pyarrow.Table + keys: string + ---- + keys: [["a","b","c"]] + """ + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: + """ + Concatenate pyarrow.RecordBatch objects. + + All recordbatches must share the same Schema, + the operation implies a copy of the data to merge + the arrays of the different RecordBatches. + + Parameters + ---------- + recordbatches : iterable of pyarrow.RecordBatch objects + Pyarrow record batches to concatenate into a single RecordBatch. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.record_batch( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.record_batch( + ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] + ... 
) + >>> pa.concat_batches([t1, t2]) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100,2,4] + animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] + + """ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", +] diff --git a/python/stubs/__lib_pxi/tensor.pyi b/python/stubs/__lib_pxi/tensor.pyi new file mode 100644 index 00000000000..d849abd0f1f --- /dev/null +++ b/python/stubs/__lib_pxi/tensor.pyi @@ -0,0 +1,688 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +import numpy as np + +from pyarrow.lib import _Weakrefable +from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO + +class Tensor(_Weakrefable): + """ + A n-dimensional array a.k.a Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + + @classmethod + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Create a Tensor from a numpy array. + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list, optional + Names of each dimension of the Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + def to_numpy(self) -> np.ndarray: + """ + Convert arrow::Tensor to numpy.ndarray with zero copy + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.to_numpy() + array([[ 2, 2, 4], + [ 4, 5, 100]], dtype=int32) + """ + def equals(self, other: Tensor) -> bool: + """ + Return true if the tensors contains exactly equal data. + + Parameters + ---------- + other : Tensor + The other tensor to compare for equality. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) + >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) + >>> tensor.equals(tensor) + True + >>> tensor.equals(tensor2) + False + """ + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_name(0) + 'dim1' + >>> tensor.dim_name(1) + 'dim2' + """ + @property + def dim_names(self) -> list[str]: + """ + Names of this tensor dimensions. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_names + ['dim1', 'dim2'] + """ + @property + def is_mutable(self) -> bool: + """ + Is this tensor mutable or immutable. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_mutable + True + """ + @property + def is_contiguous(self) -> bool: + """ + Is this tensor contiguous in memory. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.is_contiguous + True + """ + @property + def ndim(self) -> int: + """ + The dimension (n) of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.ndim + 2 + """ + @property + def size(self) -> str: + """ + The size of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.size + 6 + """ + @property + def shape(self) -> tuple[int, ...]: + """ + The shape of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.shape + (2, 3) + """ + @property + def strides(self) -> tuple[int, ...]: + """ + Strides of this tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.strides + (12, 4) + """ + +class SparseCOOTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCOOTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCOOTensor + """ + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + coords: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCOOTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + coords : numpy.ndarray + Coordinates of the data. + shape : tuple + Shape of the tensor. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: + """ + Convert pydata/sparse.COO to arrow::SparseCOOTensor. 
+ + Parameters + ---------- + obj : pydata.sparse.COO + The sparse multidimensional array that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> coo_matrix: + """ + Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. + """ + def to_pydata_sparse(self) -> COO: + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO. + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCOOTensor to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCOOTensor + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def has_canonical_format(self) -> bool: ... + +class SparseCSRMatrix(_Weakrefable): + """ + A sparse CSR matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The dense numpy array that should be converted. + dim_names : list, optional + The names of the dimensions. + + Returns + ------- + pyarrow.SparseCSRMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. 
+ """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSRMatrix to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCSRMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSCMatrix(_Weakrefable): + """ + A sparse CSC matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSCMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSCMatrix to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSCMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSFTensor(_Weakrefable): + """ + A sparse CSF tensor. 
+
+    CSF is a generalization of the compressed sparse row (CSR) index.
+
+    CSF index recursively compresses each dimension of a tensor into a set
+    of prefix trees. Each path from a root to leaf forms one tensor
+    non-zero index. CSF is implemented with two arrays of buffers and one
+    array of integers.
+    """
+
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self:
+        """
+        Convert numpy.ndarray to arrow::SparseCSFTensor.
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+            The dense numpy array that should be converted.
+        dim_names : list[str], optional
+            Names of the dimensions.
+
+        Returns
+        -------
+        pyarrow.SparseCSFTensor
+        """
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        axis_order: list[int] | None = None,
+        dim_names: list[str] | None = None,
+    ) -> Self:
+        """
+        Create arrow::SparseCSFTensor from numpy.ndarrays.
+
+        Parameters
+        ----------
+        data : numpy.ndarray
+            Data used to populate the sparse tensor.
+        indptr : numpy.ndarray
+            The sparsity structure.
+            Each two consecutive dimensions in a tensor correspond to
+            a buffer in indices.
+            A pair of consecutive values at `indptr[dim][i]` and
+            `indptr[dim][i + 1]` signifies a range of nodes in
+            `indices[dim + 1]` that are children of the `indices[dim][i]` node.
+        indices : numpy.ndarray
+            Stores the values of the nodes.
+            Each tensor dimension corresponds to a buffer in indptr.
+        shape : tuple
+            Shape of the tensor.
+        axis_order : list, optional
+            The sequence in which dimensions were traversed to
+            produce the prefix tree.
+        dim_names : list, optional
+            Names of the dimensions.
+        """
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self:
+        """
+        Convert arrow::Tensor to arrow::SparseCSFTensor.
+
+        Parameters
+        ----------
+        obj : Tensor
+            The dense tensor that should be converted.
+        """
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy.
+        """
+    def to_tensor(self) -> Tensor:
+        """
+        Convert arrow::SparseCSFTensor to arrow::Tensor.
+        """
+    def equals(self, other: Self) -> bool:
+        """
+        Return true if the sparse tensors contain exactly equal data.
+
+        Parameters
+        ----------
+        other : SparseCSFTensor
+            The other tensor to compare for equality.
+        """
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str:
+        """
+        Returns the name of the i-th tensor dimension.
+
+        Parameters
+        ----------
+        i : int
+            The physical index of the tensor dimension.
+
+        Returns
+        -------
+        str
+        """
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
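# Usage sketch (editorial, not part of the stub file): round-tripping the tensor
# types annotated above. The asserted values follow from the three non-zero
# entries in the dense array.
import numpy as np
import pyarrow as pa

dense = np.array([[0, 1, 0], [2, 0, 3]], dtype=np.int64)

# Dense tensor: zero-copy conversion in both directions.
tensor = pa.Tensor.from_numpy(dense, dim_names=["row", "col"])
assert tensor.shape == (2, 3) and tensor.dim_names == ["row", "col"]
np.testing.assert_array_equal(tensor.to_numpy(), dense)

# Sparse COO tensor built from the same data; to_numpy() returns (data, coords).
coo = pa.SparseCOOTensor.from_dense_numpy(dense)
assert coo.non_zero_length == 3
values, coords = coo.to_numpy()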
+ +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi new file mode 100644 index 00000000000..7fe6c36e332 --- /dev/null +++ b/python/stubs/__lib_pxi/types.pyi @@ -0,0 +1,4413 @@ +import datetime as dt +import sys + +from collections.abc import Mapping, Sequence +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Iterable, Iterator, Literal, overload + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + +class _Weakrefable: ... +class _Metadata(_Weakrefable): ... + +class DataType(_Weakrefable): + """ + Base class of all Arrow data types. + + Each data type is an *instance* of this class. + + Examples + -------- + Instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + """ + def field(self, i: int) -> Field: + """ + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + """ + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: + """ + Bit width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().bit_width + 64 + """ + @property + def byte_width(self) -> int: + """ + Byte width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().byte_width + 8 + """ + @property + def num_fields(self) -> int: + """ + The number of child fields. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().num_fields + 0 + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.string()).num_fields + 1 + >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct.num_fields + 2 + """ + @property + def num_buffers(self) -> int: + """ + Number of data buffers required to construct Array type + excluding children. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().num_buffers + 2 + >>> pa.string().num_buffers + 3 + """ + def __hash__(self) -> int: ... + def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: + """ + Return true if type is equivalent to passed value. + + Parameters + ---------- + other : DataType or string convertible to DataType + check_metadata : bool + Whether nested Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().equals(pa.string()) + False + >>> pa.int64().equals(pa.int64()) + True + """ + def to_pandas_dtype(self) -> np.generic: + """ + Return the equivalent NumPy / Pandas dtype. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().to_pandas_dtype() + + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import DataType from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a DataType from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class _BasicDataType(DataType, Generic[_AsPyType]): ... +class NullType(_BasicDataType[None]): ... +class BoolType(_BasicDataType[bool]): ... +class UInt8Type(_BasicDataType[int]): ... +class Int8Type(_BasicDataType[int]): ... +class UInt16Type(_BasicDataType[int]): ... +class Int16Type(_BasicDataType[int]): ... +class Uint32Type(_BasicDataType[int]): ... +class Int32Type(_BasicDataType[int]): ... +class UInt64Type(_BasicDataType[int]): ... +class Int64Type(_BasicDataType[int]): ... +class Float16Type(_BasicDataType[float]): ... +class Float32Type(_BasicDataType[float]): ... +class Float64Type(_BasicDataType[float]): ... +class Date32Type(_BasicDataType[dt.date]): ... +class Date64Type(_BasicDataType[dt.date]): ... +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... +class StringType(_BasicDataType[str]): ... +class LargeStringType(_BasicDataType[str]): ... +class StringViewType(_BasicDataType[str]): ... +class BinaryType(_BasicDataType[bytes]): ... +class LargeBinaryType(_BasicDataType[bytes]): ... +class BinaryViewType(_BasicDataType[bytes]): ... + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + """ + Concrete class for timestamp data types. + + Examples + -------- + >>> import pyarrow as pa + + Create an instance of timestamp type: + + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + + Create an instance of timestamp type with timezone: + + >>> pa.timestamp("s", tz="UTC") + TimestampType(timestamp[s, tz=UTC]) + """ + @property + def unit(self) -> _Unit: + """ + The timestamp unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("us") + >>> t.unit + 'us' + """ + @property + def tz(self) -> _Tz: + """ + The timestamp time zone, if any, or None. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("s", tz="UTC") + >>> t.tz + 'UTC' + """ + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + """ + Concrete class for time32 data types. + + Supported time unit resolutions are 's' [second] + and 'ms' [millisecond]. + + Examples + -------- + Create an instance of time32 type: + + >>> import pyarrow as pa + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + @property + def unit(self) -> _Time32Unit: + """ + The time unit ('s' or 'ms'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time32("ms") + >>> t.unit + 'ms' + """ + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + """ + Concrete class for time64 data types. + + Supported time unit resolutions are 'us' [microsecond] + and 'ns' [nanosecond]. 
+ + Examples + -------- + Create an instance of time64 type: + + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + """ + @property + def unit(self) -> _Time64Unit: + """ + The time unit ('us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time64("us") + >>> t.unit + 'us' + """ + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + """ + Concrete class for duration data types. + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("s") + DurationType(duration[s]) + """ + @property + def unit(self) -> _Unit: + """ + The duration unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.duration("s") + >>> t.unit + 's' + """ + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + """ + Concrete class for fixed-size binary data types. + + Examples + -------- + Create an instance of fixed-size binary type: + + >>> import pyarrow as pa + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + """ + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal128 data types. + + Examples + -------- + Create an instance of decimal128 type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.scale + 2 + """ + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal256 data types. 
+ + Examples + -------- + Create an instance of decimal256 type: + + >>> import pyarrow as pa + >>> pa.decimal256(76, 38) + Decimal256Type(decimal256(76, 38)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.precision + 76 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.scale + 38 + """ + +class ListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list data types. + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_type + DataType(string) + """ + +class LargeListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list data types + (like ListType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.string()) + LargeListType(large_list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list(pa.string()).value_type + DataType(string) + """ + +class ListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + +class LargeListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + """ + Concrete class for fixed size list data types. 
+ + Examples + -------- + Create an instance of FixedSizeListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_type + DataType(int32) + """ + @property + def list_size(self) -> _Size: + """ + The size of the fixed size lists. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).list_size + 2 + """ + +class DictionaryMemo(_Weakrefable): + """ + Tracking container for dictionary-encoded fields. + """ + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + Uint32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + """ + Concrete class for dictionary data types. + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + """ + + @property + def ordered(self) -> _Ordered: + """ + Whether the dictionary is ordered, i.e. whether the ordering of values + in the dictionary is important. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()).ordered + False + """ + @property + def index_type(self) -> _IndexT: + """ + The data type of dictionary indices (a signed integer type). + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).index_type + DataType(int16) + """ + @property + def value_type(self) -> _BasicValueT: + """ + The dictionary value type. + + The dictionary values are found in an instance of DictionaryArray. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).value_type + DataType(string) + """ + +_K = TypeVar("_K", bound=DataType) + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + """ + Concrete class for map data types. + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + """ + + @property + def key_field(self) -> Field[_K]: + """ + The field for keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_field + pyarrow.Field + """ + @property + def key_type(self) -> _K: + """ + The data type of keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_type + DataType(string) + """ + @property + def item_field(self) -> Field[_ValueT]: + """ + The field for items in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_field + pyarrow.Field + """ + @property + def item_type(self) -> _ValueT: + """ + The data type of items in the map entries. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_type + DataType(int32) + """ + @property + def keys_sorted(self) -> _Ordered: + """ + Should the entries be sorted according to keys. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted + True + """ + +_Size = TypeVar("_Size", default=int) + +class StructType(DataType): + """ + Concrete class for struct data types. + + ``StructType`` supports direct indexing using ``[...]`` (implemented via + ``__getitem__``) to access its fields. + It will return the struct field with the given index or name. + + Examples + -------- + >>> import pyarrow as pa + + Accessing fields using direct indexing: + + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type[0] + pyarrow.Field + >>> struct_type["y"] + pyarrow.Field + + Accessing fields using ``field()``: + + >>> struct_type.field(1) + pyarrow.Field + >>> struct_type.field("x") + pyarrow.Field + + # Creating a schema from the struct type's fields: + >>> pa.schema(list(struct_type)) + x: int32 + y: string + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Index of the field with a name 'y': + + >>> struct_type.get_field_index("y") + 1 + + Index of the field that does not exist: + + >>> struct_type.get_field_index("z") + -1 + """ + def field(self, i: int | str) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or str + + Returns + ------- + pyarrow.Field + + Examples + -------- + + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Select the second field: + + >>> struct_type.field(1) + pyarrow.Field + + Select the field named 'x': + + >>> struct_type.field("x") + pyarrow.Field + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type.get_all_field_indices("x") + [0] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + @property + def names(self) -> list[str]: + """ + Lists the field names. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.names + ['a', 'b', 'c'] + """ + @property + def fields(self) -> list[Field]: + """ + Lists all fields within the StructType. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.fields + [pyarrow.Field, pyarrow.Field, pyarrow.Field] + """ + +class UnionType(DataType): + """ + Base class for union data types. 
+ + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + + Create an instance of a sparse UnionType using ``pa.union``: + + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse", "dense"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.mode + 'sparse' + """ + @property + def type_codes(self) -> list[int]: + """ + The type code to indicate each data type in this union. + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.type_codes + [0, 1] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + def field(self, i: int) -> Field: + """ + Return a child field by its numeric index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union[0] + pyarrow.Field + """ + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + +class SparseUnionType(UnionType): + """ + Concrete class for sparse union types. + + Examples + -------- + Create an instance of a sparse UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse"]: ... + +class DenseUnionType(UnionType): + """ + Concrete class for dense union types. + + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + """ + + @property + def mode(self) -> Literal["dense"]: ... + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + """ + Concrete class for run-end encoded types. + """ + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... 
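# Usage sketch (editorial, not part of the stub file): the runtime values behind
# the generic parameters that the parametrized type classes above describe.
import pyarrow as pa

ts = pa.timestamp("ms", tz="UTC")                # parametrized TimestampType under these stubs
assert (ts.unit, ts.tz) == ("ms", "UTC")

dct = pa.dictionary(pa.int32(), pa.string())     # DictionaryType with index and value parameters
assert dct.index_type == pa.int32() and dct.value_type == pa.string()

union = pa.sparse_union([pa.field("a", pa.int64()), pa.field("b", pa.string())])
assert union.mode == "sparse" and union.type_codes == [0, 1]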
+ +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + +class BaseExtensionType(DataType): + """ + Concrete base class for extension types. + """ + def __arrow_ext_class__(self) -> type[ExtensionArray]: + """ + The associated array extension class + """ + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: + """ + The associated scalar class + """ + @property + def extension_name(self) -> str: + """ + The extension type name. + """ + @property + def storage_type(self) -> DataType: + """ + The underlying storage type. + """ + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + +class ExtensionType(BaseExtensionType): + """ + Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + The underlying storage type for the extension type. + extension_name : str + A unique name distinguishing this extension type. The name will be + used when deserializing IPC data. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Create an instance of RationalType extension type: + + >>> rational_type = RationalType(pa.int32()) + + Inspect the extension type: + + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + Wrap an array as an extension array: + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, + ... ) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Or do the same with creating an ExtensionArray: + + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. + """ + + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + def __arrow_ext_serialize__(self) -> bytes: + """ + Serialized representation of metadata to reconstruct the type object. 
+ + This method should return a bytes object, and those serialized bytes + are stored in the custom metadata of the Field holding an extension + type in an IPC message. + The bytes are passed to ``__arrow_ext_deserialize`` and should hold + sufficient information to reconstruct the data type instance. + """ + @classmethod + def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: + """ + Return an extension type instance from the storage type and serialized + metadata. + + This method should return an instance of the ExtensionType subclass + that matches the passed storage type and serialized metadata (the + return value of ``__arrow_ext_serialize__``). + """ + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + """ + Concrete class for fixed shape tensor extension type. + + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) + FixedShapeTensorType(extension) + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + @property + def value_type(self) -> _ValueT: + """ + Data type of an individual tensor. + """ + @property + def shape(self) -> list[int]: + """ + Shape of the tensors. + """ + @property + def dim_names(self) -> list[str] | None: + """ + Explicit names of the dimensions. + """ + @property + def permutation(self) -> list[int] | None: + """ + Indices of the dimensions ordering. + """ + +class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + +class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + +class JsonType(BaseExtensionType): + """ + Concrete class for JSON extension type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class OpaqueType(BaseExtensionType): + """ + Concrete class for opaque extension type. + + Opaque is a placeholder for a type from an external (often non-Arrow) + system that could not be interpreted. + + Examples + -------- + Create an instance of opaque extension type: + + >>> import pyarrow as pa + >>> pa.opaque(pa.int32(), "geometry", "postgis") + OpaqueType(extension) + """ + @property + def type_name(self) -> str: + """ + The name of the type in the external system. + """ + @property + def vendor_name(self) -> str: + """ + The name of the external system. + """ + +@deprecated( + "This class is deprecated and its deserialization is disabled by default. " + ":class:`ExtensionType` is recommended instead." +) +class PyExtensionType(ExtensionType): + """ + Concrete base class for Python-defined extension types based on pickle + for (de)serialization. + + .. warning:: + This class is deprecated and its deserialization is disabled by default. 
+ :class:`ExtensionType` is recommended instead. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + """ + def __init__(self, storage_type: DataType) -> None: ... + @classmethod + def set_auto_load(cls, value: bool) -> None: + """ + Enable or disable auto-loading of serialized PyExtensionType instances. + + Parameters + ---------- + value : bool + Whether to enable auto-loading. + """ + +class UnknownExtensionType(PyExtensionType): # type: ignore + """ + A concrete class for Python-defined extension types that refer to + an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialised output. + """ + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + +def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore + """ + Register a Python extension type. + + Registration is based on the extension name (so different registered types + need unique extension names). Registration needs an extension type + instance, but then works for any instance of the same subclass regardless + of parametrization of the type. + + Parameters + ---------- + ext_type : BaseExtensionType instance + The ExtensionType subclass to register. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +def unregister_extension_type(type_name: str) -> None: + """ + Unregister a Python extension type. + + Parameters + ---------- + type_name : str + The name of the ExtensionType subclass to unregister. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... 
return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + """ + KeyValueMetadata + + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata + **kwargs : optional + additional key-value metadata + """ + def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... + def equals(self, other: KeyValueMetadata) -> bool: ... + def __len__(self) -> int: ... + def __contains__(self, __key: object) -> bool: ... + def __getitem__(self, __key: Any) -> Any: ... + def __iter__(self) -> Iterator[bytes]: ... + def get_all(self, key: str) -> list[bytes]: ... + def to_dict(self) -> dict[bytes, bytes]: + """ + Convert KeyValueMetadata to dict. If a key occurs twice, the value for + the first one is returned + """ + +def ensure_metadata( + meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False +) -> KeyValueMetadata | None: ... + +class Field(_Weakrefable, Generic[_DataTypeT]): + """ + A named field, with a data type, nullability, and optional metadata. + + Notes + ----- + Do not use this class's constructor directly; use pyarrow.field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + """ + + def equals(self, other: Field, check_metadata: bool = False) -> bool: + """ + Test if this field is equal to the other + + Parameters + ---------- + other : pyarrow.Field + check_metadata : bool, default False + Whether Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.equals(f2) + False + >>> f1.equals(f1) + True + """ + def __hash__(self) -> int: ... + @property + def nullable(self) -> bool: + """ + The field nullability. + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.nullable + True + >>> f2.nullable + False + """ + @property + def name(self) -> str: + """ + The field name. + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field.name + 'key' + """ + @property + def metadata(self) -> dict[bytes, bytes] | None: + """ + The field metadata (if any is set). + + Returns + ------- + metadata : dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + """ + @property + def type(self) -> _DataTypeT: ... 
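# A minimal sketch of creating a Field with metadata, assuming pyarrow is
# installed. Keys and values are coerced to bytes, which is why the metadata
# property above is typed as dict[bytes, bytes].
import pyarrow as pa

price = pa.field("price", pa.float64(), nullable=False, metadata={"unit": "USD"})
assert price.metadata == {b"unit": b"USD"}
assert not price.nullable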
+ def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: + """ + Add metadata as dict of string keys and values to Field + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + + Create new field by adding metadata to existing one: + + >>> field_new = field.with_metadata({"key": "Something important"}) + >>> field_new + pyarrow.Field + >>> field_new.metadata + {b'key': b'Something important'} + """ + def remove_metadata(self) -> Self: + """ + Create new field without metadata, if any + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + + Create new field by removing the metadata from the existing one: + + >>> field_new = field.remove_metadata() + >>> field_new.metadata + """ + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced type + + Parameters + ---------- + new_type : pyarrow.DataType + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing type of an existing one: + + >>> field_new = field.with_type(pa.int64()) + >>> field_new + pyarrow.Field + """ + def with_name(self, name: str) -> Self: + """ + A copy of this field with the replaced name + + Parameters + ---------- + name : str + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing the name of an existing one: + + >>> field_new = field.with_name("lock") + >>> field_new + pyarrow.Field + """ + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced nullability + + Parameters + ---------- + nullable : bool + + Returns + ------- + field: pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + >>> field.nullable + True + + Create new field by replacing the nullability of an existing one: + + >>> field_new = field.with_nullable(False) + >>> field_new + pyarrow.Field + >>> field_new.nullable + False + """ + def flatten(self) -> list[Field]: + """ + Flatten this field. If a struct field, individual child fields + will be returned with their names prefixed by the parent's name. + + Returns + ------- + fields : List[pyarrow.Field] + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("bar", pa.float64(), nullable=False) + >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) + >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) + + Flatten a struct field: + + >>> ff + pyarrow.Field not null> + >>> ff.flatten() + [pyarrow.Field, pyarrow.Field] + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
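# A minimal sketch, assuming pyarrow is installed: the with_* methods above
# return new Field objects rather than mutating in place, so they chain.
import pyarrow as pa

f = pa.field("key", pa.int32())
g = f.with_name("id").with_type(pa.int64()).with_nullable(False)
assert (g.name, g.nullable) == ("id", False)
assert g.type == pa.int64()
assert f.name == "key"  # the original field is unchanged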
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import Field from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a Field from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class Schema(_Weakrefable): + """ + A named collection of types a.k.a schema. A schema defines the + column names and types in a record batch or table data structure. + They also contain metadata about the columns. For example, schemas + converted from Pandas contain metadata about their original Pandas + types so they can be converted back to the same types. + + Warnings + -------- + Do not call this class's constructor directly. Instead use + :func:`pyarrow.schema` factory function which makes a new Arrow + Schema object. + + Examples + -------- + Create a new Arrow Schema object: + + >>> import pyarrow as pa + >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + some_int: int32 + some_string: string + + Create Arrow Schema with metadata: + + >>> pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + + def __len__(self) -> int: ... + def __getitem__(self, key: str) -> Field: ... + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] + def __iter__(self) -> Iterator[Field]: ... + def __hash__(self) -> int: ... + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: + """ + Return deserialized-from-JSON pandas metadata field (if it exists) + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> schema = pa.Table.from_pandas(df).schema + + Select pandas metadata field from Arrow Schema: + + >>> schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... + """ + @property + def names(self) -> list[str]: + """ + The schema's field names. + + Returns + ------- + list of str + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the names of the schema's fields: + + >>> schema.names + ['n_legs', 'animals'] + """ + @property + def types(self) -> list[DataType]: + """ + The schema's field types. + + Returns + ------- + list of DataType + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the types of the schema's fields: + + >>> schema.types + [DataType(int64), DataType(string)] + """ + @property + def metadata(self) -> dict[bytes, bytes]: + """ + The schema's metadata (if any is set). + + Returns + ------- + metadata: dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... 
metadata={"n_legs": "Number of legs per animal"}, + ... ) + + Get the metadata of the schema's fields: + + >>> schema.metadata + {b'n_legs': b'Number of legs per animal'} + """ + def empty_table(self) -> Table: + """ + Provide an empty table according to the schema. + + Returns + ------- + table: pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Create an empty table with schema's fields: + + >>> schema.empty_table() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[]] + animals: [[]] + """ + def equals(self, other: Schema, check_metadata: bool = False) -> bool: + """ + Test if this schema is equal to the other + + Parameters + ---------- + other : pyarrow.Schema + check_metadata : bool, default False + Key/value metadata must be equal too + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> schema1 = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + + Test two equal schemas: + + >>> schema1.equals(schema1) + True + + Test two unequal schemas: + + >>> schema1.equals(schema2) + False + """ + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: + """ + Returns implied schema from dataframe + + Parameters + ---------- + df : pandas.DataFrame + preserve_index : bool, default True + Whether to store the index as an additional column (or columns, for + MultiIndex) in the resulting `Table`. + The default of None will store the index as a column, except for + RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) + + Create an Arrow Schema from the schema of a pandas dataframe: + + >>> pa.Schema.from_pandas(df) + int: int64 + str: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, ... + """ + def field(self, i: int | str | bytes) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or string + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Select the second field: + + >>> schema.field(1) + pyarrow.Field + + Select the field of the column named 'n_legs': + + >>> schema.field("n_legs") + pyarrow.Field + """ + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: + """ + DEPRECATED + + Parameters + ---------- + name : str + + Returns + ------- + field: pyarrow.Field + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the index of the field named 'animals': + + >>> schema.get_field_index("animals") + 1 + + Index in case of several fields with the given name: + + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema.get_field_index("animals") + -1 + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ] + ... ) + + Get the indexes of the fields named 'animals': + + >>> schema.get_all_field_indices("animals") + [1, 2] + """ + def append(self, field: Field) -> Schema: + """ + Append a field at the end of the schema. + + In contrast to Python's ``list.append()`` it does return a new + object, leaving the original Schema unmodified. + + Parameters + ---------- + field : Field + + Returns + ------- + schema: Schema + New object with appended field. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Append a field 'extra' at the end of the schema: + + >>> schema_new = schema.append(pa.field("extra", pa.bool_())) + >>> schema_new + n_legs: int64 + animals: string + extra: bool + + Original schema is unmodified: + + >>> schema + n_legs: int64 + animals: string + """ + def insert(self, i: int, field: Field) -> Schema: + """ + Add a field at position i to the schema. + + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Insert a new field on the second position: + + >>> schema.insert(1, pa.field("extra", pa.bool_())) + n_legs: int64 + extra: bool + animals: string + """ + def remove(self, i: int) -> Schema: + """ + Remove the field at index i from the schema. + + Parameters + ---------- + i : int + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Remove the second field of the schema: + + >>> schema.remove(1) + n_legs: int64 + """ + def set(self, i: int, field: Field) -> Schema: + """ + Replace a field at position i in the schema. 
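# A minimal sketch, assuming pyarrow is installed: like append(), insert()
# and remove(), set() returns a new Schema and leaves the original untouched.
import pyarrow as pa

schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
wider = schema.set(1, pa.field("name", pa.large_string()))
assert schema.field("name").type == pa.string()        # original unchanged
assert wider.field("name").type == pa.large_string()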
+ + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Replace the second field of the schema with a new field 'extra': + + >>> schema.set(1, pa.field("replaced", pa.bool_())) + n_legs: int64 + replaced: bool + """ + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: + """ + DEPRECATED + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + """ + def with_metadata(self, metadata: dict) -> Schema: + """ + Add metadata as dict of string keys and values to Schema + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Add metadata to existing schema field: + + >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write Schema to Buffer as encapsulated IPC message + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Write schema to Buffer: + + >>> schema.serialize() + + """ + def remove_metadata(self) -> Schema: + """ + Create new schema without metadata, if any + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Create a new schema with removing the metadata from the original: + + >>> schema.remove_metadata() + n_legs: int64 + animals: string + """ + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + ) -> str: + """ + Return human-readable representation of Schema + + Parameters + ---------- + truncate_metadata : boolean, default True + Limit metadata key/value display to a single line of ~80 characters + or less + show_field_metadata : boolean, default True + Display Field-level KeyValueMetadata + show_schema_metadata : boolean, default True + Display Schema-level KeyValueMetadata + + Returns + ------- + str : the formatted output + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: + """ + Import Schema from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. 
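# A minimal sketch of a serialize() round trip, assuming pyarrow is installed.
# pyarrow.ipc.read_schema (not part of this excerpt) reads the encapsulated
# IPC message produced by serialize() back into a Schema.
import pyarrow as pa

schema = pa.schema([("x", pa.int32())], metadata={"origin": "sensor"})
buf = schema.serialize()
restored = pa.ipc.read_schema(buf)
assert restored.equals(schema)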
+ """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: + """ + Import a Schema from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +def unify_schemas( + schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" +) -> Schema: + """ + Unify schemas by merging fields by name. + + The resulting schema will contain the union of fields from all schemas. + Fields with the same name will be merged. Note that two fields with + different types will fail merging by default. + + - The unified field will inherit the metadata from the schema where + that field is first defined. + - The first N fields in the schema will be ordered the same as the + N fields in the first schema. + + The resulting schema will inherit its metadata from the first input + schema. + + Parameters + ---------- + schemas : list of Schema + Schemas to merge into a single one. + promote_options : str, default default + Accepts strings "default" and "permissive". + Default: null and only null can be unified with another type. + Permissive: types are promoted to the greater common denominator. + + Returns + ------- + Schema + + Raises + ------ + ArrowInvalid : + If any input schema contains fields with duplicate names. + If Fields of the same name are not mergeable. + """ + +@overload +def field(name: SupportArrowSchema) -> Field[Any]: ... +@overload +def field( + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT]: ... +def field(*args, **kwargs): + """ + Create a pyarrow.Field instance. + + Parameters + ---------- + name : str or bytes + Name of the field. + Alternatively, you can also pass an object that implements the Arrow + PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). + type : pyarrow.DataType or str + Arrow datatype of the field or a string matching one. + nullable : bool, default True + Whether the field's values are nullable. + metadata : dict, default None + Optional field metadata, the keys and values must be coercible to + bytes. + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + + A str can also be passed for the type parameter: + + >>> pa.field("key", "int32") + pyarrow.Field + """ + +def null() -> NullType: + """ + Create instance of null type. + + Examples + -------- + Create an instance of a null type: + + >>> import pyarrow as pa + >>> pa.null() + DataType(null) + >>> print(pa.null()) + null + + Create a ``Field`` type with a null type and a name: + + >>> pa.field("null_field", pa.null()) + pyarrow.Field + """ + +def bool_() -> BoolType: + """ + Create instance of boolean type. 
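# A minimal sketch of unify_schemas, assuming pyarrow is installed. Under the
# default promotion rules described above, a null-typed field merges with a
# concretely typed field of the same name.
import pyarrow as pa

s1 = pa.schema([("id", pa.int64()), ("score", pa.null())])
s2 = pa.schema([("id", pa.int64()), ("score", pa.float64())])
merged = pa.unify_schemas([s1, s2])
assert merged.field("score").type == pa.float64()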
+ + Examples + -------- + Create an instance of a boolean type: + + >>> import pyarrow as pa + >>> pa.bool_() + DataType(bool) + >>> print(pa.bool_()) + bool + + Create a ``Field`` type with a boolean type + and a name: + + >>> pa.field("bool_field", pa.bool_()) + pyarrow.Field + """ + +def uint8() -> UInt8Type: + """ + Create instance of unsigned int8 type. + + Examples + -------- + Create an instance of unsigned int8 type: + + >>> import pyarrow as pa + >>> pa.uint8() + DataType(uint8) + >>> print(pa.uint8()) + uint8 + + Create an array with unsigned int8 type: + + >>> pa.array([0, 1, 2], type=pa.uint8()) + + [ + 0, + 1, + 2 + ] + """ + +def int8() -> Int8Type: + """ + Create instance of signed int8 type. + + Examples + -------- + Create an instance of int8 type: + + >>> import pyarrow as pa + >>> pa.int8() + DataType(int8) + >>> print(pa.int8()) + int8 + + Create an array with int8 type: + + >>> pa.array([0, 1, 2], type=pa.int8()) + + [ + 0, + 1, + 2 + ] + """ + +def uint16() -> UInt16Type: + """ + Create instance of unsigned uint16 type. + + Examples + -------- + Create an instance of unsigned int16 type: + + >>> import pyarrow as pa + >>> pa.uint16() + DataType(uint16) + >>> print(pa.uint16()) + uint16 + + Create an array with unsigned int16 type: + + >>> pa.array([0, 1, 2], type=pa.uint16()) + + [ + 0, + 1, + 2 + ] + """ + +def int16() -> Int16Type: + """ + Create instance of signed int16 type. + + Examples + -------- + Create an instance of int16 type: + + >>> import pyarrow as pa + >>> pa.int16() + DataType(int16) + >>> print(pa.int16()) + int16 + + Create an array with int16 type: + + >>> pa.array([0, 1, 2], type=pa.int16()) + + [ + 0, + 1, + 2 + ] + """ + +def uint32() -> Uint32Type: + """ + Create instance of unsigned uint32 type. + + Examples + -------- + Create an instance of unsigned int32 type: + + >>> import pyarrow as pa + >>> pa.uint32() + DataType(uint32) + >>> print(pa.uint32()) + uint32 + + Create an array with unsigned int32 type: + + >>> pa.array([0, 1, 2], type=pa.uint32()) + + [ + 0, + 1, + 2 + ] + """ + +def int32() -> Int32Type: + """ + Create instance of signed int32 type. + + Examples + -------- + Create an instance of int32 type: + + >>> import pyarrow as pa + >>> pa.int32() + DataType(int32) + >>> print(pa.int32()) + int32 + + Create an array with int32 type: + + >>> pa.array([0, 1, 2], type=pa.int32()) + + [ + 0, + 1, + 2 + ] + """ + +def int64() -> Int64Type: + """ + Create instance of signed int64 type. + + Examples + -------- + Create an instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> print(pa.int64()) + int64 + + Create an array with int64 type: + + >>> pa.array([0, 1, 2], type=pa.int64()) + + [ + 0, + 1, + 2 + ] + """ + +def uint64() -> UInt64Type: + """ + Create instance of unsigned uint64 type. 
+ + Examples + -------- + Create an instance of unsigned int64 type: + + >>> import pyarrow as pa + >>> pa.uint64() + DataType(uint64) + >>> print(pa.uint64()) + uint64 + + Create an array with unsigned uint64 type: + + >>> pa.array([0, 1, 2], type=pa.uint64()) + + [ + 0, + 1, + 2 + ] + """ + +def tzinfo_to_string(tz: dt.tzinfo) -> str: + """ + Converts a time zone object into a string indicating the name of a time + zone, one of: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + tz : datetime.tzinfo + Time zone object + + Returns + ------- + name : str + Time zone name + """ + +def string_to_tzinfo(name: str) -> dt.tzinfo: + """ + Convert a time zone name into a time zone object. + + Supported input strings are: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + name: str + Time zone name. + + Returns + ------- + tz : datetime.tzinfo + Time zone object + """ + +@overload +def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... +@overload +def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def timestamp(*args, **kwargs): + """ + Create instance of timestamp type with resolution and optional time zone. + + Parameters + ---------- + unit : str + one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' + [nanosecond] + tz : str, default None + Time zone name. None indicates time zone naive + + Examples + -------- + Create an instance of timestamp type: + + >>> import pyarrow as pa + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + >>> pa.timestamp("s", tz="America/New_York") + TimestampType(timestamp[s, tz=America/New_York]) + >>> pa.timestamp("s", tz="+07:30") + TimestampType(timestamp[s, tz=+07:30]) + + Use timestamp type when creating a scalar object: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) + + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) + + + Returns + ------- + timestamp_type : TimestampType + """ + +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: + """ + Create instance of 32-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + one of 's' [second], or 'ms' [millisecond] + + Returns + ------- + type : pyarrow.Time32Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time32("s") + Time32Type(time32[s]) + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: + """ + Create instance of 64-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + One of 'us' [microsecond], or 'ns' [nanosecond]. + + Returns + ------- + type : pyarrow.Time64Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + >>> pa.time64("ns") + Time64Type(time64[ns]) + """ + +def duration(unit: _Unit) -> DurationType[_Unit]: + """ + Create instance of a duration type with unit resolution. + + Parameters + ---------- + unit : str + One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or + 'ns' [nanosecond]. 
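# A minimal sketch, assuming pyarrow is installed: the tz argument of
# timestamp() accepts the same two forms described for tzinfo_to_string()
# above, an Olson/IANA name or a fixed offset string.
import pyarrow as pa
from datetime import datetime, timezone

named = pa.timestamp("ms", tz="America/New_York")
offset = pa.timestamp("s", tz="+07:30")
pa.scalar(datetime(2024, 1, 1, tzinfo=timezone.utc), type=named)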
+ + Returns + ------- + type : pyarrow.DurationType + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("us") + DurationType(duration[us]) + >>> pa.duration("s") + DurationType(duration[s]) + + Create an array with duration type: + + >>> pa.array([0, 1, 2], type=pa.duration("s")) + + [ + 0, + 1, + 2 + ] + """ + +def month_day_nano_interval() -> MonthDayNanoIntervalType: + """ + Create instance of an interval type representing months, days and + nanoseconds between two dates. + + Examples + -------- + Create an instance of an month_day_nano_interval type: + + >>> import pyarrow as pa + >>> pa.month_day_nano_interval() + DataType(month_day_nano_interval) + + Create a scalar with month_day_nano_interval type: + + >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) + + """ + +def date32() -> Date32Type: + """ + Create instance of 32-bit date (days since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 32-bit date type: + + >>> import pyarrow as pa + >>> pa.date32() + DataType(date32[day]) + + Create a scalar with 32-bit date type: + + >>> from datetime import date + >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) + + """ + +def date64() -> Date64Type: + """ + Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 64-bit date type: + + >>> import pyarrow as pa + >>> pa.date64() + DataType(date64[ms]) + + Create a scalar with 64-bit date type: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) + + """ + +def float16() -> Float16Type: + """ + Create half-precision floating point type. + + Examples + -------- + Create an instance of float16 type: + + >>> import pyarrow as pa + >>> pa.float16() + DataType(halffloat) + >>> print(pa.float16()) + halffloat + + Create an array with float16 type: + + >>> arr = np.array([1.5, np.nan], dtype=np.float16) + >>> a = pa.array(arr, type=pa.float16()) + >>> a + + [ + 15872, + 32256 + ] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [, ] + """ + +def float32() -> Float32Type: + """ + Create single-precision floating point type. + + Examples + -------- + Create an instance of float32 type: + + >>> import pyarrow as pa + >>> pa.float32() + DataType(float) + >>> print(pa.float32()) + float + + Create an array with float32 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) + + [ + 0, + 1, + 2 + ] + """ + +def float64() -> Float64Type: + """ + Create double-precision floating point type. + + Examples + -------- + Create an instance of float64 type: + + >>> import pyarrow as pa + >>> pa.float64() + DataType(double) + >>> print(pa.float64()) + double + + Create an array with float64 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) + + [ + 0, + 1, + 2 + ] + """ + +@overload +def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... +@overload +def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... +def decimal32(*args, **kwargs): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. 
The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal32(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... +@overload +def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... +def decimal64(*args, **kwargs): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal64(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... +@overload +def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... +def decimal128(*args, **kwargs): + """ + Create decimal type with precision and scale and 128-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal128(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 128-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. 
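# A minimal sketch, assuming pyarrow is installed: precision counts
# significant digits and scale counts digits after the decimal point, so
# decimal128(7, 3) stores 1234.567 exactly, while a value needing more scale
# is expected to be rejected with a rescaling error.
import decimal
import pyarrow as pa

t = pa.decimal128(7, 3)
pa.array([decimal.Decimal("1234.567")], type=t)    # fits exactly
# pa.array([decimal.Decimal("123.4567")], type=t)  # would lose digits -> raises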
+ + ``decimal128(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 128-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 38 significant digits, consider + using ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 38 + scale : int + + Returns + ------- + decimal_type : Decimal128Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal128(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... +@overload +def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... +def decimal256(*args, **kwargs): + """ + Create decimal type with precision and scale and 256-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + For most use cases, the maximum precision offered by ``decimal128`` + is sufficient, and it will result in a more compact and more efficient + encoding. ``decimal256`` is useful if you need a precision higher + than 38 significant digits. + + Parameters + ---------- + precision : int + Must be between 1 and 76 + scale : int + + Returns + ------- + decimal_type : Decimal256Type + """ + +def string() -> StringType: + """ + Create UTF8 variable-length string type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string() + DataType(string) + + and use the string type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.string()) + + [ + "foo", + "bar", + "baz" + ] + """ + +utf8 = string +""" +Alias for string(). + +Examples +-------- +Create an instance of a string type: + +>>> import pyarrow as pa +>>> pa.utf8() +DataType(string) + +and use the string type to create an array: + +>>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + +[ + "foo", + "bar", + "baz" +] +""" + +@overload +def binary(length: Literal[-1] = ...) -> BinaryType: ... +@overload +def binary(length: int) -> FixedSizeBinaryType: ... +def binary(length): + """ + Create variable-length or fixed size binary type. + + Parameters + ---------- + length : int, optional, default -1 + If length == -1 then return a variable length binary type. If length is + greater than or equal to 0 then return a fixed size binary type of + width `length`. + + Examples + -------- + Create an instance of a variable-length binary type: + + >>> import pyarrow as pa + >>> pa.binary() + DataType(binary) + + and use the variable-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) + + [ + 666F6F, + 626172, + 62617A + ] + + Create an instance of a fixed-size binary type: + + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + + and use the fixed-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_binary() -> LargeBinaryType: + """ + Create large variable-length binary type. + + This data type may not be supported by all Arrow implementations. 
Unless + you need to represent data larger than 2GB, you should prefer binary(). + + Examples + -------- + Create an instance of large variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_binary() + DataType(large_binary) + + and use the type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_string() -> LargeStringType: + """ + Create large UTF8 variable-length string type. + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer string(). + + Examples + -------- + Create an instance of large UTF8 variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_string() + DataType(large_string) + + and use the type to create an array: + + >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) + + [ + "foo", + "bar", + ... + "foo", + "bar" + ] + """ + +large_utf8 = large_string +""" +Alias for large_string(). + +Examples +-------- +Create an instance of large UTF8 variable-length binary type: + +>>> import pyarrow as pa +>>> pa.large_utf8() +DataType(large_string) + +and use the type to create an array: + +>>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) + +[ + "foo", + "bar", + ... + "foo", + "bar" +] +""" + +def binary_view() -> BinaryViewType: + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + +def string_view() -> StringViewType: + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... +) -> ListType[_DataTypeT]: ... +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size +) -> FixedSizeListType[_DataTypeT, _Size]: ... +def list_(*args, **kwargs): + """ + Create ListType instance from child data type or field. + + Parameters + ---------- + value_type : DataType or Field + list_size : int, optional, default -1 + If length == -1 then return a variable length list type. If length is + greater than or equal to 0 then return a fixed size list type. + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + + Use the ListType to create a scalar: + + >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) + + + or an array: + + >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + +def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: + """ + Create LargeListType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. + Unless you need to represent data larger than 2**31 elements, you should + prefer list_(). 
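# A minimal sketch, assuming pyarrow is installed: list_() also accepts a
# Field for the child, which controls the child's name and nullability, and
# the resulting ListType exposes it via value_field / value_type.
import pyarrow as pa

t = pa.list_(pa.field("element", pa.int32(), nullable=False))
t.value_field   # pyarrow.Field<element: int32 not null>
t.value_type    # DataType(int32)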
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.int8()) + LargeListType(large_list) + + Use the LargeListType to create an array: + + >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) + + [ + [ + -1, + 3 + ], + [ + -1, + 3 + ], + ... + """ + +def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT], +) -> LargeListViewType[_DataTypeT]: + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + +@overload +def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... +@overload +def map_( + key_type: _K, item_type: _ValueT, key_sorted: _Ordered +) -> MapType[_K, _ValueT, _Ordered]: ... +def map_(*args, **kwargs): + """ + Create MapType instance from key and item data types or fields. + + Parameters + ---------- + key_type : DataType or Field + item_type : DataType or Field + keys_sorted : bool + + Returns + ------- + map_type : DataType + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + + Use MapType to create an array: + + >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]] + >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) + + [ + keys: + [ + "a", + "b" + ] + values: + [ + 1, + 2 + ], + keys: + [ + "c" + ] + values: + [ + 3 + ] + ] + """ + +@overload +def dictionary( + index_type: _IndexT, value_type: _BasicValueT +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... +@overload +def dictionary( + index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... +def dictionary(*args, **kwargs): + """ + Dictionary (categorical, or simply encoded) type. + + Parameters + ---------- + index_type : DataType + value_type : DataType + ordered : bool + + Returns + ------- + type : DictionaryType + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + + Use dictionary type to create an array: + + >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8())) + + ... 
+ -- dictionary: + [ + "a", + "b", + "d" + ] + -- indices: + [ + 0, + 1, + null, + 2 + ] + """ + +def struct( + fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] + | Mapping[str, Field[Any]], +) -> StructType: + """ + Create StructType instance from fields. + + A struct is a nested type parameterized by an ordered sequence of types + (which can all be distinct), called its fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + + Examples + -------- + Create an instance of StructType from an iterable of tuples: + + >>> import pyarrow as pa + >>> fields = [ + ... ("f1", pa.int32()), + ... ("f2", pa.string()), + ... ] + >>> struct_type = pa.struct(fields) + >>> struct_type + StructType(struct) + + Retrieve a field from a StructType: + + >>> struct_type[0] + pyarrow.Field + >>> struct_type["f1"] + pyarrow.Field + + Create an instance of StructType from an iterable of Fields: + + >>> fields = [ + ... pa.field("f1", pa.int32()), + ... pa.field("f2", pa.string(), nullable=False), + ... ] + >>> pa.struct(fields) + StructType(struct) + + Returns + ------- + type : DataType + """ + +def sparse_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> SparseUnionType: + """ + Create SparseUnionType from child fields. + + A sparse union is a nested type where each logical value is taken from + a single child. A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + In a sparse union, each child array should have the same length as the + union array, regardless of the actual number of union values that + refer to it. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : SparseUnionType + """ + +def dense_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> DenseUnionType: + """ + Create DenseUnionType from child fields. + + A dense union is a nested type where each logical value is taken from + a single child, at a specific offset. A buffer of 8-bit type ids + indicates which child a given logical value is to be taken from, + and a buffer of 32-bit offsets indicates at which physical position + in the given child array the logical value is to be taken from. + + Unlike a sparse union, a dense union allows encoding only the child array + values which are actually referred to by the union array. This is + counterbalanced by the additional footprint of the offsets buffer, and + the additional indirection cost when looking up values. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : DenseUnionType + """ + +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None +) -> SparseUnionType: ... +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None +) -> DenseUnionType: ... +def union(*args, **kwargs): + """ + Create UnionType from child fields. + + A union is a nested type where each logical value is taken from a + single child. 
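# A minimal sketch, assuming pyarrow is installed: sparse_union() and
# dense_union() above build the same logical type with different physical
# layouts; the resulting UnionType records its mode and child type codes.
import pyarrow as pa

children = [pa.field("i", pa.int32()), pa.field("s", pa.string())]
dense = pa.dense_union(children)
sparse = pa.sparse_union(children, type_codes=[5, 7])
assert dense.mode == "dense"
assert sparse.type_codes == [5, 7]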
A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + Unions come in two flavors: sparse and dense + (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + mode : str + Must be 'sparse' or 'dense' + type_codes : list of integers, default None + + Returns + ------- + type : UnionType + """ + +def run_end_encoded( + run_end_type: _RunEndType, value_type: _BasicValueT +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: + """ + Create RunEndEncodedType from run-end and value types. + + Parameters + ---------- + run_end_type : pyarrow.DataType + The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. + value_type : pyarrow.DataType + The type of the values array. + + Returns + ------- + type : RunEndEncodedType + """ + +def json_(storage_type: DataType = ...) -> JsonType: + """ + Create instance of JSON extension type. + + Parameters + ---------- + storage_type : DataType, default pyarrow.string() + The underlying data type. Can be on of the following types: + string, large_string, string_view. + + Returns + ------- + type : JsonType + + Examples + -------- + Create an instance of JSON extension type: + + >>> import pyarrow as pa + >>> pa.json_(pa.utf8()) + JsonType(extension) + + Use the JSON type to create an array: + + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) + + [ + "{"a": 1}", + "{"b": 2}" + ] + """ + +def uuid() -> UuidType: + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + +def fixed_shape_tensor( + value_type: _ValueT, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + permutation: Sequence[int] | None = None, +) -> FixedShapeTensorType[_ValueT]: + """ + Create instance of fixed shape tensor extension type with shape and optional + names of tensor dimensions and indices of the desired logical + ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + shape : tuple or list of integers + The physical shape of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
+ + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a table with fixed shape tensor extension array: + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] + + Create an instance of fixed shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : FixedShapeTensorType + """ + +def bool8() -> Bool8Type: + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: + """ + Create instance of opaque extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + type_name : str + The name of the type in the external system. + vendor_name : str + The name of the external system. + + Examples + -------- + Create an instance of an opaque extension type: + + >>> import pyarrow as pa + >>> type = pa.opaque(pa.binary(), "other", "jdbc") + >>> type + OpaqueType(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(binary) + >>> type.type_name + 'other' + >>> type.vendor_name + 'jdbc' + + Create a table with an opaque array: + + >>> arr = [None, b"foobar"] + >>> storage = pa.array(arr, pa.binary()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[null,666F6F626172]] + + Returns + ------- + type : OpaqueType + """ + +@overload +def type_for_alias(name: Literal["null"]) -> NullType: ... +@overload +def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... 
+@overload +def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... +@overload +def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def type_for_alias(name: Literal["binary"]) -> BinaryType: ... +@overload +def type_for_alias( + name: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... +@overload +def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def type_for_alias(name): + """ + Return DataType given a string alias if one exists. + + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. + + Returns + ------- + type : DataType + """ + +@overload +def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... +@overload +def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... +@overload +def ensure_type(ty: Literal["null"]) -> NullType: ... +@overload +def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... +@overload +def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... 
+@overload +def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def ensure_type(ty: Literal["binary"]) -> BinaryType: ... +@overload +def ensure_type( + ty: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... +@overload +def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def schema( + fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], + metadata: dict[bytes | str, bytes | str] | None = None, +) -> Schema: + """ + Construct pyarrow.Schema from collection of fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Can also pass an object that implements the Arrow PyCapsule Protocol + for schemas (has an ``__arrow_c_schema__`` method). + metadata : dict, default None + Keys and values must be coercible to bytes. + + Examples + -------- + Create a Schema from iterable of tuples: + + >>> import pyarrow as pa + >>> pa.schema( + ... [ + ... ("some_int", pa.int32()), + ... ("some_string", pa.string()), + ... pa.field("some_required_string", pa.string(), nullable=False), + ... ] + ... ) + some_int: int32 + some_string: string + some_required_string: string not null + + Create a Schema from iterable of Fields: + + >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) + some_int: int32 + some_string: string + + DataTypes can also be passed as strings. 
The following is equivalent to the + above example: + + >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) + some_int: int32 + some_string: string + + Or more concisely: + + >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) + some_int: int32 + some_string: string + + Returns + ------- + schema : pyarrow.Schema + """ + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: + """ + Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : the numpy dtype to convert + + + Examples + -------- + Create a pyarrow DataType from NumPy dtype: + + >>> import pyarrow as pa + >>> import numpy as np + >>> pa.from_numpy_dtype(np.dtype("float16")) + DataType(halffloat) + >>> pa.from_numpy_dtype("U") + DataType(string) + >>> pa.from_numpy_dtype(bool) + DataType(bool) + >>> pa.from_numpy_dtype(np.str_) + DataType(string) + """ + +def is_boolean_value(obj: Any) -> bool: + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_integer_value(obj: Any) -> bool: + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_float_value(obj: Any) -> bool: + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ + +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "UInt8Type", + "Int8Type", + "UInt16Type", + "Int16Type", + "Uint32Type", + "Int32Type", + "UInt64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "ensure_metadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "tzinfo_to_string", + "string_to_tzinfo", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "type_for_alias", + "ensure_type", + "schema", + "from_numpy_dtype", + "is_boolean_value", + "is_integer_value", + "is_float_value", +] diff --git a/python/stubs/_azurefs.pyi b/python/stubs/_azurefs.pyi new file mode 100644 index 00000000000..317943ce20f --- /dev/null 
+++ b/python/stubs/_azurefs.pyi
@@ -0,0 +1,74 @@
+from typing import Literal
+
+from ._fs import FileSystem
+
+class AzureFileSystem(FileSystem):
+    """
+    Azure Blob Storage backed FileSystem implementation
+
+    This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
+    Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
+    features will be used when they provide a performance advantage. Azurite emulator is
+    also supported. Note: `/` is the only supported delimiter.
+
+    The storage account is considered the root of the filesystem. When enabled, containers
+    will be created or deleted during relevant directory operations. Obviously, this also
+    requires authentication with the additional permissions.
+
+    By default `DefaultAzureCredential `__
+    is used for authentication. This means it will try several types of authentication
+    and go with the first one that works. If any authentication parameters are provided when
+    initialising the FileSystem, they will be used instead of the default credential.
+
+    Parameters
+    ----------
+    account_name : str
+        Azure Blob Storage account name. This is the globally unique identifier for the
+        storage account.
+    account_key : str, default None
+        Account key of the storage account. If sas_token and account_key are None the
+        default credential will be used. The parameters account_key and sas_token are
+        mutually exclusive.
+    blob_storage_authority : str, default None
+        hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
+        for connecting to a local emulator, like Azurite.
+    dfs_storage_authority : str, default None
+        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+        `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite.
+    blob_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    dfs_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    sas_token : str, default None
+        SAS token for the storage account, used as an alternative to account_key. If sas_token
+        and account_key are None the default credential will be used. The parameters
+        account_key and sas_token are mutually exclusive.
+
+    Examples
+    --------
+    >>> from pyarrow import fs
+    >>> azure_fs = fs.AzureFileSystem(account_name="myaccount")
+    >>> azurite_fs = fs.AzureFileSystem(
+    ...     account_name="devstoreaccount1",
+    ...     account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+    ...     blob_storage_authority="127.0.0.1:10000",
+    ...     dfs_storage_authority="127.0.0.1:10000",
+    ...     blob_storage_scheme="http",
+    ...     dfs_storage_scheme="http",
+    ... )
+
+    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
+    """
+
+    def __init__(
+        self,
+        account_name: str,
+        account_key: str | None = None,
+        blob_storage_authority: str | None = None,
+        dfs_storage_authority: str | None = None,
+        blob_storage_scheme: Literal["http", "https"] = "https",
+        dfs_storage_scheme: Literal["http", "https"] = "https",
+        sas_token: str | None = None,
+    ) -> None: ...
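For reference, a minimal usage sketch of the stubbed class above (the container and file names here are hypothetical; running it assumes an Azurite emulator listening on 127.0.0.1:10000, and the account name/key are Azurite's well-known defaults already shown in the docstring example):

    from pyarrow import fs

    azurite = fs.AzureFileSystem(
        account_name="devstoreaccount1",
        account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
        blob_storage_authority="127.0.0.1:10000",
        dfs_storage_authority="127.0.0.1:10000",
        blob_storage_scheme="http",
        dfs_storage_scheme="http",
    )

    # Directory/file operations are inherited from the FileSystem base class.
    azurite.create_dir("demo")  # creates the "demo" container at the filesystem root
    with azurite.open_output_stream("demo/hello.txt") as out:
        out.write(b"hello azure\n")
    print(azurite.get_file_info("demo/hello.txt").size)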
diff --git a/python/stubs/_compute.pyi b/python/stubs/_compute.pyi new file mode 100644 index 00000000000..3d61ae42787 --- /dev/null +++ b/python/stubs/_compute.pyi @@ -0,0 +1,1721 @@ +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + +class Kernel(lib._Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ + +class Function(lib._Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... + + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ + @property + def arity(self) -> int: + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: + """ + The function kind. + """ + @property + def name(self) -> str: + """ + The function name. + """ + @property + def num_kernels(self) -> int: + """ + The number of kernels implementing this function. + """ + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. + """ + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + + def list_functions(self) -> list[str]: + """ + Return all function names in the registry. + """ + +class HashAggregateFunction(Function): ... +class HashAggregateKernel(Kernel): ... 
+class ScalarAggregateFunction(Function): ... +class ScalarAggregateKernel(Kernel): ... +class ScalarFunction(Function): ... +class ScalarKernel(Kernel): ... +class VectorFunction(Function): ... +class VectorKernel(Kernel): ... + +# ==================== _compute.pyx Option classes ==================== +class ArraySortOptions(FunctionOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + +class AssumeTimezoneOptions(FunctionOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + +class CastOptions(FunctionOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + +class CountOptions(FunctionOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + """ + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... + +class CumulativeOptions(FunctionOptions): + """ + Options for `cumulative_*` functions. 
+ + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class CumulativeSumOptions(FunctionOptions): + """ + Options for `cumulative_sum` function. + + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class DayOfWeekOptions(FunctionOptions): + """ + Options for the `day_of_week` function. + + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... + +class DictionaryEncodeOptions(FunctionOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. + Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + +class RunEndEncodeOptions(FunctionOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + +class ElementWiseAggregateOptions(FunctionOptions): + """ + Options for element-wise aggregate functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + """ + def __init__(self, *, skip_nulls: bool = True) -> None: ... + +class ExtractRegexOptions(FunctionOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class ExtractRegexSpanOptions(FunctionOptions): + """ + Options for the `extract_regex_span` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class FilterOptions(FunctionOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... + +class IndexOptions(FunctionOptions): + """ + Options for the `index` function. 
+ + Parameters + ---------- + value : Scalar + The value to search for. + """ + def __init__(self, value: lib.Scalar) -> None: ... + +class JoinOptions(FunctionOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ + @overload + def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload + def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... + +class ListSliceOptions(FunctionOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + +class ListFlattenOptions(FunctionOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + def __init__(self, recursive: bool = False) -> None: ... + +class MakeStructOptions(FunctionOptions): + """ + Options for the `make_struct` function. + + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + +class MapLookupOptions(FunctionOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + +class MatchSubstringOptions(FunctionOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + +class ModeOptions(FunctionOptions): + """ + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. 
+ skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class NullOptions(FunctionOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ + def __init__(self, *, nan_is_null: bool = False) -> None: ... + +class PadOptions(FunctionOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + """ + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + +class PairwiseOptions(FunctionOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ + def __init__(self, period: int = 1) -> None: ... + +class PartitionNthOptions(FunctionOptions): + """ + Options for the `partition_nth_indices` function. + + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ + def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... + +class WinsorizeOptions(FunctionOptions): + """ + Options for the `winsorize` function. + + Parameters + ---------- + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile's value. + """ + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + +class QuantileOptions(FunctionOptions): + """ + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. 
+ """ + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class RandomOptions(FunctionOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + +class RankOptions(FunctionOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + """ + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + +class RankQuantileOptions(FunctionOptions): + """ + Options for the `rank_quantile` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + """ + + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + +class PivotWiderOptions(FunctionOptions): + """ + Options for the `pivot_wider` function. + + Parameters + ---------- + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. 
+ """ + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + +class ReplaceSliceOptions(FunctionOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + """ + def __init__(self, start: int, stop: int, replacement: str) -> None: ... + +class ReplaceSubstringOptions(FunctionOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + +class RoundBinaryOptions(FunctionOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +class RoundOptions(FunctionOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + +class RoundTemporalOptions(FunctionOptions): + """ + Options for rounding temporal values. + + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. 
+
+        By default time is rounded to a multiple of units since
+        1970-01-01T00:00:00. By setting calendar_based_origin to true,
+        time will be rounded to number of units since the last greater
+        calendar unit.
+        For example: rounding to multiple of days since the beginning of the
+        month or to hours since the beginning of the day.
+        Exceptions: week and quarter are not used as greater units,
+        therefore days will be rounded to the beginning of the month not
+        week. Greater unit of week is a year.
+        Note that ceiling and rounding might change sorting order of an array
+        near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to
+        5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to
+        YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will
+        ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the
+        order of an already ordered array.
+    """
+    def __init__(
+        self,
+        multiple: int = 1,
+        unit: _DateTimeUint = "day",
+        *,
+        week_starts_monday: bool = True,
+        ceil_is_strictly_greater: bool = False,
+        calendar_based_origin: bool = False,
+    ) -> None: ...
+
+class RoundToMultipleOptions(FunctionOptions):
+    """
+    Options for rounding numbers to a multiple.
+
+    Parameters
+    ----------
+    multiple : numeric scalar, default 1.0
+        Multiple to round to. Should be a scalar of a type compatible
+        with the argument to be rounded.
+    round_mode : str, default "half_to_even"
+        Rounding and tie-breaking mode.
+        Accepted values are "down", "up", "towards_zero", "towards_infinity",
+        "half_down", "half_up", "half_towards_zero", "half_towards_infinity",
+        "half_to_even", "half_to_odd".
+    """
+    def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ...
+
+class ScalarAggregateOptions(FunctionOptions):
+    """
+    Options for scalar aggregations.
+
+    Parameters
+    ----------
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    """
+    def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ...
+
+class SelectKOptions(FunctionOptions):
+    """
+    Options for top/bottom k-selection.
+
+    Parameters
+    ----------
+    k : int
+        Number of leading values to select in sorted order
+        (i.e. the largest values if sort order is "descending",
+        the smallest otherwise).
+    sort_keys : sequence of (name, order) tuples
+        Names of field/column keys to sort the input on,
+        along with the order each field/column is sorted in.
+        Accepted values for `order` are "ascending", "descending".
+        The field name can be a string column name or expression.
+    """
+
+    def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ...
+
+class SetLookupOptions(FunctionOptions):
+    """
+    Options for the `is_in` and `index_in` functions.
+
+    Parameters
+    ----------
+    value_set : Array
+        Set of values to look for in the input.
+    skip_nulls : bool, default False
+        If False, nulls in the input are matched in the value_set just
+        like regular values.
+        If True, nulls in the input always fail matching.
+    """
+    def __init__(self, value_set: lib.Array, *, skip_nulls: bool = False) -> None: ...
+
+class SliceOptions(FunctionOptions):
+    """
+    Options for slicing.
+
+    Parameters
+    ----------
+    start : int
+        Index to start slicing at (inclusive).
+ stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... + +class SortOptions(FunctionOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... + +class SplitOptions(FunctionOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... + +class SplitPatternOptions(FunctionOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + +class StrftimeOptions(FunctionOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... + +class StrptimeOptions(FunctionOptions): + """ + Options for the `strptime` function. + + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... + +class StructFieldOptions(FunctionOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... 
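To make the connection between these option classes and the compute kernels they configure concrete, here is a minimal sketch (the column name and values are made up; the `pyarrow.compute` wrappers also accept the same settings as keyword shortcuts):

    import pyarrow as pa
    import pyarrow.compute as pc

    tbl = pa.table({"x": [3, None, 1, 2]})

    # An explicit SortOptions mirrors the sort_keys/null_placement keywords of pc.sort_indices.
    opts = pc.SortOptions(sort_keys=[("x", "descending")], null_placement="at_start")
    indices = pc.call_function("sort_indices", [tbl], options=opts)

    # StrptimeOptions backs the format/unit/error_is_null keywords of pc.strptime.
    ts = pc.strptime(
        pa.array(["2024-01-02", "oops"]), format="%Y-%m-%d", unit="s", error_is_null=True
    )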
+ +class TakeOptions(FunctionOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ + def __init__(self, boundscheck: bool = True) -> None: ... + +class TDigestOptions(FunctionOptions): + """ + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class TrimOptions(FunctionOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ + def __init__(self, characters: str) -> None: ... + +class Utf8NormalizeOptions(FunctionOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". + """ + + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + +class VarianceOptions(FunctionOptions): + """ + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class SkewOptions(FunctionOptions): + """ + Options for the `skew` and `kurtosis` functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + +class WeekOptions(FunctionOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). 
+ first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + """ + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + +def list_functions() -> list[str]: + """ + Return all function names in the global registry. + """ + +# ==================== _compute.pyx Udf ==================== + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +class _FunctionDoc(TypedDict): + summary: str + description: str + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. 
+ function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> + >>> def add_constant(ctx, array): + ... return pc.add(array, 1, memory_pool=ctx.memory_pool) + >>> + >>> func_name = "py_add_func" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.int64() + >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> answer = pc.call_function(func_name, [pa.array([20])]) + >>> answer + + [ + 21 + ] + """ + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined tabular function. + + This API is EXPERIMENTAL. + + A tabular function is one accepting a context argument of type + UdfContext and returning a generator of struct arrays. + The in_types argument must be empty and the out_type argument + specifies a schema. Each struct array must have field types + corresponding to the schema. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The only argument is the context argument of type + UdfContext. It must return a callable that + returns on each invocation a StructArray matching + the out_type, where an empty array indicates end. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + Must be an empty dictionary (reserved for future use). + out_type : Union[Schema, DataType] + Schema of the function's output, or a corresponding flat struct type. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined non-decomposable aggregate function. + + This API is EXPERIMENTAL. + + A non-decomposable aggregation function is a function that executes + aggregate operations on the whole data that it is aggregating. + In other words, non-decomposable aggregate function cannot be + split into consume/merge/finalize steps. + + This is often used with ordered or segmented aggregation where groups + can be emit before accumulating all of the input data. 
+ + Note that currently the size of any input column cannot exceed 2 GB + for a single segment (all groups combined). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return a Scalar matching the + out_type. + To define a varargs function, pass a callable that takes + *args. The in_type needs to match in type of inputs when + the function gets called. + function_name : str + Name of the function. This name must be unique, i.e., + there should only be one function registered with + this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import numpy as np + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple median udf" + >>> func_doc["description"] = "compute median" + >>> + >>> def compute_median(ctx, array): + ... return pa.scalar(np.median(array)) + >>> + >>> func_name = "py_compute_median" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.float64() + >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_compute_median' + >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) + >>> answer + + >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) + >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) + >>> result + pyarrow.Table + k: int64 + v_py_compute_median: double + ---- + k: [[1,2]] + v_py_compute_median: [[15,35]] + """ + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. Vector function is often used + when compute doesn't fit other more specific types of + functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). 
+    in_types : Dict[str, DataType]
+        A dictionary mapping function argument names to
+        their respective DataType.
+        The argument names will be used to generate
+        documentation for the function. The number of
+        arguments specified here determines the function
+        arity.
+    out_type : DataType
+        Output type of the function.
+    func_registry : FunctionRegistry
+        Optional function registry to use instead of the default global one.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>>
+    >>> func_doc = {}
+    >>> func_doc["summary"] = "percent rank"
+    >>> func_doc["description"] = "compute percent rank"
+    >>>
+    >>> def list_flatten_udf(ctx, x):
+    ...     return pc.list_flatten(x)
+    >>>
+    >>> func_name = "list_flatten_udf"
+    >>> in_types = {"array": pa.list_(pa.int64())}
+    >>> out_type = pa.int64()
+    >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type)
+    >>>
+    >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])])
+    >>> answer
+    <pyarrow.lib.Int64Array object at ...>
+    [
+      1,
+      2,
+      3,
+      4
+    ]
+    """
+
+class UdfContext:
+    """
+    Per-invocation function context/state.
+
+    This object will always be the first argument to a user-defined
+    function. It should not be used outside of a call to the function.
+    """
+
+    @property
+    def batch_length(self) -> int:
+        """
+        The common length of all input arguments (int).
+
+        In the case that all arguments are scalars, this value
+        is used to pass the "actual length" of the arguments,
+        e.g. because the scalar values are encoding a column
+        with a constant value.
+        """
+    @property
+    def memory_pool(self) -> lib.MemoryPool:
+        """
+        A memory pool for allocations (:class:`MemoryPool`).
+
+        This is the memory pool supplied by the user when they invoked
+        the function and it should be used in any calls to arrow that the
+        UDF makes if that call accepts a memory_pool.
+        """
+
+# ==================== _compute.pyx Expression ====================
+class Expression(lib._Weakrefable):
+    """
+    A logical expression to be evaluated against some input.
+
+    To create an expression:
+
+    - Use the factory function ``pyarrow.compute.scalar()`` to create a
+      scalar (not necessary when combined, see example below).
+    - Use the factory function ``pyarrow.compute.field()`` to reference
+      a field (column in table).
+    - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``.
+    - Combine expressions using python operators ``&`` (logical and),
+      ``|`` (logical or) and ``~`` (logical not).
+      Note: python keywords ``and``, ``or`` and ``not`` cannot be used
+      to combine expressions.
+    - Create expression predicates using Expression methods such as
+      ``pyarrow.compute.Expression.isin()``.
+
+    Examples
+    --------
+
+    >>> import pyarrow.compute as pc
+    >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7)
+    <pyarrow.compute.Expression ((a < 3) or (b > 7))>
+    >>> pc.field("a") != 3
+    <pyarrow.compute.Expression (a != 3)>
+    >>> pc.field("a").isin([1, 2, 3])
+    <pyarrow.compute.Expression is_in(a, {value_set=int64:[
+      1,
+      2,
+      3
+    ], null_matching_behavior=MATCH})>
+    """
+
+    @staticmethod
+    def from_substrait(buffer: bytes | lib.Buffer) -> Expression:
+        """
+        Deserialize an expression from Substrait
+
+        The serialized message must be an ExtendedExpression message that has
+        only a single expression. The name of the expression and the schema
+        the expression was bound to will be ignored. Use
+        pyarrow.substrait.deserialize_expressions if this information is needed
+        or if the message might contain multiple expressions.
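+
+        A minimal round-trip sketch (assuming a pyarrow build with Substrait
+        support; the schema and field name below are only illustrative)::
+
+            import pyarrow as pa
+            import pyarrow.compute as pc
+
+            # serialize a simple predicate, then deserialize it back
+            buf = (pc.field("a") > 1).to_substrait(pa.schema([("a", pa.int64())]))
+            expr = pc.Expression.from_substrait(buf)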
+ + Parameters + ---------- + message : bytes or Buffer or a protobuf Message + The Substrait message to deserialize + + Returns + ------- + Expression + The deserialized expression + """ + def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: + """ + Serialize the expression using Substrait + + The expression will be serialized as an ExtendedExpression message that has a + single expression named "expression" + + Parameters + ---------- + schema : Schema + The input schema the expression will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + but the result may not be accepted by other compute libraries. + + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. + """ + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ + def is_null(self, nan_is_null: bool = False) -> Expression: + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ + def is_nan(self) -> Expression: + """ + Check whether the expression is NaN. + + This creates a new expression equivalent to calling the + `is_nan` compute function on this expression. + + Returns + ------- + is_nan : Expression + """ + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + def isin(self, values: lib.Array | Iterable) -> Expression: + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. 
+ + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ + +# ==================== _compute.py ==================== diff --git a/python/stubs/_csv.pyi b/python/stubs/_csv.pyi new file mode 100644 index 00000000000..2f49f8c9a6c --- /dev/null +++ b/python/stubs/_csv.pyi @@ -0,0 +1,641 @@ +from dataclasses import dataclass, field +from typing import IO, Any, Callable, Literal + +from _typeshed import StrPath + +from . import lib + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + """ + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. + autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. 
+ + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] + """ + + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + """ + Options for parsing CSV files. + + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... "# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... 
) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return "skip" + ... else: + ... return "error" + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + """ + Options for converting CSV data. + + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). + false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. + If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. 
+ If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. + This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... ) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True + ... 
) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"], strings_can_be_null=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ + + check_utf8: bool = field(default=True, kw_only=False) + column_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + """ + Options for writing CSV files. + + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... 
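+
+# A minimal usage sketch for the options above (illustrative only; the table
+# contents and output path are made up, and ``write_csv`` is declared later in
+# this stub):
+#
+#   import pyarrow as pa
+#   from pyarrow import csv
+#
+#   table = pa.table({"s": ["a,b", "c"]})
+#   opts = csv.WriteOptions(delimiter=",", quoting_style="needed")
+#   csv.write_csv(table, "quoted.csv", write_options=opts)  # "a,b" gets quoted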
+ +@dataclass +class InvalidRow(lib._Weakrefable): + """ + Description of an invalid row in a CSV file. + + Parameters + ---------- + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. + """ + + expected_columns: int + actual_columns: int + number: int | None + text: str + +class CSVWriter(lib._CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + """ + + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class CSVStreamingReader(lib.RecordBatchReader): ... + +ISO8601: lib._Weakrefable + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.csv.CSVStreamingReader` + """ + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: + """ + Read a Table from a stream of CSV data. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. 
+ read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.Table` + Contents of the CSV file as a in-memory table. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry\\n" + ... "Flamingo,2,2022-03-01\\n" + ... "Horse,4,2022-03-02\\n" + ... "Brittle stars,5,2022-03-03\\n" + ... "Centipede,100,2022-03-04" + ... ) + >>> print(s) + animals,n_legs,entry + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Reading from the file + + >>> from pyarrow import csv + >>> csv.read_csv(source) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: + """ + Write record batch or table to a CSV file. + + Parameters + ---------- + data : pyarrow.RecordBatch or pyarrow.Table + The data to write. + output_file : string, path, pyarrow.NativeFile, or file-like object + The location where to write the CSV data. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + """ diff --git a/python/stubs/_cuda.pyi b/python/stubs/_cuda.pyi new file mode 100644 index 00000000000..ad52b2f380f --- /dev/null +++ b/python/stubs/_cuda.pyi @@ -0,0 +1,556 @@ +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + +class Context(lib._Weakrefable): + """ + CUDA driver context. + """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. + + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. 
+ handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. + """ + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + def to_numba(self) -> _numba_driver.Context: + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ + @staticmethod + def get_num_devices() -> int: + """Return the number of GPU devices.""" + @property + def device_number(self) -> int: + """Return context device number.""" + @property + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ + @property + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ + @property + def memory_manager(self) -> lib.MemoryManager: + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + @property + def device(self) -> lib.Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). 
+ + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. + + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + +class IpcMemHandle(lib._Weakrefable): + """A serializable container for a CUDA IPC handle.""" + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + +class CudaBuffer(lib.Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. 
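+
+        A minimal sketch (illustrative only; ``ctx`` is assumed to be an
+        existing :class:`Context`)::
+
+            # copy some host bytes to the device, then bring them back
+            cbuf = ctx.buffer_from_data(b"some host bytes")
+            host_buf = cbuf.copy_to_host()
+            host_buf.to_pybytes()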
+ + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + @property + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" + +class HostBuffer(lib.Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + @property + def size(self) -> int: ... + +class BufferReader(lib.NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. 
+ + The slice will start at the current reader position and will + have specified size in bytes. + + Parameters + ---------- + nbytes : int, default None + Specify the number of bytes to read. Default: None (read all + remaining bytes). + + Returns + ------- + cbuf : CudaBuffer + New device buffer. + + """ + +class BufferWriter(lib.NativeFile): + """File interface for writing to CUDA buffers. + + By default writes are unbuffered. Use set_buffer_size to enable + buffering. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: + """Write data to buffer starting from position. + + Parameters + ---------- + position : int + Specify device buffer position where the data will be + written. + data : array-like + Specify data, the data instance must implement buffer + protocol. + """ + @property + def buffer_size(self) -> int: + """Returns size of host (CPU) buffer, 0 for unbuffered""" + @buffer_size.setter + def buffer_size(self, buffer_size: int): + """Set CPU buffer size to limit calls to cudaMemcpy + + Parameters + ---------- + buffer_size : int + Specify the size of CPU buffer to allocate in bytes. + """ + @property + def num_bytes_buffered(self) -> int: + """Returns number of bytes buffered on host""" + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: + """Return buffer with CUDA-accessible memory on CPU host + + Parameters + ---------- + size : int + Specify the number of bytes to be allocated. + device : int + Specify GPU device number. + + Returns + ------- + dbuf : HostBuffer + Allocated host buffer + """ + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: + """Write record batch message to GPU device memory + + Parameters + ---------- + batch : RecordBatch + Record batch to write + ctx : Context + CUDA Context to allocate device memory from + + Returns + ------- + dbuf : CudaBuffer + device buffer which contains the record batch message + """ + +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: + """Read Arrow IPC message located on GPU device + + Parameters + ---------- + source : {CudaBuffer, cuda.BufferReader} + Device buffer or reader of device buffer. + pool : MemoryPool (optional) + Pool to allocate CPU memory for the metadata + + Returns + ------- + message : Message + The deserialized message, body still on device + """ + +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: + """Construct RecordBatch referencing IPC message located on CUDA device. + + While the metadata is copied to host memory for deserialization, + the record batch data remains on the device. 
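+
+    A typical flow (an illustrative sketch; ``batch`` and ``ctx`` are assumed
+    to already exist) pairs this with :func:`serialize_record_batch`::
+
+        # batch is serialized into device memory, then reconstructed in place
+        dbuf = serialize_record_batch(batch, ctx)
+        on_device = read_record_batch(dbuf, batch.schema)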
+ + Parameters + ---------- + buffer : + Device buffer containing the complete IPC message + schema : Schema + The schema for the record batch + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + pool : MemoryPool (optional) + Pool to allocate metadata from + + Returns + ------- + batch : RecordBatch + Reconstructed record batch, with device pointers + + """ diff --git a/python/stubs/_dataset.pyi b/python/stubs/_dataset.pyi new file mode 100644 index 00000000000..af864f9154b --- /dev/null +++ b/python/stubs/_dataset.pyi @@ -0,0 +1,2299 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + TypeVar, + overload, +) + +from _typeshed import StrPath + +from . import _csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + +class Dataset(lib._Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + + @property + def partition_expression(self) -> Expression: + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + def replace_schema(self, schema: lib.Schema) -> None: + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + def get_fragments(self, filter: Expression | None = None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner( + ... columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... } + ... 
).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. 
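+
+        For example (an illustrative sketch; ``dataset`` is any Dataset with an
+        "animal" column, such as the one built in the :meth:`scanner` examples)::
+
+            dataset.take([0, 2], columns=["animal"])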
+ + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. 
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
+        use_threads : bool, default True
+            If enabled, then maximum parallelism will be used determined by
+            the number of available CPU cores.
+        cache_metadata : bool, default True
+            If enabled, metadata may be cached when scanning to speed up
+            repeated scans.
+        memory_pool : MemoryPool, default None
+            For memory allocations, if required. If not specified, uses the
+            default pool.
+
+        Returns
+        -------
+        count : int
+        """
+    @property
+    def schema(self) -> lib.Schema:
+        """The common schema of the full Dataset"""
+    def filter(self, expression: Expression) -> Self:
+        """
+        Apply a row filter to the dataset.
+
+        Parameters
+        ----------
+        expression : Expression
+            The filter that should be applied to the dataset.
+
+        Returns
+        -------
+        Dataset
+        """
+    def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset:
+        """
+        Sort the Dataset by one or multiple columns.
+
+        Parameters
+        ----------
+        sorting : str or list[tuple(name, order)]
+            Name of the column to use to sort (ascending), or
+            a list of multiple sorting conditions where
+            each entry is a tuple with column name
+            and sorting order ("ascending" or "descending")
+        **kwargs : dict, optional
+            Additional sorting options.
+            As allowed by :class:`SortOptions`
+
+        Returns
+        -------
+        InMemoryDataset
+            A new dataset sorted according to the sort keys.
+        """
+    def join(
+        self,
+        right_dataset: Dataset,
+        keys: str | list[str],
+        right_keys: str | list[str] | None = None,
+        join_type: JoinType = "left outer",
+        left_suffix: str | None = None,
+        right_suffix: str | None = None,
+        coalesce_keys: bool = True,
+        use_threads: bool = True,
+    ) -> InMemoryDataset:
+        """
+        Perform a join between this dataset and another one.
+
+        Result of the join will be a new dataset, where further
+        operations can be applied.
+
+        Parameters
+        ----------
+        right_dataset : dataset
+            The dataset to join to the current one, acting as the right dataset
+            in the join operation.
+        keys : str or list[str]
+            The columns from current dataset that should be used as keys
+            of the join operation left side.
+        right_keys : str or list[str], default None
+            The columns from the right_dataset that should be used as keys
+            on the join operation right side.
+            When ``None`` use the same key names as the left dataset.
+        join_type : str, default "left outer"
+            The kind of join that should be performed, one of
+            ("left semi", "right semi", "left anti", "right anti",
+            "inner", "left outer", "right outer", "full outer")
+        left_suffix : str, default None
+            Which suffix to add to the left column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        right_suffix : str, default None
+            Which suffix to add to the right column names. This prevents confusion
+            when the columns in left and right datasets have colliding names.
+        coalesce_keys : bool, default True
+            If the duplicated keys should be omitted from one of the sides
+            in the join result.
+        use_threads : bool, default True
+            Whether to use multithreading or not.
+
+        Returns
+        -------
+        InMemoryDataset
+        """
+    def join_asof(
+        self,
+        right_dataset: Dataset,
+        on: str,
+        by: str | list[str],
+        tolerance: int,
+        right_on: str | list[str] | None = None,
+        right_by: str | list[str] | None = None,
+    ) -> InMemoryDataset:
+        """
+        Perform an asof join between this dataset and another one.
+ + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both datasets must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + on : str + The column from current dataset that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input table must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current dataset that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row `right.on - left.on <= tolerance`. The + `tolerance` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_dataset that should be used as the on key + on the join operation right side. + When ``None`` use the same key name as the left dataset. + right_by : str or list[str], default None + The columns from the right_dataset that should be used as by keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + + Returns + ------- + InMemoryDataset + """ + +class InMemoryDataset(Dataset): + """ + A Dataset wrapping in-memory data. + + Parameters + ---------- + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader + If an iterable is provided, the schema must also be provided. + schema : Schema, optional + Only required if passing an iterable as the source + """ + +class UnionDataset(Dataset): + """ + A Dataset wrapping child datasets. + + Children's schemas must agree with the provided schema. + + Parameters + ---------- + schema : Schema + A known schema to conform to. + children : list of Dataset + One or more input children + """ + + @property + def children(self) -> list[Dataset]: ... + +class FileSystemDataset(Dataset): + """ + A Dataset of file fragments. + + A FileSystemDataset is composed of one or more FileFragment. + + Parameters + ---------- + fragments : list[Fragments] + List of fragments to consume. + schema : Schema + The top-level schema of the Dataset. + format : FileFormat + File format of the fragments, currently only ParquetFileFormat, + IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + FileSystem of the fragments. + root_partition : Expression, optional + The top-level partition of the DataDataset. 
+ """ + + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: SupportedFileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: SupportedFileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: + """ + A Dataset created from a list of paths on a particular filesystem. + + Parameters + ---------- + paths : list of str + List of file paths to create the fragments from. + schema : Schema + The top-level schema of the DataDataset. + format : FileFormat + File format to create fragments from, currently only + ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + The filesystem which files are from. + partitions : list[Expression], optional + Attach additional partition information for the file paths. + root_partition : Expression, optional + The top-level partition of the DataDataset. + """ + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: + """ + The partitioning of the Dataset source, if discovered. + + If the FileSystemDataset is created using the ``dataset()`` factory + function with a partitioning specified, this will return the + finalized Partitioning object from the dataset discovery. In all + other cases, this returns None. + """ + @property + def files(self) -> list[str]: + """List of the files""" + @property + def format(self) -> FileFormat: + """The FileFormat of this source.""" + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... 
+ @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + +class Fragment(lib._Weakrefable): + """Fragment of data from a Dataset.""" + @property + def physical_schema(self) -> lib.Schema: + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" + @property + def partition_expression(self) -> Expression: + """An Expression which evaluates to true for all data viewed by this + Fragment. + """ + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. 
+ cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. 
+ columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. 
Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + +class FileFragment(Fragment): + """A Fragment representing a data file.""" + + def open(self) -> lib.NativeFile: + """ + Open a NativeFile of the buffer or file viewed by this fragment. + """ + @property + def path(self) -> str: + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ + @property + def filesystem(self) -> FileSystem: + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ + @property + def buffer(self) -> lib.Buffer: + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. + """ + @property + def format(self) -> FileFormat: + """ + The format of the data file viewed by this fragment. + """ + +class FragmentScanOptions(lib._Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + + @property + def type_name(self) -> str: ... + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + +class FeatherFileFormat(IpcFileFormat): ... + +class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + def __init__( + self, + parse_options: _csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: _csv.ConvertOptions | None = None, + read_options: _csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> _csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. 
+ read_options : pyarrow.csv.ReadOptions + General read options. + """ + + convert_options: _csv.ConvertOptions + read_options: _csv.ReadOptions + + def __init__( + self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + +class CsvFileWriteOptions(FileWriteOptions): + write_options: _csv.WriteOptions + +class JsonFileFormat(FileFormat): + """ + FileFormat for JSON files. + + Parameters + ---------- + default_fragment_scan_options : JsonFragmentScanOptions + Default options for fragments scan. + parse_options : pyarrow.json.ParseOptions + Options regarding json parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + +class JsonFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for JSON fragments. + + Parameters + ---------- + parse_options : pyarrow.json.ParseOptions + Options regarding JSON parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + def __init__( + self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: + """ + Parse a path into a partition expression. + + Parameters + ---------- + path : str + + Returns + ------- + pyarrow.dataset.Expression + """ + def format(self, expr: Expression) -> tuple[str, str]: + """ + Convert a filter expression into a tuple of (directory, filename) using + the current partitioning scheme + + Parameters + ---------- + expr : pyarrow.dataset.Expression + + Returns + ------- + tuple[str, str] + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> import pyarrow.compute as pc + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) + >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) + ('1862/Jan', '') + """ + @property + def schema(self) -> lib.Schema: + """The arrow Schema attached to the partitioning.""" + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[lib.Array | None]: + """ + The unique values for each partition field, if available. + + Those values are only available if the Partitioning object was + created through dataset discovery from a PartitioningFactory, or + if the dictionaries were manually specified in the constructor. + If no dictionary field is available, this returns an empty list. + """ + +class DirectoryPartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The DirectoryPartitioning expects one segment in the file path for each + field in the schema (all fields are required to be present). + For example given schema the path "/2009/11" would + be parsed to ("year"_ == 2009 and "month"_ == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. 
+ dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + DirectoryPartitioning + + Examples + -------- + >>> from pyarrow.dataset import DirectoryPartitioning + >>> partitioning = DirectoryPartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("/2009/11/")) + ((year == 2009) and (month == 11)) + """ + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a DirectoryPartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + +class HivePartitioning(KeyValuePartitioning): + """ + A Partitioning for "/$key=$value/" nested directories as found in + Apache Hive. + + Multi-level, directory based partitioning scheme originating from + Apache Hive with all data files stored in the leaf directories. Data is + partitioned by static values of a particular column in the schema. + Partition keys are represented in the form $key=$value in directory names. + Field order is ignored, as are missing or unrecognized field names. + + For example, given schema, a possible + path would be "/year=2009/month=11/day=15". + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. 
+ null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + If any field is None then this fallback will be used as a label + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + HivePartitioning + + Examples + -------- + >>> from pyarrow.dataset import HivePartitioning + >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/year=2009/month=11/")) + ((year == 2009) and (month == 11)) + + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a HivePartitioning. + + Parameters + ---------- + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain. This can be more efficient when + materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class FilenamePartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The FilenamePartitioning expects one segment in the file name for each + field in the schema (all fields are required to be present) separated + by '_'. For example given schema the name + ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + FilenamePartitioning + + Examples + -------- + >>> from pyarrow.dataset import FilenamePartitioning + >>> partitioning = FilenamePartitioning( + ... 
pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class DatasetFactory(lib._Weakrefable): + """ + DatasetFactory is used to create a Dataset, inspect the Schema + of the fragments contained in it, and declare a partitioning. + """ + + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: + """ + Create a Dataset using the inspected schema or an explicit schema + (if given). + + Parameters + ---------- + schema : Schema, default None + The schema to conform the source to. If None, the inspected + schema is used. + + Returns + ------- + Dataset + """ + def inspect(self) -> lib.Schema: + """ + Inspect all data fragments and return a common Schema. + + Returns + ------- + Schema + """ + def inspect_schemas(self) -> list[lib.Schema]: ... + +class FileSystemFactoryOptions(lib._Weakrefable): + """ + Influences the discovery of filesystem paths. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning/PartitioningFactory, optional + Apply the Partitioning to every discovered Fragment. See Partitioning or + PartitioningFactory documentation. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). + selector_ignore_prefixes : list, optional + When discovering from a Selector (and not from an explicit file list), + ignore files and directories matching any of these prefixes. + By default this is ['.', '_']. 
+
+    """
+
+    partitioning: Partitioning
+    partitioning_factory: PartitioningFactory
+    partition_base_dir: str
+    exclude_invalid_files: bool
+    selector_ignore_prefixes: list[str]
+
+    def __init__(
+        self,
+        partition_base_dir: str | None = None,
+        partitioning: Partitioning | PartitioningFactory | None = None,
+        exclude_invalid_files: bool = True,
+        selector_ignore_prefixes: list[str] | None = None,
+    ) -> None: ...
+
+class FileSystemDatasetFactory(DatasetFactory):
+    """
+    Create a DatasetFactory from a list of paths with schema inspection.
+
+    Parameters
+    ----------
+    filesystem : pyarrow.fs.FileSystem
+        Filesystem to discover.
+    paths_or_selector : pyarrow.fs.FileSelector or list of path-likes
+        Either a Selector object or a list of path-like objects.
+    format : FileFormat
+        Currently only ParquetFileFormat and IpcFileFormat are supported.
+    options : FileSystemFactoryOptions, optional
+        Various flags influencing the discovery of filesystem paths.
+    """
+
+    def __init__(
+        self,
+        filesystem: SupportedFileSystem,
+        paths_or_selector: FileSelector,
+        format: FileFormat,
+        options: FileSystemFactoryOptions | None = None,
+    ) -> None: ...
+
+class UnionDatasetFactory(DatasetFactory):
+    """
+    Provides a way to inspect/discover a Dataset's expected schema before
+    materialization.
+
+    Parameters
+    ----------
+    factories : list of DatasetFactory
+    """
+    def __init__(self, factories: list[DatasetFactory]) -> None: ...
+
+_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch)
+
+class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]):
+    """An iterator over a sequence of record batches."""
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> _RecordBatchT: ...
+
+class TaggedRecordBatch(NamedTuple):
+    """
+    A combination of a record batch and the fragment it came from.
+
+    Parameters
+    ----------
+    record_batch : RecordBatch
+        The record batch.
+    fragment : Fragment
+        Fragment of the record batch.
+    """
+
+    record_batch: lib.RecordBatch
+    fragment: Fragment
+
+class TaggedRecordBatchIterator(lib._Weakrefable):
+    """An iterator over a sequence of record batches with fragments."""
+    def __iter__(self) -> Self: ...
+    def __next__(self) -> TaggedRecordBatch: ...
+
+class Scanner(lib._Weakrefable):
+    """A materialized scan operation with context and options bound.
+
+    A scanner is the class that glues the scan tasks, data fragments and data
+    sources together.
+    """
+    @staticmethod
+    def from_dataset(
+        dataset: Dataset,
+        *,
+        columns: list[str] | dict[str, Expression] | None = None,
+        filter: Expression | None = None,
+        batch_size: int = ...,
+        batch_readahead: int = 16,
+        fragment_readahead: int = 4,
+        fragment_scan_options: FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        cache_metadata: bool = True,
+        memory_pool: lib.MemoryPool | None = None,
+    ) -> Scanner:
+        """
+        Create Scanner from Dataset.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            Dataset to scan.
+        columns : list[str] or dict[str, Expression], default None
+            The columns to project. This can be a list of column names to
+            include (order and duplicates will be preserved), or a dictionary
+            with {new_column_name: expression} values for more advanced
+            projections.
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @overload + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch], + *, + schema: lib.Schema, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @overload + @staticmethod + def from_batches( + source: RecordBatchReader, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @staticmethod + def from_batches(*args, **kwargs): + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. + schema : Schema + The schema of the batches (required when passing a Python + iterator). + columns : list[str] or dict[str, Expression], default None + The columns to project. 
This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @property + def dataset_schema(self) -> lib.Schema: + """The schema with which batches will be read from fragments.""" + @property + def projected_schema(self) -> lib.Schema: + """ + The materialized schema of the data, accounting for projections. + + This is the schema of any data returned from the scanner. + """ + def to_batches(self) -> Iterator[lib.RecordBatch]: + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def scan_batches(self) -> TaggedRecordBatchIterator: + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + def to_table(self) -> lib.Table: + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Returns + ------- + Table + """ + def take(self, indices: Indices) -> lib.Table: + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. 
+ + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + + Returns + ------- + Table + """ + def head(self, num_rows: int) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + + Returns + ------- + Table + """ + def count_rows(self) -> int: + """ + Count rows matching the scanner filter. + + Returns + ------- + count : int + """ + def to_reader(self) -> RecordBatchReader: + """Consume this scanner as a RecordBatchReader. + + Returns + ------- + RecordBatchReader + """ + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: + """ + Extract partition keys (equality constraints between a field and a scalar) + from an expression as a dict mapping the field's name to its value. + + NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning + will be conjunctions of equality conditions and are accessible through this + function. Other subexpressions will be ignored. + + Parameters + ---------- + partition_expression : pyarrow.dataset.Expression + + Returns + ------- + dict + + Examples + -------- + + For example, an expression of + + is converted to {'part': 'A', 'year': 2016} + """ + +class WrittenFile(lib._Weakrefable): + """ + Metadata information about files written as + part of a dataset write operation + + Parameters + ---------- + path : str + Path to the file. + metadata : pyarrow.parquet.FileMetaData, optional + For Parquet files, the Parquet file metadata. + size : int + The size of the file in bytes. + """ + def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... + +def _filesystemdataset_write( + data: Scanner, + base_dir: StrPath, + basename_template: str, + filesystem: SupportedFileSystem, + partitioning: Partitioning, + file_options: FileWriteOptions, + max_partitions: int, + file_visitor: Callable[[str], None], + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], + max_open_files: int, + max_rows_per_file: int, + min_rows_per_group: int, + max_rows_per_group: int, + create_dir: bool, +): ... + +class _ScanNodeOptions(ExecNodeOptions): + def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + +class ScanNodeOptions(_ScanNodeOptions): + """ + A Source node which yields batches from a Dataset scan. + + This is the option class for the "scan" node factory. + + This node is capable of applying pushdown projections or filters + to the file readers which reduce the amount of data that needs to + be read (if supported by the file format). But note that this does not + construct associated filter or project nodes to perform the final + filtering or projection. Rather, you may supply the same filter + expression or projection to the scan node that you also supply + to the filter or project node. + + Yielded batches will be augmented with fragment/batch indices when + implicit_ordering=True to enable stable ordering for simple ExecPlans. + + Parameters + ---------- + dataset : pyarrow.dataset.Dataset + The table which acts as the data source. + **kwargs : dict, optional + Scan options. See `Scanner.from_dataset` for possible arguments. + require_sequenced_output : bool, default False + Batches are yielded sequentially, like single-threaded + implicit_ordering : bool, default False + Preserve implicit ordering of data. + """ + + def __init__( + self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs + ) -> None: ... 
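The Scanner annotations above (from_dataset/scanner, from_fragment, from_batches, to_table, to_reader) can be sanity-checked with a short usage sketch. This is only an illustration under assumed inputs: the output directory "data", the column names "year" and "value", and the sample rows are hypothetical, and Scanner.from_batches is shown separately because such scanners are single-use.

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Hypothetical in-memory data; "year" and "value" are made-up column names.
table = pa.table({"year": [2015, 2016, 2016], "value": [1.0, 2.0, 3.0]})
ds.write_dataset(table, "data", format="parquet")

dataset = ds.dataset("data", format="parquet")

# Project a column subset and let the filter be pushed down to the Parquet reader.
scanner = dataset.scanner(
    columns=["year", "value"],
    filter=pc.field("year") == 2016,
)
filtered = scanner.to_table()

# Scanners built with from_batches() can be consumed only once: hand them
# straight to a writer or read them once, e.g. as a RecordBatchReader.
once = ds.Scanner.from_batches(iter(filtered.to_batches()), schema=filtered.schema)
reader = once.to_reader()
print(reader.read_all().num_rows)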
diff --git a/python/stubs/_dataset_orc.pyi b/python/stubs/_dataset_orc.pyi new file mode 100644 index 00000000000..9c4ac04198f --- /dev/null +++ b/python/stubs/_dataset_orc.pyi @@ -0,0 +1,6 @@ +from ._dataset import FileFormat + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/stubs/_dataset_parquet.pyi b/python/stubs/_dataset_parquet.pyi new file mode 100644 index 00000000000..cbcc17235f1 --- /dev/null +++ b/python/stubs/_dataset_parquet.pyi @@ -0,0 +1,314 @@ +from dataclasses import dataclass +from typing import IO, Any, Iterable, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from .lib import CacheOptions, Schema, _Weakrefable + +parquet_encryption_enabled: bool + +class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option + """ + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + **kwargs, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + +class _NameStats(TypedDict): + min: Any + max: Any + +class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. + """ + + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... 
+ @property + def statistics(self) -> dict[str, _NameStats]: ... + +class ParquetFileFragment(FileFragment): + """A Fragment representing a parquet file.""" + + def ensure_complete_metadata(self) -> None: ... + @property + def row_groups(self) -> list[RowGroupInfo]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups viewed by this fragment (not the + number of row groups in the origin file). + """ + def split_by_row_group( + self, filter: Expression | None = None, schema: Schema | None = None + ) -> list[Fragment]: + """ + Split the fragment into multiple fragments. + + Yield a Fragment wrapping each row group in this ParquetFileFragment. + Row groups will be excluded whose metadata contradicts the optional + filter. + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + + Returns + ------- + A list of Fragments + """ + def subset( + self, + filter: Expression | None = None, + schema: Schema | None = None, + row_group_ids: list[int] | None = None, + ) -> ParquetFileFormat: + """ + Create a subset of the fragment (viewing a subset of the row groups). + + Subset can be specified by either a filter predicate (with optional + schema) or by a list of row group IDs. Note that when using a filter, + the resulting fragment can be empty (viewing no row groups). + + Parameters + ---------- + filter : Expression, default None + Only include the row groups which satisfy this predicate (using + the Parquet RowGroup statistics). + schema : Schema, default None + Schema to use when filtering row groups. Defaults to the + Fragment's physical schema + row_group_ids : list of ints + The row group IDs to include in the subset. Can only be specified + if `filter` is None. + + Returns + ------- + ParquetFileFragment + """ + +class ParquetReadOptions(_Weakrefable): + """ + Parquet format specific options for reading. + + Parameters + ---------- + dictionary_columns : list of string, default None + Names of columns which should be dictionary encoded as + they are read + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds + """ + def __init__( + self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None + ) -> None: ... + @property + def coerce_int96_timestamp_unit(self) -> str: ... + @coerce_int96_timestamp_unit.setter + def coerce_int96_timestamp_unit(self, unit: str) -> None: ... + def equals(self, other: ParquetReadOptions) -> bool: ... + +class ParquetFileWriteOptions(FileWriteOptions): + def update(self, **kwargs) -> None: ... + def _set_properties(self) -> None: ... + def _set_arrow_properties(self) -> None: ... + def _set_encryption_config(self) -> None: ... + +@dataclass(kw_only=True) +class ParquetFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for Parquet fragments. + + Parameters + ---------- + use_buffered_stream : bool, default False + Read files through buffered input streams rather than loading entire + row groups at once. This may be enabled to reduce memory overhead. + Disabled by default. 
+ buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + """ + + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False + +class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. + + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. 
+ filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + def __init__( + self, + metadata_path: str, + filesystem: SupportedFileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/python/stubs/_dataset_parquet_encryption.pyi b/python/stubs/_dataset_parquet_encryption.pyi new file mode 100644 index 00000000000..7623275b865 --- /dev/null +++ b/python/stubs/_dataset_parquet_encryption.pyi @@ -0,0 +1,85 @@ +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig +from .lib import _Weakrefable + +class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. 
+ decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/stubs/_feather.pyi b/python/stubs/_feather.pyi new file mode 100644 index 00000000000..8bb914ba45d --- /dev/null +++ b/python/stubs/_feather.pyi @@ -0,0 +1,29 @@ +from typing import IO + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + +class FeatherError(Exception): ... + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... diff --git a/python/stubs/_flight.pyi b/python/stubs/_flight.pyi new file mode 100644 index 00000000000..4450c42df49 --- /dev/null +++ b/python/stubs/_flight.pyi @@ -0,0 +1,1380 @@ +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + TimestampScalar, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: + """Create call options. + + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ + +class CertKeyPair(NamedTuple): + """A TLS certificate and key for use in Flight.""" + + cert: str + key: str + +class FlightError(Exception): + """ + The base class for Flight-specific errors. 
+ + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. + """ + + extra_info: bytes + +class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + +class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + +class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + +class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + +class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + +class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + +class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" + +class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + + limit: int + actual: int + +class Action(_Weakrefable): + """An action executable on a Flight service.""" + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: + """Create an action from a type and a buffer. + + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ + @property + def type(self) -> str: + """The action type.""" + @property + def body(self) -> Buffer: + """The action body (arguments for the action).""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + +class ActionType(NamedTuple): + """A type of action that is executable on a Flight service.""" + + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ + +class Result(_Weakrefable): + """A result from executing an Action.""" + def __init__(self, buf: Buffer | bytes) -> None: + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ + @property + def body(self) -> Buffer: + """Get the Buffer containing the result.""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + +class BasicAuth(_Weakrefable): + """A container for basic auth.""" + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + +class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. + + """ + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for a resource path.""" + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for an opaque command.""" + @property + def descriptor_type(self) -> DescriptorType: + """Get the type of this descriptor.""" + @property + def path(self) -> list[bytes] | None: + """Get the path for this descriptor.""" + @property + def command(self) -> bytes | None: + """Get the command for this descriptor.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + """The location of a Flight service.""" + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: + """Create a Location for a TCP-based gRPC service.""" + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: + """Create a Location for a TLS-based gRPC service.""" + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: + """Create a Location for a domain socket-based gRPC service.""" + +class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): + """Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + expiration_time : TimestampScalar, default None + Expiration time of this stream. If present, clients may assume + they can retry DoGet requests. Otherwise, clients should avoid + retrying DoGet requests. 
+ app_metadata : bytes or str, default "" + Application-defined opaque metadata. + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ + @property + def ticket(self) -> Ticket: + """Get the ticket in this endpoint.""" + @property + def locations(self) -> list[Location]: + """Get locations where this flight is available.""" + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: + """Get the expiration time of this stream. + + If present, clients may assume they can retry DoGet requests. + Otherwise, clients should avoid retrying DoGet requests. + + """ + @property + def app_metadata(self) -> bytes | str: + """Get application-defined opaque metadata.""" + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class SchemaResult(_Weakrefable): + """The serialized schema returned from a GetSchema request.""" + def __init__(self, schema: Schema) -> None: + """Create a SchemaResult from a schema. + + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int, default None + the total records in this flight, -1 or None if unknown. + total_bytes : int, default None + the total bytes in this flight, -1 or None if unknown. + ordered : boolean, default False + Whether endpoints are in the same order as the data. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + @property + def descriptor(self) -> FlightDescriptor: + """The descriptor of the data in this flight.""" + @property + def endpoints(self) -> list[FlightEndpoint]: + """The endpoints where this flight is available.""" + @property + def total_records(self) -> int: + """The total record count of this flight, or -1 if unknown.""" + @property + def total_bytes(self) -> int: + """The size in bytes of the data in this flight, or -1 if unknown.""" + @property + def ordered(self) -> bool: + """Whether endpoints are in the same order as the data.""" + @property + def app_metadata(self) -> bytes | str: + """ + Application-defined opaque metadata. + + There is no inherent or required relationship between this and the + app_metadata fields in the FlightEndpoints or resulting FlightData + messages. Since this metadata is application-defined, a given + application could define there to be a relationship, but there is + none required by the spec. + + """ + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
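A minimal sketch of how the FlightDescriptor, FlightEndpoint, and FlightInfo signatures annotated above fit together. It assumes pyarrow was built with Flight support; the command, ticket, and location values are made up for illustration, and no server is contacted.

import pyarrow as pa
import pyarrow.flight as flight

# Hypothetical schema, command, ticket, and location values.
schema = pa.schema([("year", pa.int32()), ("value", pa.float64())])
descriptor = flight.FlightDescriptor.for_command(b"select-2016")
endpoint = flight.FlightEndpoint(b"ticket-0", ["grpc://localhost:8815"])

# total_records and total_bytes are passed as -1 (unknown) here.
info = flight.FlightInfo(schema, descriptor, [endpoint], -1, -1)

# These message types round-trip through their serialized wire format,
# which is what serialize()/deserialize() in the stubs describe.
restored = flight.FlightInfo.deserialize(info.serialize())
print(restored.total_records, restored.descriptor.command)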
+ +class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: + """Get the schema for this reader.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + def read_chunk(self) -> FlightStreamChunk: + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. + + Raises + ------ + StopIteration + when the stream is finished + """ + def to_reader(self) -> RecordBatchReader: + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ + +class FlightStreamReader(MetadataRecordBatchReader): + """A reader that can also be canceled.""" + def cancel(self) -> None: + """Cancel the read operation.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + """A RecordBatchWriter that also allows writing application metadata. + + This class is a context manager; on exit, close() will be called. + """ + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: + """Prepare to write data to this stream with the given schema.""" + def write_metadata(self, buf: Buffer) -> None: + """Write Flight metadata by itself.""" + def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. 
+ """ + +class FlightStreamWriter(MetadataRecordBatchWriter): + """A writer that also allows closing the write side of a stream.""" + def done_writing(self) -> None: + """Indicate that the client is done writing, but not done reading.""" + +class FlightMetadataReader(_Weakrefable): + """A reader for Flight metadata messages sent during a DoPut.""" + def read(self) -> Buffer | None: + """Read the next metadata message.""" + +class FlightMetadataWriter(_Weakrefable): + """A sender for Flight metadata messages during a DoPut.""" + def write(self, message: Buffer) -> None: + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ + +class AsyncioCall(Generic[_T]): + """State for an async RPC using asyncio.""" + + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + +class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: + """Block until the server can be contacted. + + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." 
+ ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: + """Authenticate to the server. + + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. + """ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: + """List the actions available on a service.""" + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: + """ + Execute an action on a service. + + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: + """List the flights available on a service.""" + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: + """Request information about an available flight.""" + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: + """Request schema for an available flight.""" + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Upload data to a flight. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + def close(self) -> None: + """Close the client and disconnect.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + +class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. 
+ + See Also + -------- + RecordBatchStream + GeneratorStream + """ + +class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. + + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. + """ + +class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. + + options : pyarrow.ipc.IpcWriteOptions, optional + """ + +class ServerCallContext(_Weakrefable): + """Per-call state/context.""" + def peer_identity(self) -> bytes: + """Get the identity of the authenticated peer. + + May be the empty string. + """ + def peer(self) -> str: + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: + """Check if the current RPC call has been canceled by the client.""" + def add_header(self, key: str, value: str) -> None: + """Add a response header.""" + def add_trailer(self, key: str, value: str) -> None: + """Add a response trailer.""" + def get_middleware(self, key: str) -> ServerMiddleware | None: + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. + """ + +class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" + def read(self) -> str: ... + +class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. + """ + def is_valid(self, token: str) -> bool: + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. 
+ + """ + +class ClientAuthHandler(_Weakrefable): + """Authentication plugin for a client.""" + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + def get_token(self) -> str: + """Get the auth token for a call.""" + +class CallInfo(NamedTuple): + """Information about a particular RPC for Flight middleware.""" + + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + def start_call(self, info: CallInfo) -> ClientMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. + + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ + +class ClientMiddleware(_Weakrefable): + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). 
+ + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ + +class ServerMiddleware(_Weakrefable): + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + +class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. If the server is still active at interpreter + exit, the process may segfault. + """ + + def finalize(self) -> None: ... + +class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. 
+ If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. + root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). + + """ + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + @property + def port(self) -> int: + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. 
+ + """ + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + def serve(self) -> None: + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + def run(self) -> None: + """Block until the server shuts down. + + .. deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + def shutdown(self) -> None: + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + def wait(self) -> None: + """Block until server is terminated with shutdown.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, +) -> FlightClient: + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. 
Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. + generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi new file mode 100644 index 00000000000..7670ef5230d --- /dev/null +++ b/python/stubs/_fs.pyi @@ -0,0 +1,1005 @@ +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Union, overload + +from fsspec import AbstractFileSystem # type: ignore[import-untyped] + +from .lib import NativeFile, _Weakrefable + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + +class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + "/pyarrow-fs-example.dat" + >>> with local.open_output_stream(path_fs) as stream: + ... 
stream.write(b"data") + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + @property + def type(self) -> FileType: + """ + Type of the file. + + The returned enum values can be the following: + + - FileType.NotFound: target does not exist + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character device, or + Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: + """ + The full file path in the filesystem. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ + @property + def base_name(self) -> str: + """ + The file base name. + + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ + @property + def size(self) -> int: + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ + @property + def extension(self) -> str: + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ + @property + def mtime(self) -> dt.datetime | None: + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ + @property + def mtime_ns(self) -> int | None: + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. + + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + +class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. 
+ + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector( + ... local_path + "/missing", recursive=True, allow_not_found=True + ... ) + >>> local.get_file_info(selector_not_found) + [] + """ + + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + +class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + + @classmethod + def from_uri(cls, uri: str) -> tuple[Self, str]: + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. + + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + def equals(self, other: FileSystem) -> bool: + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ + @property + def type_name(self) -> str: + """ + The filesystem's type name. + """ + @overload + def get_file_info(self, paths_or_selector: str) -> FileInfo: ... + @overload + def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... + def get_file_info(self, paths_or_selector): + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. + + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + def create_dir(self, path: str, *, recursive: bool = True) -> None: + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. 
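+
+        Examples
+        --------
+        Create a directory together with its missing parents (the path is
+        illustrative):
+
+        >>> local.create_dir("/tmp/new_folder/nested")
+        >>> local.delete_dir("/tmp/new_folder")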
+ """ + def delete_dir(self, path: str) -> None: + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + def move(self, src: str, dest: str) -> None: + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. + + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir("/tmp/other_dir") + >>> local.copy_file(path, "/tmp/move_example.dat") + + Move the file: + + >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") + + Inspect the file info: + + >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") + + >>> local.get_file_info("/tmp/move_example.dat") + + + Delete the folder: + >>> local.delete_dir("/tmp/other_dir") + """ + def copy_file(self, src: str, dest: str) -> None: + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") + + Inspect the file info: + + >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") + + >>> local.get_file_info(path) + + """ + def delete_file(self, path: str) -> None: + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + def open_input_file(self, path: str) -> NativeFile: + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data' + """ + def open_input_stream( + self, path: str, compression: str | None = "detect", buffer_size: int | None = None + ) -> NativeFile: + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b"data") + 4 + """ + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b"+newly added") + 12 + + Print out the content to the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + def normalize_path(self, path: str) -> str: + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ + +class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. 
+ + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: + ... print(stream.readall()) + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: + ... f.write(b"+newly added") + 12 + + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: + ... print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.delete_dir("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.delete_dir_contents("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.delete_file("/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.get_file_info("/tmp/new_folder") + + + Move the file: + + >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") + + >>> local.get_file_info("/tmp/local_fs-copy.dat") + + + To finish delete the file left: + >>> local.delete_file("/tmp/local_fs.dat") + """ + + def __init__(self, *, use_mmap: bool = False) -> None: ... + +class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. 
+ + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir("/tmp/sub_tree") + >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) + + Write data into the existing file: + + >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: + ... f.write(b"+newly added") + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info("") + + >>> subtree.get_file_info("sub_tree_fs.dat") + + + Delete the file and directory: + + >>> subtree.delete_file("sub_tree_fs.dat") + >>> local.delete_dir("/tmp/sub_tree") + >>> local.delete_file("/tmp/local_fs.dat") + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + +class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. + + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info("README.md") # doctest: +SKIP + + """ + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ + +class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. + """ + @abstractmethod + def get_type_name(self) -> str: + """ + Implement PyFileSystem.type_name. + """ + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ + @abstractmethod + def delete_dir(self, path: str) -> None: + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. 
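+
+        For example, a handler backed by the local filesystem could simply
+        delegate to :mod:`shutil` (illustrative sketch, other methods omitted):
+
+        >>> import shutil
+        >>> class LocalHandler(FileSystemHandler):  # doctest: +SKIP
+        ...     def delete_dir(self, path):
+        ...         shutil.rmtree(path)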
+ """ + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ + @abstractmethod + def delete_root_dir_contents(self) -> None: + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ + @abstractmethod + def delete_file(self, path: str) -> None: + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ + @abstractmethod + def move(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + @abstractmethod + def normalize_path(self, path: str) -> str: + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. + """ diff --git a/python/stubs/_gcsfs.pyi b/python/stubs/_gcsfs.pyi new file mode 100644 index 00000000000..4fc7ea68e48 --- /dev/null +++ b/python/stubs/_gcsfs.pyi @@ -0,0 +1,83 @@ +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. 
Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: + """ + The GCP location this filesystem will write to. + """ + @property + def project_id(self) -> str: + """ + The GCP project id this filesystem will use. + """ diff --git a/python/stubs/_hdfs.pyi b/python/stubs/_hdfs.pyi new file mode 100644 index 00000000000..200f669379b --- /dev/null +++ b/python/stubs/_hdfs.pyi @@ -0,0 +1,75 @@ +from _typeshed import StrPath + +from ._fs import FileSystem + +class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. + kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. 
+ extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem( + ... host, port, user=user, kerb_ticket=ticket_cache_path + ... ) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__( + self, + host: str, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] + """ + Instantiate HadoopFileSystem object from an URI string. + + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. + + Returns + ------- + HadoopFileSystem + """ diff --git a/python/stubs/_json.pyi b/python/stubs/_json.pyi new file mode 100644 index 00000000000..43d2ae83cd8 --- /dev/null +++ b/python/stubs/_json.pyi @@ -0,0 +1,169 @@ +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + +class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + + use_threads: bool + """ + Whether to use multiple threads to accelerate reading. + """ + block_size: int + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ + +class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + + explicit_schema: Schema + """ + Optional explicit schema (no type inference, ignores other fields) + """ + newlines_in_values: bool + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. 
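+    For example, values pretty-printed with ``json.dumps(obj, indent=2)``
+    span multiple lines and can only be parsed when this is True.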
+ """ + unexpected_field_behavior: Literal["ignore", "error", "infer"] + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. + """ + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + +class JSONStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a JSON file. + + Should not be instantiated directly by user code. + """ + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: + """ + Read a Table from a stream of JSON data. + + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. + """ + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: + """ + Open a streaming reader of JSON data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of JSON data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see pyarrow.json.ReadOptions constructor + for defaults) + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see pyarrow.json.ParseOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.json.JSONStreamingReader` + """ diff --git a/python/stubs/_orc.pyi b/python/stubs/_orc.pyi new file mode 100644 index 00000000000..71bf0dde9ba --- /dev/null +++ b/python/stubs/_orc.pyi @@ -0,0 +1,56 @@ +from typing import IO, Literal + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... 
+ def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, + compression_block_size: int | None = None, + compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/stubs/_parquet.pyi b/python/stubs/_parquet.pyi new file mode 100644 index 00000000000..a9187df0428 --- /dev/null +++ b/python/stubs/_parquet.pyi @@ -0,0 +1,445 @@ +from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def hash_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... 
+ @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... 
+ @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnChunkMetaData: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + +class ColumnSchema(_Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> bool: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open( + self, + source: StrPath | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... + def iter_batches( + self, + batch_size: int, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Iterator[RecordBatch]: ... 
+ def read_row_group( + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def read_row_groups( + self, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Table: ... + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... + +class ParquetWriter(_Weakrefable): + def __init__( + self, + where: StrPath | NativeFile | IO, + schema: Schema, + use_dictionary: bool | list[str] | None = None, + compression: _Compression | dict[str, _Compression] | None = None, + version: str | None = None, + write_statistics: bool | list[str] | None = None, + memory_pool: MemoryPool | None = None, + use_deprecated_int96_timestamps: bool = False, + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = False, + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = False, + column_encoding: _Encoding | dict[str, _Encoding] | None = None, + writer_engine_version: str | None = None, + data_page_version: str | None = None, + use_compliant_nested_type: bool = True, + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: tuple[SortingColumn, ...] | None = None, + store_decimal_as_integer: bool = False, + ): ... + def close(self) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def use_dictionary(self) -> bool | list[str] | None: ... + @property + def use_deprecated_int96_timestamps(self) -> bool: ... + @property + def use_byte_stream_split(self) -> bool | list[str]: ... + @property + def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... + @property + def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... + @property + def allow_truncated_timestamps(self) -> bool: ... + @property + def compression(self) -> _Compression | dict[str, _Compression] | None: ... + @property + def compression_level(self) -> int | dict[str, int] | None: ... + @property + def data_page_version(self) -> str | None: ... + @property + def use_compliant_nested_type(self) -> bool: ... + @property + def version(self) -> str | None: ... + @property + def write_statistics(self) -> bool | list[str] | None: ... + @property + def writer_engine_version(self) -> str: ... + @property + def row_group_size(self) -> int: ... + @property + def data_page_size(self) -> int: ... + @property + def encryption_properties(self) -> FileDecryptionProperties: ... + @property + def write_batch_size(self) -> int: ... + @property + def dictionary_pagesize_limit(self) -> int: ... + @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... + +class FileEncryptionProperties: ... +class FileDecryptionProperties: ... 
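+
+# Illustrative round-trip using the public ``pyarrow.parquet`` wrappers that
+# build on the classes above (file name and table contents are placeholders):
+#
+#   import pyarrow as pa
+#   import pyarrow.parquet as pq
+#
+#   table = pa.table({"x": [1, 2, 3]})
+#   pq.write_table(table, "data.parquet")
+#   assert pq.read_table("data.parquet").equals(table)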
diff --git a/python/stubs/_parquet_encryption.pyi b/python/stubs/_parquet_encryption.pyi new file mode 100644 index 00000000000..c707edb844a --- /dev/null +++ b/python/stubs/_parquet_encryption.pyi @@ -0,0 +1,67 @@ +import datetime as dt + +from typing import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/stubs/_s3fs.pyi b/python/stubs/_s3fs.pyi new file mode 100644 index 00000000000..fc13c498bd9 --- /dev/null +++ b/python/stubs/_s3fs.pyi @@ -0,0 +1,74 @@ +import enum + +from typing import Literal, NotRequired, Required, TypedDict + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class _ProxyOptions(TypedDict): + schema: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ... +def ensure_s3_finalized() -> None: ... +def resolve_s3_region(bucket: str) -> str: ... 
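+
+# Illustrative use of the helpers above (the bucket name is taken from the
+# docs elsewhere in this package; any accessible bucket works):
+#
+#   from pyarrow import fs
+#
+#   region = fs.resolve_s3_region("usgs-landsat")
+#   s3 = fs.S3FileSystem(region=region, anonymous=True)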
+ +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + check_directory_existence_before_creation: bool = False, + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi new file mode 100644 index 00000000000..c259513f1ea --- /dev/null +++ b/python/stubs/_stubs_typing.pyi @@ -0,0 +1,80 @@ +import datetime as dt + +from collections.abc import Sequence +from decimal import Decimal +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar + +import numpy as np + +from numpy.typing import NDArray + +from .compute import BooleanArray, IntegerArray + +ArrayLike: TypeAlias = Any +ScalarLike: TypeAlias = Any +Order: TypeAlias = Literal["ascending", "descending"] +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] +Compression: TypeAlias = Literal[ + "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" +] +NullEncoding: TypeAlias = Literal["mask", "encode"] +NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] +Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray +Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray +PyScalar: TypeAlias = ( + bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta +) + +_T = TypeVar("_T") +SingleOrList: TypeAlias = list[_T] | _T + +class SupportEq(Protocol): + def __eq__(self, other) -> bool: ... + +class SupportLt(Protocol): + def __lt__(self, other) -> bool: ... + +class SupportGt(Protocol): + def __gt__(self, other) -> bool: ... + +class SupportLe(Protocol): + def __le__(self, other) -> bool: ... + +class SupportGe(Protocol): + def __ge__(self, other) -> bool: ... + +FilterTuple: TypeAlias = ( + tuple[str, Literal["=", "==", "!="], SupportEq] + | tuple[str, Literal["<"], SupportLt] + | tuple[str, Literal[">"], SupportGt] + | tuple[str, Literal["<="], SupportLe] + | tuple[str, Literal[">="], SupportGe] + | tuple[str, Literal["in", "not in"], Collection] +) + +class Buffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportPyBuffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportArrowStream(Protocol): + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + +class SupportArrowArray(Protocol): + def __arrow_c_array__(self, requested_schema=None) -> Any: ... 
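# A hedged sketch of constructing the S3FileSystem annotated above. The bucket
# name and region are illustrative; listing requires network access and, for
# private buckets, credentials.
from pyarrow import fs

# Anonymous, read-only access to a public bucket in a known region.
s3 = fs.S3FileSystem(anonymous=True, region="us-east-2")

selector = fs.FileSelector("example-public-bucket/some/prefix", recursive=False)
# infos = s3.get_file_info(selector)  # commented out: performs a network call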
+ +class SupportArrowDeviceArray(Protocol): + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + +class SupportArrowSchema(Protocol): + def __arrow_c_schema(self) -> Any: ... diff --git a/python/stubs/_substrait.pyi b/python/stubs/_substrait.pyi new file mode 100644 index 00000000000..ff226e9521b --- /dev/null +++ b/python/stubs/_substrait.pyi @@ -0,0 +1,39 @@ +from typing import Any, Callable + +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + +def run_query( + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, +) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... + +class SubstraitSchema: + schema: Schema + expression: Expression + def __init__(self, schema: Schema, expression: Expression) -> None: ... + def to_pysubstrait(self) -> Any: ... + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes) -> Schema: ... +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... + @classmethod + def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... diff --git a/python/stubs/acero.pyi b/python/stubs/acero.pyi new file mode 100644 index 00000000000..8a520bdc24a --- /dev/null +++ b/python/stubs/acero.pyi @@ -0,0 +1,85 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . import lib +from .compute import Expression, FunctionOptions + +_StrOrExpr: TypeAlias = str | Expression + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: list[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + +class ExecNodeOptions(lib._Weakrefable): ... + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table) -> None: ... + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression) -> None: ... + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: list[tuple[list[str], str, FunctionOptions, str]], + keys: list[_StrOrExpr] | None = None, + ) -> None: ... + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... 
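# A minimal sketch of chaining the acero Declarations annotated above
# (table_source -> filter -> project); column names and data are illustrative.
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import acero

table = pa.table({"key": ["a", "a", "b"], "value": [1, 2, 3]})

decl = acero.Declaration.from_sequence(
    [
        acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
        acero.Declaration("filter", acero.FilterNodeOptions(pc.field("value") > 1)),
        acero.Declaration(
            "project", acero.ProjectNodeOptions([pc.field("value")], ["kept_value"])
        ),
    ]
)
result = decl.to_table()  # pyarrow.Table with the single projected column "kept_value"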
+ +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... diff --git a/python/stubs/benchmark.pyi b/python/stubs/benchmark.pyi new file mode 100644 index 00000000000..048973301dc --- /dev/null +++ b/python/stubs/benchmark.pyi @@ -0,0 +1,3 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull + +__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/stubs/cffi.pyi b/python/stubs/cffi.pyi new file mode 100644 index 00000000000..2ae945c5974 --- /dev/null +++ b/python/stubs/cffi.pyi @@ -0,0 +1,4 @@ +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi new file mode 100644 index 00000000000..8d8fc35b134 --- /dev/null +++ b/python/stubs/compute.pyi @@ -0,0 +1,7779 @@ +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from collections.abc import Callable + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute 
import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions + +# Functions +from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: + """Reference a column of the dataset. 
+
+    Stores only the field's name. Type and other information is known only when
+    the expression is bound to a dataset having an explicit schema.
+
+    Nested references are allowed by passing multiple names or a tuple of
+    names. For example ``('foo', 'bar')`` references the field named "bar"
+    inside the field named "foo".
+
+    Parameters
+    ----------
+    *name_or_index : string, multiple strings, tuple or int
+        The name or index of the (possibly nested) field the expression
+        references to.
+
+    Returns
+    -------
+    field_expr : Expression
+        Reference to the given field
+
+    Examples
+    --------
+    >>> import pyarrow.compute as pc
+    >>> pc.field("a")
+    >>> pc.field(1)
+    >>> pc.field(("a", "b"))
+    >>> pc.field("a", "b")
+    """
+
+def scalar(value: bool | int | float | str) -> Expression:
+    """Expression representing a scalar value.
+
+    Creates an Expression object representing a scalar value that can be used
+    in compute expressions and predicates.
+
+    Parameters
+    ----------
+    value : bool, int, float or string
+        Python value of the scalar. This function accepts any value that can be
+        converted to a ``pyarrow.Scalar`` using ``pa.scalar()``.
+
+    Notes
+    -----
+    This function differs from ``pyarrow.scalar()`` in the following way:
+
+    * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents
+      a single value in Arrow's memory model.
+    * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing
+      a scalar value that can be used in compute expressions, predicates, and
+      dataset filtering operations.
+
+    Returns
+    -------
+    scalar_expr : Expression
+        An Expression representing the scalar value
+    """
+
+def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ...
+
+# ============= compute functions =============
+_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
+_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar)
+_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray)
+_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray)
+ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]
+ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT
+
+SignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.Int8Type]
+    | lib.Scalar[lib.Int16Type]
+    | lib.Scalar[lib.Int32Type]
+    | lib.Scalar[lib.Int64Type]
+)
+UnsignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.UInt8Type]
+    | lib.Scalar[lib.UInt16Type]
+    | lib.Scalar[lib.UInt32Type]
+    | lib.Scalar[lib.UInt64Type]
+)
+IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar
+FloatScalar: TypeAlias = (
+    lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type]
+)
+DecimalScalar: TypeAlias = (
+    lib.Scalar[lib.Decimal32Type]
+    | lib.Scalar[lib.Decimal64Type]
+    | lib.Scalar[lib.Decimal128Type]
+    | lib.Scalar[lib.Decimal256Type]
+)
+NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar
+NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar
+BinaryScalar: TypeAlias = (
+    lib.Scalar[lib.BinaryType]
+    | lib.Scalar[lib.LargeBinaryType]
+    | lib.Scalar[lib.FixedSizeBinaryType]
+)
+StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType]
+StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar
+_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any]
+_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT]
+ListScalar: TypeAlias
= ( + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] +# =============================== 1. Aggregation =============================== + +# ========================= 1.1 functions ========================= + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: + """ + Test whether all elements in a boolean array evaluate to true. + + Null values are ignored by default. + If the `skip_nulls` option is set to false, then Kleene logic is used. + See "kleene_and" for more details on Kleene logic. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. 
If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +any = _clone_signature(all) +""" +Test whether any element in a boolean array evaluates to true. + +Null values are ignored by default. +If the `skip_nulls` option is set to false, then Kleene logic is used. +See "kleene_or" for more details on Kleene logic. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Approximate median of a numeric array with T-Digest algorithm. + + Nulls and NaNs are ignored. + A null scalar is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of null / non-null values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of unique values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". 
+    options : pyarrow.compute.CountOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first(
+    array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT:
+    """
+    Compute the first value in each group.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first_last(
+    array: lib.Array[Any] | lib.ChunkedArray[Any],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar:
+    """
+    Compute the first and last values of an array.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def index(
+    data: lib.Array[Any] | lib.ChunkedArray[Any],
+    value,
+    start: int | None = None,
+    end: int | None = None,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar:
+    """
+    Find the index of the first occurrence of a given value.
+
+    Parameters
+    ----------
+    data : Array-like
+    value : Scalar-like object
+        The value to search for.
+    start : int, optional
+    end : int, optional
+    memory_pool : MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+
+    Returns
+    -------
+    index : int
+        the index, or -1 if not found
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
+    >>> pc.index(arr, "ipsum")
+    >>> pc.index(arr, "ipsum", start=2)
+    >>> pc.index(arr, "amet")
+    """
+
+last = _clone_signature(first)
+"""
+Compute the last value in each group.
+ +Null values are ignored by default. +If skip_nulls = false, then this will return the first and last values +regardless if it is null + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +max = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min_max = _clone_signature(first_last) +""" +Compute the minimum and maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def mean( + array: FloatScalar | FloatArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... 
+@overload +def mean( + array: lib.NumericArray[lib.Decimal128Scalar] + | lib.ChunkedArray[lib.Decimal128Scalar] + | lib.Decimal128Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal128Scalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal256Scalar] + | lib.ChunkedArray[lib.Decimal256Scalar] + | lib.Decimal256Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal256Scalar: ... +def mean(*args, **kwargs): + """ + Compute the mean of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + The result is a double for integer and floating point arguments, + and a decimal with the same bit-width/precision/scale for decimal arguments. + For integers and floats, NaN is returned if min_count = 0 and + there are no values. For decimals, null is returned instead. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: + """ + Compute the modal (most common) values of a numeric array. + + Compute the n most common values and their respective occurrence counts. + The output has type `struct`, where T is the + input type. + The results are ordered by descending `count` first, and ascending `mode` + when breaking ties. + Nulls are ignored. If there are no non-null values in the array, + an empty array is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ModeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: + """ + Compute the product of values in a numeric array. 
+ + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Compute an array of quantiles of a numeric array or chunked array. + + By default, 0.5 quantile (median) is returned. + If quantile lies between two data points, an interpolated value is + returned based on selected interpolation method. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.QuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the standard deviation of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population standard deviation is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. 
If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: + """ + Compute the sum of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Approximate quantiles of a numeric array with T-Digest algorithm. + + By default, 0.5 quantile (median) is returned. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.TDigestOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +def variance( + array: NumericScalar | NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the variance of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population variance is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. 
+ If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ + +# ========================= 2. Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= +@overload +def abs( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def abs( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... +@overload +def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def abs(*args, **kwargs): + """ + Calculate the absolute value of the argument element-wise. + + Results will wrap around on integer overflow. + Use function "abs_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +abs_checked = _clone_signature(abs) +""" +Calculate the absolute value of the argument element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "abs". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def add( + x: _NumericOrTemporalScalarT, + y: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: NumericOrTemporalScalar, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def add(*args, **kwargs): + """ + Add the arguments element-wise. + + Results will wrap around on integer overflow. + Use function "add_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +add_checked = _clone_signature(add) +""" +Add the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "add". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +""" + +@overload +def divide( + dividend: _NumericOrTemporalScalarT, + divisor: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: Expression, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
+@overload +def divide( + dividend: Expression, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def divide(*args, **kwargs): + """ + Divide the arguments element-wise. + + Integer division by zero returns an error. However, integer overflow + wraps around, and floating-point division by zero returns an infinite. + Use function "divide_checked" if you want to get an error + in all the aforementioned cases. + + Parameters + ---------- + dividend : Array-like or scalar-like + Argument to compute function. + divisor : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +divide_checked = _clone_signature(divide) +""" +Divide the arguments element-wise. + +An error is returned when trying to divide by zero, or when +integer overflow is encountered. + +Parameters +---------- +dividend : Array-like or scalar-like + Argument to compute function. +divisor : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def exp( + exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT: ... +@overload +def exp( + exponent: ArrayOrChunkedArray[NonFloatNumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +@overload +def exp( + exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatScalarT: ... +@overload +def exp( + exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.DoubleScalar: ... +@overload +def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def exp(*args, **kwargs): + """ + Compute Euler's number raised to the power of specified exponent, element-wise. + + If exponent is null the result will be null. + + Parameters + ---------- + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +multiply = _clone_signature(add) +""" +Multiply the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "multiply_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +multiply_checked = _clone_signature(add) +""" +Multiply the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "multiply". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def negate( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def negate( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... 
+@overload +def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def negate(*args, **kwargs): + """ + Negate the argument element-wise. + + Results will wrap around on integer overflow. + Use function "negate_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +negate_checked = _clone_signature(negate) +""" +Negate the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "negate". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def power( + base: _NumericScalarT, + exponent: _NumericScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def power( + base: _NumericArrayT, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: Expression, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: _NumericArrayT, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: Expression, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def power(*args, **kwargs): + """ + Raise arguments to power element-wise. + + Integer to negative integer power returns an error. However, integer overflow + wraps around. If either base or exponent is null the result will be null. + + Parameters + ---------- + base : Array-like or scalar-like + Argument to compute function. + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +power_checked = _clone_signature(power) +""" +Raise arguments to power element-wise. + +An error is returned when integer to negative integer power is encountered, +or integer overflow is encountered. + +Parameters +---------- +base : Array-like or scalar-like + Argument to compute function. +exponent : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def sign( + x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] +): ... +@overload +def sign( + x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... +@overload +def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
+def sign(*args, **kwargs): + """ + Get the signedness of the arguments element-wise. + + Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +@overload +def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... +@overload +def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... +@overload +def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sqrt(*args, **kwargs): + """ + Takes the square root of arguments element-wise. + + A negative argument returns a NaN. For a variant that returns an + error, use function "sqrt_checked". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +sqrt_checked = _clone_signature(sqrt) +""" +Takes the square root of arguments element-wise. + +A negative argument returns an error. For a variant that returns a +NaN, use function "sqrt". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +subtract = _clone_signature(add) +""" +Subtract the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "subtract_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subtract_checked = _clone_signature(add) +""" +Subtract the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "subtract". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.1 Bit-wise functions ========================= +@overload +def bit_wise_and( + x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, + y: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
+@overload +def bit_wise_and( + x: Expression, + y: NumericScalar | ArrayOrChunkedArray[NumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def bit_wise_and( + x: NumericScalar | ArrayOrChunkedArray[NumericScalar], + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def bit_wise_and(*args, **kwargs): + """ + Bit-wise AND the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def bit_wise_not( + x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_not( + x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def bit_wise_not(*args, **kwargs): + """ + Bit-wise negate the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +bit_wise_or = _clone_signature(bit_wise_and) +""" +Bit-wise OR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +bit_wise_xor = _clone_signature(bit_wise_and) +""" +Bit-wise XOR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +`x` is returned if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_left_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left_checked = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_left" for a variant that doesn't fail for an invalid shift amount. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +`x` is returned if `y` (the amount to shift by) is: (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_right_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right_checked = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_right" for a variant that doesn't fail for an invalid shift amount + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.2 Rounding functions ========================= +@overload +def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... +@overload +def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... +@overload +def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ceil(*args, **kwargs): + """ + Round up to the nearest integer. + + Compute the smallest integer value not less in magnitude than `x`. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor = _clone_signature(ceil) +""" +Round down to the nearest integer. + +Compute the largest integer value not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def round( + x: _NumericScalarT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round( + x: _NumericArrayT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... 
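# Illustrative aside, not part of the patched stub: the rounding kernels above in use,
# assuming pyarrow is installed.
import pyarrow as pa
import pyarrow.compute as pc

vals = pa.array([2.5, -1.5, 7.5])
pc.ceil(vals)                                   # -> [3.0, -1.0, 8.0]
pc.round(vals)                                  # default half_to_even -> [2.0, -2.0, 8.0]
pc.round(vals, round_mode="half_towards_zero")  # ties go toward zero -> [2.0, -1.0, 7.0]
pc.round(pa.array([2.25]), ndigits=1)           # fractional digits -> [2.2]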
+@overload +def round( + x: Expression, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round(*args, **kwargs): + """ + Round to a given precision. + + Options are used to control the number of digits and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def round_to_multiple( + x: _NumericScalarT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_to_multiple( + x: _NumericArrayT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_to_multiple( + x: Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_to_multiple(*args, **kwargs): + """ + Round to a given multiple. + + Options are used to control the rounding multiple and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundToMultipleOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +@overload +def round_binary( + x: _NumericScalarT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_binary( + x: _NumericScalarT, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[_NumericScalarT]: ... +@overload +def round_binary( + x: _NumericArrayT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_binary( + x: Expression, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_binary(*args, **kwargs): + """ + Round to the given precision. + + Options are used to control the rounding mode. + Default behavior is to use the half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + s : Array-like or scalar-like + Argument to compute function. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundBinaryOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +trunc = _clone_signature(ceil) +""" +Compute the integral part. + +Compute the nearest integer not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.3 Logarithmic functions ========================= +@overload +def ln( + x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def ln( + x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ln(*args, **kwargs): + """ + Compute natural logarithm. + + Non-positive values return -inf or NaN. 
Null values return null. + Use function "ln_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ln_checked = _clone_signature(ln) +""" +Compute natural logarithm. + +Non-positive values raise an error. Null values return null. +Use function "ln" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10 = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log10_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10_checked = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log10" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p_checked" if you want invalid values to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p_checked = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p" if you want invalid values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2 = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log2_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2_checked = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log2" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def logb( + x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... 
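# Illustrative aside, not part of the patched stub: the logarithmic kernels declared
# in this block, assuming pyarrow is installed.
import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1.0, 10.0, 100.0])
pc.log10(x)                        # -> [0.0, 1.0, 2.0]
pc.ln(pa.array([0.0, -1.0]))       # -> [-inf, nan]; ln_checked would raise instead
pc.logb(pa.array([8.0, 27.0]), 3)  # base-3 logarithm -> [~1.89, 3.0]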
+@overload +def logb( + x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatScalar, + b: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatArray, + b: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression | Any: ... +def logb(*args, **kwargs): + """ + Compute base `b` logarithm. + + Values <= 0 return -inf or NaN. Null values return null. + Use function "logb_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + b : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +logb_checked = _clone_signature(logb) +""" +Compute base `b` logarithm. + +Values <= 0 return -inf or NaN. Null values return null. +Use function "logb" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +b : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +""" +Compute the inverse cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "acos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +acos_checked = _clone_signature(ln) +""" +Compute the inverse cosine. + +Invalid input values raise an error; +to return NaN instead, see "acos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin = _clone_signature(ln) +""" +Compute the inverse sine. + +NaN is returned for invalid input values; +to raise an error instead, see "asin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin_checked = _clone_signature(ln) +""" +Compute the inverse sine. + +Invalid input values raise an error; +to return NaN instead, see "asin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +atan = _clone_signature(ln) +""" +Compute the inverse tangent of x. + +The return value is in the range [-pi/2, pi/2]; +for a full return range [-pi, pi], see "atan2". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos = _clone_signature(ln) +""" +Compute the cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "cos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos_checked = _clone_signature(ln) +""" +Compute the cosine. + +Infinite values raise an error; +to return NaN instead, see "cos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin = _clone_signature(ln) +""" +Compute the sine. + +NaN is returned for invalid input values; +to raise an error instead, see "sin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin_checked = _clone_signature(ln) +""" +Compute the sine. + +Invalid input values raise an error; +to return NaN instead, see "sin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan = _clone_signature(ln) +""" +Compute the tangent. + +NaN is returned for invalid input values; +to raise an error instead, see "tan_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan_checked = _clone_signature(ln) +""" +Compute the tangent. + +Infinite values raise an error; +to return NaN instead, see "tan". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def atan2( + y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def atan2( + y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatArray, + x: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatScalar, + x: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def atan2( + y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def atan2(*args, **kwargs): + """ + Compute the inverse tangent of y/x. + + The return value is in the range [-pi, pi]. + + Parameters + ---------- + y : Array-like or scalar-like + Argument to compute function. + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +# ========================= 2.5 Comparisons functions ========================= +@overload +def equal( + x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def equal( + x: lib.Scalar, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: lib.Scalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: Expression, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def equal(*args, **kwargs): + """ + Compare values for equality (x == y). + + A null on either side emits a null comparison result. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +greater = _clone_signature(equal) +""" +Compare values for ordered inequality (x > y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +greater_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x >= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less = _clone_signature(equal) +""" +Compare values for ordered inequality (x < y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x <= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +not_equal = _clone_signature(equal) +""" +Compare values for inequality (x != y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT], + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT: ... +@overload +def max_element_wise( + *args: Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def max_element_wise(*args, **kwargs): + """ + Find the element-wise maximum value. + + Nulls are ignored (by default) or propagated. + NaN is preferred over null, but not over any valid value. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +min_element_wise = _clone_signature(max_element_wise) +""" +Find the element-wise minimum value. + +Nulls are ignored (by default) or propagated. +NaN is preferred over null, but not over any valid value. + +Parameters +---------- +*args : Array-like or scalar-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.6 Logical functions ========================= +@overload +def and_( + x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def and_( + x: BooleanArray, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: lib.BooleanScalar, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: BooleanArray, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: lib.BooleanScalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: Expression, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: ScalarOrArray[lib.BooleanScalar], + y: ScalarOrArray[lib.BooleanScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ScalarOrArray[lib.BooleanScalar]: ... +def and_(*args, **kwargs): + """ + Logical 'and' boolean values. + + When a null is encountered in either input, a null is output. + For a different null behavior, see function "and_kleene". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +and_kleene = _clone_signature(and_) +""" +Logical 'and' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and null = null +- null and true = null +- false and null = false +- null and false = false +- null and null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and' false is always false. +For a different null behavior, see function "and". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not = _clone_signature(and_) +""" +Logical 'and not' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "and_not_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not_kleene = _clone_signature(and_) +""" +Logical 'and not' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and not null = null +- null and not false = null +- false and not null = false +- null and not true = false +- null and not null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and not' true is always false, as is false +'and not' an unknown value. +For a different null behavior, see function "and_not". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_ = _clone_signature(and_) +""" +Logical 'or' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_kleene = _clone_signature(and_) +""" +Logical 'or' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true or null = true +- null or true = true +- false or null = null +- null or false = null +- null or null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'or' true is always true. +For a different null behavior, see function "or". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +xor = _clone_signature(and_) +""" +Logical 'xor' boolean values. + +When a null is encountered in either input, a null is output. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def invert( + x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def invert( + x: _BooleanArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _BooleanArrayT: ... +@overload +def invert( + x: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def invert(*args, **kwargs): + """ + Invert boolean values. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.10 String predicates ========================= +@overload +def ascii_is_alnum( + strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def ascii_is_alnum( + strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def ascii_is_alnum( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_is_alnum(*args, **kwargs): + """ + Classify strings as ASCII alphanumeric. + + For each string in `strings`, emit true iff the string is non-empty + and consists only of alphanumeric ASCII characters. Null strings emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" +ascii_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alnum = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphanumeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphanumeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_digit = _clone_signature(ascii_is_alnum) +""" +Classify strings as digits. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of Unicode digits. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_numeric = _clone_signature(ascii_is_alnum) +""" +Classify strings as numeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of numeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +string_is_ascii = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII. + +For each string in `strings`, emit true iff the string consists only +of ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.11 String transforms ========================= +@overload +def ascii_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def ascii_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def ascii_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_capitalize(*args, **kwargs): + """ + Capitalize the first character of ASCII input. 
+
+    For each string in `strings`, return a capitalized version.
+
+    This function assumes the input is fully ASCII. If it may contain
+    non-ASCII characters, use "utf8_capitalize" instead.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+ascii_lower = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input to lowercase.
+
+For each string in `strings`, return a lowercase version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_lower" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_reverse = _clone_signature(ascii_capitalize)
+"""
+Reverse ASCII input.
+
+For each ASCII string in `strings`, return a reversed version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_reverse" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_swapcase = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input by inverting casing.
+
+For each string in `strings`, return a string with opposite casing.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_swapcase" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_title = _clone_signature(ascii_capitalize)
+"""
+Titlecase each word of ASCII input.
+
+For each string in `strings`, return a titlecased version.
+Each word in the output will start with an uppercase character and its
+remaining characters will be lowercase.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_title" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_upper = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input to uppercase.
+
+For each string in `strings`, return an uppercase version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_upper" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+@overload
+def binary_length(
+    strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int32Scalar: ...
+@overload
+def binary_length(
+    strings: lib.LargeBinaryScalar | lib.LargeStringScalar,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+@overload
+def binary_length(
+    strings: lib.BinaryArray
+    | lib.StringArray
+    | lib.ChunkedArray[lib.BinaryScalar]
+    | lib.ChunkedArray[lib.StringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Array: ...
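# Illustrative aside, not part of the patched stub: the string predicates and
# transforms stubbed above at runtime, assuming pyarrow is installed.
import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["Arrow", "data", None])
pc.ascii_is_upper(s)   # -> [false, false, null]
pc.utf8_is_alpha(s)    # -> [true, true, null]
pc.ascii_upper(s)      # -> ["ARROW", "DATA", null]
pc.binary_length(s)    # byte length per element -> Int32Array [5, 4, null]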
+@overload +def binary_length( + strings: lib.LargeBinaryArray + | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeBinaryScalar] + | lib.ChunkedArray[lib.LargeStringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def binary_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_length(*args, **kwargs): + """ + Compute string lengths. + + For each string in `strings`, emit its length of bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: int, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[_StringOrBinaryScalarT]: ... +@overload +def binary_repeat( + strings: _StringOrBinaryArrayT, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_repeat( + strings: Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_repeat(*args, **kwargs): + """ + Repeat a binary string. + + For each binary string in `strings`, return a replicated version. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + num_repeats : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def binary_replace_slice( + strings: _StringOrBinaryScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_replace_slice( + strings: _StringOrBinaryArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_replace_slice(*args, **kwargs): + """ + Replace a slice of a binary string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +@overload +def binary_reverse( + strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT: ... +@overload +def binary_reverse( + strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryArrayT: ... +@overload +def binary_reverse( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def binary_reverse(*args, **kwargs): + """ + Reverse binary input. + + For each binary string in `strings`, return a reversed version. + + This function reverses the binary data at a byte-level. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def replace_substring( + strings: _StringScalarT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def replace_substring( + strings: _StringArrayT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def replace_substring( + strings: Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def replace_substring(*args, **kwargs): + """ + Replace matching non-overlapping substrings with replacement. + + For each string in `strings`, replace non-overlapping substrings that match + the given literal `pattern` with the given `replacement`. + If `max_replacements` is given and not equal to -1, it limits the + maximum amount replacements per input, counted from the left. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +replace_substring_regex = _clone_signature(replace_substring) +""" +Replace matching non-overlapping substrings with replacement. + +For each string in `strings`, replace non-overlapping substrings that match +the given regular expression `pattern` with the given `replacement`. +If `max_replacements` is given and not equal to -1, it limits the +maximum amount replacements per input, counted from the left. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +replacement : str + What to replace the pattern with. +max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). +options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def utf8_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def utf8_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def utf8_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def utf8_capitalize(*args, **kwargs): + """ + Capitalize the first character of input. + + For each string in `strings`, return a capitalized version, + with the first character uppercased and the others lowercased. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def utf8_length( + strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar: ... +@overload +def utf8_length( + strings: lib.LargeStringScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def utf8_length( + strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def utf8_length( + strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def utf8_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_length(*args, **kwargs): + """ + Compute UTF8 string lengths. + + For each string in `strings`, emit its length in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +utf8_lower = _clone_signature(utf8_capitalize) +""" +Transform input to lowercase. + +For each string in `strings`, return a lowercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def utf8_replace_slice( + strings: _StringScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_replace_slice( + strings: _StringArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_replace_slice(*args, **kwargs): + """ + Replace a slice of a string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in UTF8 characters. + Null values emit null. 
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    start : int
+        Index to start slicing at (inclusive).
+    stop : int
+        Index to stop slicing at (exclusive).
+    replacement : str
+        What to replace the slice with.
+    options : pyarrow.compute.ReplaceSliceOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+utf8_reverse = _clone_signature(utf8_capitalize)
+"""
+Reverse input.
+
+For each string in `strings`, return a reversed version.
+
+This function operates on Unicode codepoints, not grapheme
+clusters. Hence, it will not correctly reverse grapheme clusters
+composed of multiple codepoints.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_swapcase = _clone_signature(utf8_capitalize)
+"""
+Transform input lowercase characters to uppercase and uppercase characters to lowercase.
+
+For each string in `strings`, return an opposite case version.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_title = _clone_signature(utf8_capitalize)
+"""
+Titlecase each word of input.
+
+For each string in `strings`, return a titlecased version.
+Each word in the output will start with an uppercase character and its
+remaining characters will be lowercase.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_upper = _clone_signature(utf8_capitalize)
+"""
+Transform input to uppercase.
+
+For each string in `strings`, return an uppercase version.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+# ========================= 2.12 String padding =========================
+@overload
+def ascii_center(
+    strings: _StringScalarT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT: ...
+@overload
+def ascii_center(
+    strings: _StringArrayT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringArrayT: ...
+@overload
+def ascii_center(
+    strings: Expression,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def ascii_center(*args, **kwargs):
+    """
+    Center strings by padding with a given character.
+
+    For each string in `strings`, emit a centered string by padding both sides
+    with the given ASCII character.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    width : int
+        Desired string length.
+    padding : str, default " "
+        What to pad the string with. Should be one byte or codepoint.
+ lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_center = _clone_signature(ascii_center) +""" +Center strings by padding with a given character. + +For each string in `strings`, emit a centered string by padding both sides +with the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. 
Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.13 String trimming ========================= +@overload +def ascii_ltrim( + strings: _StringScalarT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim( + strings: _StringArrayT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim( + strings: Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim(*args, **kwargs): + """ + Trim leading characters. + + For each string in `strings`, remove any leading characters + from the `characters` option (as given in TrimOptions). + Null values emit null. + Both the `strings` and the `characters` are interpreted as + ASCII; to trim non-ASCII characters, use `utf8_ltrim`. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + characters : str + Individual characters to be trimmed from the string. + options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_rtrim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. 
+ +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_trim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim = _clone_signature(ascii_ltrim) +""" +Trim leading characters. + +For each string in `strings`, remove any leading characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ascii_ltrim_whitespace( + strings: _StringScalarT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim_whitespace( + strings: _StringArrayT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim_whitespace( + strings: Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim_whitespace(*args, **kwargs): + """ + Trim leading ASCII whitespace characters. + + For each string in `strings`, emit a string with leading ASCII whitespace + characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode + whitespace characters. Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
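+
+    Examples
+    --------
+    A minimal usage sketch (results shown via ``to_pylist`` for brevity):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.ascii_ltrim_whitespace(pa.array(["  hello ", "world"])).to_pylist()
+    ['hello ', 'world']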
+ """ + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with trailing ASCII whitespace +characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with leading and trailing ASCII +whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading whitespace characters. + +For each string in `strings`, emit a string with leading whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing whitespace characters. + +For each string in `strings`, emit a string with trailing whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing whitespace characters. + +For each string in `strings`, emit a string with leading and trailing +whitespace characters removed, where whitespace characters are defined +by the Unicode standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.14 String splitting ========================= +@overload +def ascii_split_whitespace( + strings: _StringScalarT, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringScalarT]: ... +@overload +def ascii_split_whitespace( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def ascii_split_whitespace( + strings: Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
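+# Illustrative sketch (assuming `import pyarrow as pa` and `import pyarrow.compute as pc`):
+# pc.ascii_split_whitespace(pa.array(["a  b", "c d"])).to_pylist() gives
+# [["a", "b"], ["c", "d"]]; each input string maps to a list of tokens.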
+def ascii_split_whitespace(*args, **kwargs): + """ + Split string according to any ASCII whitespace. + + Split each string according any non-zero length sequence of ASCII + whitespace characters. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def split_pattern( + strings: _StringOrBinaryScalarT, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringOrBinaryScalarT]: ... +@overload +def split_pattern( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def split_pattern( + strings: Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def split_pattern(*args, **kwargs): + """ + Split string according to separator. + + Split each string according to the exact `pattern` defined in + SplitPatternOptions. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitPatternOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +split_pattern_regex = _clone_signature(split_pattern) +""" +Split string according to regex pattern. + +Split each string according to the regex `pattern` defined in +SplitPatternOptions. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitPatternOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + String pattern to split on. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. 
+options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) +""" +Split string according to any Unicode whitespace. + +Split each string according any non-zero length sequence of Unicode +whitespace characters. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.15 String component extraction ========================= +@overload +def extract_regex( + strings: StringOrBinaryScalar, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def extract_regex( + strings: StringOrBinaryArray, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def extract_regex( + strings: Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def extract_regex(*args, **kwargs): + """ + Extract substrings captured by a regex pattern. + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. If the input is null or the + regular expression fails matching, a null output value is emitted. + + Regular expression matching is done using the Google RE2 library. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: + """ + Join a list of strings together with a separator. + + Concatenate the strings in `list`. The `separator` is inserted + between each given string. + Any null input and any null `list` element emits a null output. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + separator : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
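+
+    Examples
+    --------
+    A minimal usage sketch:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.binary_join(pa.array([["a", "b"], ["c"]]), "-").to_pylist()
+    ['a-b', 'c']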
+ """ + +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryArrayT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_join_element_wise( + *strings: Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_join_element_wise(*args, **kwargs): + """ + Join string arguments together, with the last argument as separator. + + Concatenate the `strings` except for the last one. The last argument + in `strings` is inserted between each given string. + Any null separator element emits a null output. Null elements either + emit a null (the default), are skipped, or replaced with a given string. + + Parameters + ---------- + *strings : Array-like or scalar-like + Argument to compute function. + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + options : pyarrow.compute.JoinOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.17 String Slicing ========================= +@overload +def binary_slice( + strings: _BinaryScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT: ... +@overload +def binary_slice( + strings: _BinaryArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryArrayT: ... +@overload +def binary_slice( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_slice(*args, **kwargs): + """ + Slice binary string. + + For each binary string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + bytes. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
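+
+    Examples
+    --------
+    A minimal usage sketch:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> pc.binary_slice(pa.scalar(b"caterpillar"), start=0, stop=3).as_py()
+    b'cat'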
+ """ + +@overload +def utf8_slice_codeunits( + strings: _StringScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_slice_codeunits( + strings: _StringArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_slice_codeunits( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_slice_codeunits(*args, **kwargs): + """ + Slice string. + + For each string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + UTF8 codeunits. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.18 Containment tests ========================= +@overload +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def count_substring( + strings: lib.LargeStringScalar | lib.LargeBinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def count_substring( + strings: lib.StringArray + | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] + | lib.ChunkedArray[lib.BinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def count_substring( + strings: lib.LargeStringArray + | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] + | lib.ChunkedArray[lib.LargeBinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def count_substring( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def count_substring(*args, **kwargs): + """ + Count occurrences of substring. + + For each string in `strings`, emit the number of occurrences of the given + literal pattern. + Null inputs emit null. The pattern must be given in MatchSubstringOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. 
+ pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +count_substring_regex = _clone_signature(count_substring) +""" +Count occurrences of substring. + +For each string in `strings`, emit the number of occurrences of the given +regular expression pattern. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ends_with( + strings: StringScalar | BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def ends_with( + strings: StringArray | BinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def ends_with( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ends_with(*args, **kwargs): + """ + Check if strings end with a literal pattern. + + For each string in `strings`, emit true iff it ends with a given pattern. + The pattern must be given in MatchSubstringOptions. + If ignore_case is set, only simple case folding is performed. + + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +find_substring = _clone_signature(count_substring) +""" +Find first occurrence of substring. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +find_substring_regex = _clone_signature(count_substring) +""" +Find location of first match of regex pattern. 
+ +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def index_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def index_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def index_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def index_in(*args, **kwargs): + """ + Return index of each element in a set of values. + + For each element in `values`, return its index in a given set of + values, or null if it is not found there. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_in(*args, **kwargs): + """ + Find each element in a set of values. + + For each element in `values`, return true if it is found in a given + set of values, false otherwise. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +match_like = _clone_signature(ends_with) +""" +Match strings against SQL-style LIKE pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. '%' will match any number of characters, '_' will +match exactly one character, and any other character matches itself. +To match a literal '%', '_', or '\', precede the character with a backslash. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring = _clone_signature(ends_with) +""" +Match strings against literal pattern. + +For each string in `strings`, emit true iff it contains a given pattern. +Null inputs emit null. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring_regex = _clone_signature(ends_with) +""" +Match strings against regex pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +starts_with = _clone_signature(ends_with) +""" +Check if strings start with a literal pattern. + +For each string in `strings`, emit true iff it starts with a given pattern. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. 
+options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.19 Categorizations ========================= +@overload +def is_finite( + values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_finite( + values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_finite( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_finite(*args, **kwargs): + """ + Return true if value is finite. + + For each input value, emit true iff the value is finite + (i.e. neither NaN, inf, nor -inf). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +is_inf = _clone_signature(is_finite) +""" +Return true if infinity. + +For each input value, emit true iff the value is infinite (inf or -inf). + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +is_nan = _clone_signature(is_finite) +""" +Return true if NaN. + +For each input value, emit true iff the value is NaN. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_null( + values: lib.Scalar, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_null( + values: lib.Array | lib.ChunkedArray, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_null( + values: Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_null(*args, **kwargs): + """ + Return true if null (and optionally NaN). + + For each input value, emit true iff the value is null. + True may also be emitted for NaN values by setting the `nan_is_null` flag. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + options : pyarrow.compute.NullOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_valid( + values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_valid( + values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_valid( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_valid(*args, **kwargs): + """ + Return true if non-null. + + For each input value, emit true iff the value is valid (i.e. non-null). 
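+    For example, applied to the array ``[1, null, 3]`` this emits ``[true, false, true]``.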
+ + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +true_unless_null = _clone_signature(is_valid) +""" +Return true if non-null, else return null. + +For each input value, emit true iff the value +is valid (non-null), otherwise emit null. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): + """ + Choose values based on multiple conditions. + + `cond` must be a struct of Boolean values. `cases` can be a mix + of scalar and array arguments (of any type, but all must be the + same type or castable to a common type), with either exactly one + datum per child of `cond`, or one more `cases` than children of + `cond` (in which case we have an "else" value). + + Each row of the output will be the corresponding value of the + first datum in `cases` for which the corresponding child of `cond` + is true, or otherwise the "else" value (if given), or null. + + Essentially, this implements a switch-case or if-else, if-else... statement. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + *cases : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): + """ + Choose values from several arrays. + + For each row, the value of the first argument is used as a 0-based index + into the list of `values` arrays (i.e. index 0 selects the first of the + `values` arrays). The output value is the corresponding value of the + selected argument. + + If an index is null, the output will be null. + + Parameters + ---------- + indices : Array-like or scalar-like + Argument to compute function. + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: + """ + Select the first non-null value. + + Each row of the output will be the value from the first corresponding input + for which the value is not null. If all inputs are null in a row, the output + will be null. + + Parameters + ---------- + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +fill_null = coalesce +"""Replace each null element in values with a corresponding +element from fill_value. + +If fill_value is scalar-like, then every null element in values +will be replaced with fill_value. If fill_value is array-like, +then the i-th element in values will be replaced with the i-th +element in fill_value. + +The fill_value's type must be the same as that of values, or it +must be able to be implicitly casted to the array's type. + +This is an alias for :func:`coalesce`. 
+ +Parameters +---------- +values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. +fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. + +Returns +------- +result : depends on inputs + Values with all null elements replaced + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array([1, 2, None, 3], type=pa.int8()) +>>> fill_value = pa.scalar(5, type=pa.int8()) +>>> arr.fill_null(fill_value) + +[ + 1, + 2, + 5, + 3 +] +>>> arr = pa.array([1, 2, None, 4, None]) +>>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + +[ + 1, + 2, + 30, + 4, + 50 +] +""" + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: + """ + Choose values based on a condition. + + `cond` must be a Boolean scalar/ array. + `left` or `right` must be of the same type scalar/ array. + `null` values in `cond` will be promoted to the output. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + left : Array-like or scalar-like + Argument to compute function. + right : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.21 Structural transforms ========================= + +@overload +def list_value_length( + lists: _ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def list_value_length( + lists: _LargeListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def list_value_length( + lists: ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array: ... +@overload +def list_value_length( + lists: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def list_value_length(*args, **kwargs): + """ + Compute list lengths. + + `lists` must have a list-like type. + For each non-null value in `lists`, its length is emitted. + Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def make_struct( + *args: lib.Scalar, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def make_struct( + *args: lib.Array | lib.ChunkedArray, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def make_struct( + *args: Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
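+# Illustrative sketch (assuming `import pyarrow as pa` and `import pyarrow.compute as pc`):
+# pc.make_struct(pa.array([1, 2]), pa.array(["a", "b"]), field_names=["x", "y"])
+# returns a StructArray with two fields named "x" and "y".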
+def make_struct(*args, **kwargs): + """ + Wrap Arrays into a StructArray. + + Names of the StructArray's fields are + specified through MakeStructOptions. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + options : pyarrow.compute.MakeStructOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.22 Conversions ========================= +@overload +def ceil_temporal( + timestamps: _TemporalScalarT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT: ... +@overload +def ceil_temporal( + timestamps: _TemporalArrayT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalArrayT: ... +@overload +def ceil_temporal( + timestamps: Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ceil_temporal(*args, **kwargs): + """ + Round temporal values up to nearest multiple of specified time unit. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. 
By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values down to nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +round_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values to the nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def cast( + arr: lib.Scalar, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT]: ... +@overload +def cast( + arr: lib.Array, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def cast( + arr: lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +def cast(*args, **kwargs): + """ + Cast array values to another data type. Can also be invoked as an array + instance method. 
+ + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp("ms")) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp("ms")).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast("timestamp[ms]") + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast("timestamp[ms]").type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + +@overload +def strftime( + timestamps: TemporalScalar, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar: ... +@overload +def strftime( + timestamps: TemporalArray, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringArray: ... +@overload +def strftime( + timestamps: Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strftime(*args, **kwargs): + """ + Format temporal values according to a format string. + + For each input value, emit a formatted string. + The time format string and locale can be set using StrftimeOptions. + The output precision of the "%S" (seconds) format code depends on + the input time precision: it is an integer for timestamps with + second precision, a real number with the required number of fractional + digits for higher precisions. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database, or if the specified locale + does not exist on this system. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + options : pyarrow.compute.StrftimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def strptime( + strings: StringScalar, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def strptime( + strings: StringArray, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... 
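+# Illustrative usage sketch (not part of the stub API; the sample values and
+# format string below are assumptions): strftime formats timestamps as strings
+# and strptime parses such strings back into timestamps, e.g.
+#   >>> import pyarrow as pa, pyarrow.compute as pc
+#   >>> from datetime import datetime
+#   >>> s = pc.strftime(pa.array([datetime(2023, 1, 15)]), format="%Y/%m/%d")
+#   >>> pc.strptime(s, format="%Y/%m/%d", unit="s")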
+@overload +def strptime( + strings: Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strptime(*args, **kwargs): + """ + Parse timestamps. + + For each string in `strings`, parse it as a timestamp. + The timestamp unit and the expected string pattern must be given + in StrptimeOptions. Null inputs emit null. If a non-null string + fails parsing, an error is returned by default. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + options : pyarrow.compute.StrptimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.23 Temporal component extraction ========================= +@overload +def day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +@overload +def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def day(*args, **kwargs): + """ + Extract day number. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def day_of_week( + values: TemporalScalar, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def day_of_week( + values: TemporalArray, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def day_of_week( + values: Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def day_of_week(*args, **kwargs): + """ + Extract day of the week number. + + By default, the week starts on Monday represented by 0 and ends on Sunday + represented by 6. + `DayOfWeekOptions.week_start` can be used to set another starting day using + the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). + Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +day_of_year = _clone_signature(day) +""" +Extract day of year number. + +January 1st maps to day number 1, February 1st to 32, etc. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def hour( + values: lib.TimestampArray[Any] + | lib.Time32Array[Any] + | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def hour( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def hour(*args, **kwargs): + """ + Extract hour value. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_dst( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_dst( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def is_dst(*args, **kwargs): + """ + Extracts if currently observing daylight savings. + + IsDaylightSavings returns true if a timestamp has a daylight saving + offset in the given timezone. + Null values emit null. + An error is returned if the values do not have a defined timezone. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def iso_week( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def iso_week( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def iso_week( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def iso_week(*args, **kwargs): + """ + Extract ISO week of year number. + + First ISO week has the majority (4 or more) of its days in January. + ISO week starts on Monday. 
The week number starts with 1 and can run + up to 53. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +iso_year = _clone_signature(iso_week) +""" +Extract ISO year number. + +First week of an ISO year has the majority (4 or more) of its days in January. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_leap_year( + values: lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_leap_year( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_leap_year(*args, **kwargs): + """ + Extract if year is a leap year. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +microsecond = _clone_signature(iso_week) +""" +Extract microsecond values. + +Microsecond returns number of microseconds since the last full millisecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +millisecond = _clone_signature(iso_week) +""" +Extract millisecond values. + +Millisecond returns number of milliseconds since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +minute = _clone_signature(iso_week) +""" +Extract minute values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +month = _clone_signature(day_of_week) +""" +Extract month number. + +Month is encoded as January=1, December=12. +Null values emit null. 
+An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +nanosecond = _clone_signature(hour) +""" +Extract nanosecond values. + +Nanosecond returns number of nanoseconds since the last full microsecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarter = _clone_signature(day_of_week) +""" +Extract quarter of year number. + +First quarter maps to 1 and forth quarter maps to 4. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +second = _clone_signature(hour) +""" +Extract second values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subsecond = _clone_signature(hour) +""" +Extract subsecond values. + +Subsecond returns the fraction of a second since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +us_week = _clone_signature(iso_week) +""" +Extract US week of year number. + +First US week has the majority (4 or more) of its days in January. +US week starts on Monday. The week number starts with 1 and can run +up to 53. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +us_year = _clone_signature(iso_week) +""" +Extract US epidemiological year number. + +First week of US epidemiological year has the majority (4 or more) of +it's days in January. Last week of US epidemiological year has the +year's last Wednesday in it. US epidemiological week starts on Sunday. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +year = _clone_signature(iso_week) +""" +Extract year number. + +Null values emit null. 
+An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def week( + values: lib.TimestampScalar, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def week( + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def week( + values: Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def week(*args, **kwargs): + """ + Extract week of year number. + + First week has the majority (4 or more) of its days in January. + Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using + DayOfWeekOptions.count_from_zero. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). + first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + options : pyarrow.compute.WeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def year_month_day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar: ... +@overload +def year_month_day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... +@overload +def year_month_day( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def year_month_day(*args, **kwargs): + """ + Extract (year, month, day) struct. + + Null values emit null. + An error is returned in the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of days and milliseconds between two timestamps. + + Returns the number of days and milliseconds from `start` to `end`. 
+    That is, first the difference in days is computed as if both
+    timestamps were truncated to the day, then the difference between the times
+    of the two timestamps is computed as if both times were truncated to the
+    millisecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def days_between(
+    start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array:
+    """
+    Compute the number of days between two timestamps.
+
+    Returns the number of day boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the day.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+hours_between = _clone_signature(days_between)
+"""
+Compute the number of hours between two timestamps.
+
+Returns the number of hour boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the hour.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+microseconds_between = _clone_signature(days_between)
+"""
+Compute the number of microseconds between two timestamps.
+
+Returns the number of microsecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the microsecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+milliseconds_between = _clone_signature(days_between)
+"""
+Compute the number of millisecond boundaries between two timestamps.
+
+Returns the number of millisecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the millisecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+minutes_between = _clone_signature(days_between)
+"""
+Compute the number of minute boundaries between two timestamps.
+
+Returns the number of minute boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the minute.
+Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: + """ + Compute the number of months, days and nanoseconds between two timestamps. + + Returns the number of months, days, and nanoseconds from `start` to `end`. + That is, first the difference in months is computed as if both timestamps + were truncated to the months, then the difference between the days + is computed, and finally the difference between the times of the two + timestamps is computed as if both times were truncated to the nanosecond. + Null values return null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of months between two timestamps. + + Returns the number of month boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the month. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +nanoseconds_between = _clone_signature(days_between) +""" +Compute the number of nanoseconds between two timestamps. + +Returns the number of nanosecond boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the nanosecond. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarters_between = _clone_signature(days_between) +""" +Compute the number of quarters between two timestamps. + +Returns the number of quarter start boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the quarter. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +seconds_between = _clone_signature(days_between) +""" +Compute the number of seconds between two timestamps. + +Returns the number of second boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the second. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: + """ + Compute the number of weeks between two timestamps. + + Returns the number of week boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the week. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +years_between = _clone_signature(days_between) +""" +Compute the number of years between two timestamps. + +Returns the number of year boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the year. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.25 Timezone handling ========================= +@overload +def assume_timezone( + timestamps: lib.TimestampScalar, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def assume_timezone( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def assume_timezone( + timestamps: Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def assume_timezone(*args, **kwargs): + """ + Convert naive timestamp to timezone-aware timestamp. + + Input timestamps are assumed to be relative to the timezone given in the + `timezone` option. They are converted to UTC-relative timestamps and + the output type has its timezone set to the value of the `timezone` + option. Null values emit null. + This function is meant to be used when an external system produces + "timezone-naive" timestamps which need to be converted to + "timezone-aware" timestamps. An error is returned if the timestamps + already have a defined timezone. 
+ + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + options : pyarrow.compute.AssumeTimezoneOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def local_timestamp( + timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar: ... +@overload +def local_timestamp( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def local_timestamp( + timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def local_timestamp(*args, **kwargs): + """ + Convert timestamp to a timezone-naive local time timestamp. + + LocalTimestamp converts timezone-aware timestamp to local timestamp + of the given timestamp's timezone and removes timezone metadata. + Alternative name for this timestamp is also wall clock time. + If input is in UTC or without timezone, then unchanged input values + without timezone metadata are returned. + Null values emit null. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. + + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +@overload +def cumulative_sum( + values: _NumericArrayT, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def cumulative_sum( + values: Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
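+# Illustrative sketch (assumed sample values, not part of the stub API):
+# cumulative_sum returns the running total over its input, e.g.
+#   >>> import pyarrow as pa, pyarrow.compute as pc
+#   >>> pc.cumulative_sum(pa.array([1, 2, 3]))  # -> [1, 3, 6]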
+def cumulative_sum(*args, **kwargs): + """ + Compute the cumulative sum over a numeric input. + + `values` must be numeric. Return an array/chunked array which is the + cumulative sum computed over `values`. Results will wrap around on + integer overflow. Use function "cumulative_sum_checked" if you want + overflow to return an error. The default start is 0. + + Parameters + ---------- + values : Array-like + Argument to compute function. + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +cumulative_sum_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative sum over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative sum computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_sum". The default start is 0. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. Results will wrap around on +integer overflow. Use function "cumulative_prod_checked" if you want +overflow to return an error. The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_prod". The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. 
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_max = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative max over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative max computed over `values`. The default start is the minimum
+value of input type (so that any other value will replace the
+start as the new maximum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_min = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative min over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative min computed over `values`. The default start is the maximum
+value of input type (so that any other value will replace the
+start as the new minimum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_mean = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative mean over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative mean computed over `values`.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+# ========================= 3.2 Associative transforms =========================
+
+@overload
+def dictionary_encode(
+    array: _ScalarOrArrayT,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT: ...
+@overload
+def dictionary_encode(
+    array: Expression,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ...
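+# Illustrative sketch (assumed sample values, not part of the stub API):
+# unique drops duplicates and value_counts tallies them, e.g.
+#   >>> import pyarrow as pa, pyarrow.compute as pc
+#   >>> pc.unique(pa.array(["a", "b", "a"]))        # -> ["a", "b"]
+#   >>> pc.value_counts(pa.array(["a", "b", "a"]))  # -> struct<values, counts>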
+@overload +def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +@overload +def value_counts( + array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... +@overload +def value_counts( + array: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... + +# ========================= 3.3 Selections ========================= +@overload +def array_filter( + array: _ArrayT, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_filter( + array: Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def array_take( + array: _ArrayT, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_take( + array: Expression, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def drop_null( + input: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... + +filter = array_filter +take = array_take +""" +Select values (or records) from array- or table-like data given integer +selection indices. + +The result will be of the same type(s) as the input, with elements taken +from the input array (or record batch / table fields) at the given +indices. If an index is null then the corresponding value in the output +will be null. + +Parameters +---------- +data : Array, ChunkedArray, RecordBatch, or Table +indices : Array, ChunkedArray + Must be of integer type +boundscheck : boolean, default True + Whether to boundscheck the indices. If False and there is an out of + bounds index, will likely cause the process to crash. +memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +Returns +------- +result : depends on inputs + Selected values for the given indices + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array(["a", "b", "c", None, "e", "f"]) +>>> indices = pa.array([0, None, 4, 3]) +>>> arr.take(indices) + +[ + "a", + null, + "e", + null +] +""" + +# ========================= 3.4 Containment tests ========================= +@overload +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... 
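+# Illustrative sketch (assumed sample values, not part of the stub API):
+# indices_nonzero emits the positions of values that are neither zero,
+# false nor null, e.g.
+#   >>> import pyarrow as pa, pyarrow.compute as pc
+#   >>> pc.indices_nonzero(pa.array([0, 1, 0, 2]))  # -> [1, 3] (uint64)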
+@overload +def indices_nonzero( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def indices_nonzero(*args, **kwargs): + """ + Return the indices of the values in the array that are non-zero. + + For each input value, check if it's zero, false or null. Emit the index + of the value in the array if it's none of the those. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.5 Sorts and partitions ========================= +@overload +def array_sort_indices( + array: lib.Array | lib.ChunkedArray, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def array_sort_indices( + array: Expression, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def array_sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array. + + This function computes an array of indices that define a stable sort + of the input array. By default, Null values are considered greater + than any other value and are therefore sorted at the end of the array. + For floating-point types, NaNs are considered greater than any + other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in ArraySortOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.ArraySortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def partition_nth_indices( + array: lib.Array | lib.ChunkedArray, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def partition_nth_indices( + array: Expression, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def partition_nth_indices(*args, **kwargs): + """ + Return the indices that would partition an array around a pivot. + + This functions computes an array of indices that define a non-stable + partial sort of the input array. + + The output is such that the `N`'th index points to the `N`'th element + of the input in sorted order, and all indices before the `N`'th point + to elements in the input less or equal to elements at or after the `N`'th. + + By default, null values are considered greater than any other value + and are therefore partitioned towards the end of the array. + For floating-point types, NaNs are considered greater than any + other non-null value, but smaller than null values. + + The pivot index `N` must be given in PartitionNthOptions. 
+ The handling of nulls and NaNs can also be changed in PartitionNthOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.PartitionNthOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: + """ + Compute ordinal ranks of an array (1-based). + + This function computes a rank of the input array. + By default, null values are considered greater than any other value and + are therefore sorted at the end of the input. For floating-point types, + NaNs are considered greater than any other non-null value, but smaller + than null values. The default tiebreaker is to assign ranks in order of + when ties appear in the input. + + The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + options : pyarrow.compute.RankOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def select_k_unstable( + input: lib.Array | lib.ChunkedArray, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def select_k_unstable( + input: Expression, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def select_k_unstable(*args, **kwargs): + """ + Select the indices of the first `k` ordered elements from the input. + + This function selects an array of indices of the first `k` ordered elements + from the `input` array, record batch or table specified in the column keys + (`options.sort_keys`). Output is not guaranteed to be stable. 
+ Null values are considered greater than any other value and are + therefore ordered at the end. For floating-point types, NaNs are considered + greater than any other non-null value, but smaller than null values. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + options : pyarrow.compute.SelectKOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def sort_indices( + input: Expression, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array, record batch or table. + + This function computes an array of indices that define a stable sort + of the input array, record batch or table. By default, null values are + considered greater than any other value and are therefore sorted at the + end of the input. For floating-point types, NaNs are considered greater + than any other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in SortOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.SortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.6 Structural transforms ========================= +@overload +def list_element( + lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def list_element( + lists: lib.Array[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def list_element( + lists: lib.ChunkedArray[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... 
+@overload +def list_element( + lists: ListScalar[_DataTypeT], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _DataTypeT: ... +def list_element(*args, **kwargs): + """ + Compute elements using of nested list values using an index. + + `lists` must have a list-like type. + For each value in each list of `lists`, the element at `index` + is emitted. Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + index : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def list_flatten( + lists: Expression, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def list_flatten( + lists: ArrayOrChunkedArray[ListScalar[Any]], + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any]: ... +def list_flatten(*args, **kwargs): + """ + Flatten list values. + + `lists` must have a list-like type (lists, list-views, and + fixed-size lists). + Return an array with the top list level flattened unless + `recursive` is set to true in ListFlattenOptions. When that + is that case, flattening happens recursively until a non-list + array is formed. + + Null list values do not emit anything to the output. + + Parameters + ---------- + lists : Array-like + Argument to compute function. + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + options : pyarrow.compute.ListFlattenOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def list_parent_indices( + lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def list_parent_indices( + lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +def list_parent_indices(*args, **kwargs): + """ + Compute parent indices of nested list values. + + `lists` must have a list-like or list-view type. + For each value in each list of `lists`, the top-level list index + is emitted. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def list_slice( + lists: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def list_slice( + lists: ArrayOrChunkedArray[Any], + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any]: ... +def list_slice(*args, **kwargs): + """ + Compute slice of list-like array. + + `lists` must have a list-like type. + For each list element, compute a slice, returning a new list array. + A variable or fixed size list array is returned, depending on options. 
+ + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + options : pyarrow.compute.ListSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Find the items corresponding to a given key in a Map. + + For a given query key (passed via MapLookupOptions), extract + either the FIRST, LAST or ALL items from a Map that have + matching keys. + + Parameters + ---------- + container : Array-like or scalar-like + Argument to compute function. + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + options : pyarrow.compute.MapLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Extract children of a struct or union by index. + + Given a list of indices (passed via StructFieldOptions), extract + the child array or scalar with the given child index, recursively. + + For union inputs, nulls are emitted for union values that reference + a different child than specified. Also, the indices are always + in physical order, not logical type codes - for example, the first + child is always index 0. + + An empty list of indices returns the argument unchanged. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + options : pyarrow.compute.StructFieldOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values backward to fill null slots. + + Given an array, propagate next valid observation backward to previous valid + or nothing if all next values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values forward to fill null slots. 
+ + Given an array, propagate last valid observation forward to next valid + or nothing if all previous values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def replace_with_mask( + values, + mask: list[bool] | list[bool | None] | BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +): + """ + Replace items selected with a mask. + + Given an array and a boolean mask (either scalar or of equal length), + along with replacement values (either scalar or array), + each element of the array for which the corresponding mask element is + true will be replaced by the next value from the replacements, + or with null if the mask is null. + Hence, for replacement arrays, len(replacements) == sum(mask == true). + + Parameters + ---------- + values : Array-like + Argument to compute function. + mask : Array-like + Argument to compute function. + replacements : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.7 Pairwise functions ========================= +@overload +def pairwise_diff( + input: _NumericOrTemporalArrayT, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def pairwise_diff( + input: Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def pairwise_diff(*args, **kwargs): + """ + Compute first order difference of an array. + + Computes the first order difference of an array, It internally calls + the scalar function "subtract" to compute + differences, so its + behavior and supported types are the same as + "subtract". The period can be specified in :struct:`PairwiseOptions`. + + Results will wrap around on integer overflow. Use function + "pairwise_diff_checked" if you want overflow to return an error. + + Parameters + ---------- + input : Array-like + Argument to compute function. + period : int, default 1 + Period for applying the period function. + options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +pairwise_diff_checked = _clone_signature(pairwise_diff) +""" +Compute first order difference of an array. + +Computes the first order difference of an array, It internally calls +the scalar function "subtract_checked" (or the checked variant) to compute +differences, so its behavior and supported types are the same as +"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. + +This function returns an error on overflow. For a variant that doesn't +fail on overflow, use function "pairwise_diff". + +Parameters +---------- +input : Array-like + Argument to compute function. +period : int, default 1 + Period for applying the period function. +options : pyarrow.compute.PairwiseOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" diff --git a/python/stubs/csv.pyi b/python/stubs/csv.pyi new file mode 100644 index 00000000000..510229d7e72 --- /dev/null +++ b/python/stubs/csv.pyi @@ -0,0 +1,27 @@ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/stubs/cuda.pyi b/python/stubs/cuda.pyi new file mode 100644 index 00000000000..e11baf7d4e7 --- /dev/null +++ b/python/stubs/cuda.pyi @@ -0,0 +1,25 @@ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/stubs/dataset.pyi b/python/stubs/dataset.pyi new file mode 100644 index 00000000000..98f1a38aa85 --- /dev/null +++ b/python/stubs/dataset.pyi @@ -0,0 +1,229 @@ +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # 
Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + +@overload +def partitioning( + schema: Schema, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + field_names: list[str], + *, + flavor: Literal["filename"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + *, + flavor: Literal["hive"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: StrPath | Sequence[StrPath], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... +@overload +def dataset( + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... 
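The `dataset()` overloads above map the kind of `source` to the returned class: paths yield a `FileSystemDataset`, in-memory tables or record batches an `InMemoryDataset`, and a list of datasets a `UnionDataset`. A minimal usage sketch of that shape, kept fully in memory so it is self-contained (column names are illustrative only):

import pyarrow as pa
import pyarrow.dataset as ds

# An in-memory Table hits the InMemoryDataset overload; a path (or list of
# paths) would hit the FileSystemDataset overload, and a list of Dataset
# objects the UnionDataset overload.
table = pa.table({"year": [2021, 2021, 2022], "n": [3, 4, 5]})
dset = ds.dataset(table)

# Filtering takes a pyarrow.compute Expression, matching the stub signatures.
print(dset.to_table(filter=ds.field("year") == 2022))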
+def write_dataset( + data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, +): ... diff --git a/python/stubs/feather.pyi b/python/stubs/feather.pyi new file mode 100644 index 00000000000..9451ee15763 --- /dev/null +++ b/python/stubs/feather.pyi @@ -0,0 +1,50 @@ +from typing import IO, Literal + +import pandas as pd + +from _typeshed import StrPath +from pyarrow._feather import FeatherError +from pyarrow.lib import Table + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + +def check_chunked_overflow(name: str, col) -> None: ... +def write_feather( + df: pd.DataFrame | Table, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... +def read_feather( + source: StrPath | IO, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... +def read_table( + source: StrPath | IO, + columns: list[str] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... 
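A small round-trip sketch for the `feather.pyi` signatures above; the temporary path and column names are illustrative only:

import os
import tempfile

import pandas as pd
import pyarrow.feather as feather

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "demo.feather")
    # Feather v2 (the default) is the Arrow IPC file format on disk.
    feather.write_feather(df, path)
    # read_feather returns a pandas.DataFrame, read_table a pyarrow.Table.
    print(feather.read_feather(path, columns=["a"]))
    print(feather.read_table(path).schema)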
diff --git a/python/stubs/flight.pyi b/python/stubs/flight.pyi new file mode 100644 index 00000000000..9b806ccf305 --- /dev/null +++ b/python/stubs/flight.pyi @@ -0,0 +1,95 @@ +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/stubs/fs.pyi b/python/stubs/fs.pyi new file mode 100644 index 00000000000..6bf75616c13 --- /dev/null +++ b/python/stubs/fs.pyi @@ -0,0 +1,77 @@ +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, + use_threads: bool = True, +) -> None: ... + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... 
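A brief sketch of how the filesystem classes stubbed above compose: `LocalFileSystem` for direct use, and `FSSpecHandler` + `PyFileSystem` for wrapping an fsspec filesystem (the fsspec part assumes the optional fsspec package and is shown commented out):

import pyarrow.fs as fs

# List the current directory with the built-in LocalFileSystem.
local = fs.LocalFileSystem()
selector = fs.FileSelector(".", recursive=False)
for info in local.get_file_info(selector)[:3]:
    print(info.path, info.type)

# Wrapping an fsspec filesystem goes through FSSpecHandler + PyFileSystem:
# import fsspec
# memfs = fs.PyFileSystem(fs.FSSpecHandler(fsspec.filesystem("memory")))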
+ +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/stubs/gandiva.pyi b/python/stubs/gandiva.pyi new file mode 100644 index 00000000000..a344f885b29 --- /dev/null +++ b/python/stubs/gandiva.pyi @@ -0,0 +1,65 @@ +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... 
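A hedged sketch of the Gandiva expression-building flow these stubs describe; it assumes a pyarrow build with the Gandiva bindings enabled, which is not the case for every distribution:

import pyarrow as pa
import pyarrow.gandiva as gandiva  # only available in Gandiva-enabled builds

schema = pa.schema([("x", pa.int64()), ("y", pa.int64())])
batch = pa.record_batch(
    [pa.array([1, 2, 3]), pa.array([10, 20, 30])], schema=schema
)

# Build the expression tree x + y and compile it into a Projector.
builder = gandiva.TreeExprBuilder()
node_x = builder.make_field(schema.field("x"))
node_y = builder.make_field(schema.field("y"))
add = builder.make_function("add", [node_x, node_y], pa.int64())
expr = builder.make_expression(add, pa.field("x_plus_y", pa.int64()))

projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())
print(projector.evaluate(batch))  # list with one int64 Array: [11, 22, 33]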
diff --git a/python/stubs/interchange/__init__.pyi b/python/stubs/interchange/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/stubs/interchange/buffer.pyi b/python/stubs/interchange/buffer.pyi new file mode 100644 index 00000000000..46673961a75 --- /dev/null +++ b/python/stubs/interchange/buffer.pyi @@ -0,0 +1,58 @@ +import enum + +from pyarrow.lib import Buffer + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ diff --git a/python/stubs/interchange/column.pyi b/python/stubs/interchange/column.pyi new file mode 100644 index 00000000000..e6662867b6b --- /dev/null +++ b/python/stubs/interchange/column.pyi @@ -0,0 +1,252 @@ +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. 
+ """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + @property + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. 
+ + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. 
None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ diff --git a/python/stubs/interchange/dataframe.pyi b/python/stubs/interchange/dataframe.pyi new file mode 100644 index 00000000000..526a58926a9 --- /dev/null +++ b/python/stubs/interchange/dataframe.pyi @@ -0,0 +1,102 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. 
+ """ + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + def select_columns(self, indices: Sequence[int]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + def select_columns_by_name(self, names: Sequence[str]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. + """ diff --git a/python/stubs/interchange/from_dataframe.pyi b/python/stubs/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..b04b6268975 --- /dev/null +++ b/python/stubs/interchange/from_dataframe.pyi @@ -0,0 +1,244 @@ +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + +ColumnObject: TypeAlias = Any + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_attendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... } + ... ) + >>> df + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_attendees: int64 + country: large_string + ---- + n_attendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + """ + +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Array + """ + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.DictionaryArray + """ + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: + """Parse datetime `format_str` to interpret the `data`.""" + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: + """Map column date type to pyarrow date type.""" + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. + length : int + The number of values in the array. + describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. + """ + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. 
+ + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ diff --git a/python/stubs/ipc.pyi b/python/stubs/ipc.pyi new file mode 100644 index 00000000000..c7f2af004d4 --- /dev/null +++ b/python/stubs/ipc.pyi @@ -0,0 +1,123 @@ +from io import IOBase + +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... +def open_stream( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchFileWriter: ... +def open_file( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... +def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
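A minimal round-trip sketch for the IPC stream writer/reader pair typed above, kept entirely in memory via `BufferOutputStream`:

import pyarrow as pa
import pyarrow.ipc as ipc

table = pa.table({"a": [1, 2, 3]})

# Write an IPC stream into an in-memory sink, then read it back.
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)

buf = sink.getvalue()
with ipc.open_stream(buf) as reader:
    roundtripped = reader.read_all()

print(roundtripped.equals(table))  # True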
+ +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/stubs/json.pyi b/python/stubs/json.pyi new file mode 100644 index 00000000000..db1d35e0b8b --- /dev/null +++ b/python/stubs/json.pyi @@ -0,0 +1,3 @@ +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/stubs/lib.pyi b/python/stubs/lib.pyi new file mode 100644 index 00000000000..1698b55520b --- /dev/null +++ b/python/stubs/lib.pyi @@ -0,0 +1,106 @@ +# ruff: noqa: F403 +from typing import NamedTuple + +from .__lib_pxi.array import * +from .__lib_pxi.benchmark import * +from .__lib_pxi.builder import * +from .__lib_pxi.compat import * +from .__lib_pxi.config import * +from .__lib_pxi.device import * +from .__lib_pxi.error import * +from .__lib_pxi.io import * +from .__lib_pxi.ipc import * +from .__lib_pxi.memory import * +from .__lib_pxi.pandas_shim import * +from .__lib_pxi.scalar import * +from .__lib_pxi.table import * +from .__lib_pxi.tensor import * +from .__lib_pxi.types import * + +class MonthDayNano(NamedTuple): + days: int + months: int + nanoseconds: int + +def cpu_count() -> int: + """ + Return the number of threads to use in parallel operations. + + The number of threads is determined at startup by inspecting the + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. + + See Also + -------- + set_cpu_count : Modify the size of this pool. + io_thread_count : The analogous function for the I/O thread pool. + """ + +def set_cpu_count(count: int) -> None: + """ + Set the number of threads to use in parallel operations. + + Parameters + ---------- + count : int + The number of concurrent threads that should be used. + + See Also + -------- + cpu_count : Get the size of this pool. + set_io_thread_count : The analogous function for the I/O thread pool. + """ + +def is_threading_enabled() -> bool: + """ + Returns True if threading is enabled in libarrow. + + If it isn't enabled, then python shouldn't create any + threads either, because we're probably on a system where + threading doesn't work (e.g. Emscripten). 
+ """ + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/stubs/orc.pyi b/python/stubs/orc.pyi new file mode 100644 index 00000000000..2eba8d40a11 --- /dev/null +++ b/python/stubs/orc.pyi @@ -0,0 +1,279 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + +class ORCFile: + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + """ + + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: + """The file metadata, as an arrow KeyValueMetadata""" + @property + def schema(self) -> Schema: + """The file schema, as an arrow schema""" + @property + def nrows(self) -> int: + """The number of rows in the file""" + @property + def nstripes(self) -> int: + """The number of stripes in the file""" + @property + def file_version(self) -> str: + """Format version of the ORC file, must be 0.11 or 0.12""" + @property + def software_version(self) -> str: + """Software instance and version that wrote this file""" + @property + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: + """Compression codec of the file""" + @property + def compression_size(self) -> int: + """Number of bytes to buffer for the compression codec in the file""" + @property + def writer(self) -> str: + """Name of the writer that wrote this file. 
+ If the writer is unknown then its Writer ID + (a number) is returned""" + @property + def writer_version(self) -> str: + """Version of the writer""" + @property + def row_index_stride(self) -> int: + """Number of rows per an entry in the row index or 0 + if there is no row index""" + @property + def nstripe_statistics(self) -> int: + """Number of stripe statistics""" + @property + def content_length(self) -> int: + """Length of the data stripes in the file in bytes""" + @property + def stripe_statistics_length(self) -> int: + """The number of compressed bytes in the file stripe statistics""" + @property + def file_footer_length(self) -> int: + """The number of compressed bytes in the file footer""" + @property + def file_postscript_length(self) -> int: + """The number of bytes in the file postscript""" + @property + def file_length(self) -> int: + """The number of bytes in the file""" + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.RecordBatch + Content of the stripe as a RecordBatch. + """ + def read(self, columns: list[str] | None = None) -> Table: + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. Output always follows the + ordering of the file and not the `columns` list. + + Returns + ------- + pyarrow.Table + Content of the file as a Table. + """ + +class ORCWriter: + """ + Writer interface for a single ORC file + + Parameters + ---------- + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. 
+ """ + + writer: _orc.ORCWriter + is_open: bool + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def write(self, table: Table) -> None: + """ + Write the table into an ORC file. The schema of the table must + be equal to the schema used when opening the ORC file. + + Parameters + ---------- + table : pyarrow.Table + The table to be written into the ORC file + """ + def close(self) -> None: + """ + Close the ORC file + """ + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Table: + """ + Read a Table from an ORC file. + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name. For file-like objects, + only read a single file. Use pyarrow.BufferReader to read a file + contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. Output always follows the ordering of the file and + not the `columns` list. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + """ + +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, +) -> None: + """ + Write a table into an ORC file. + + Parameters + ---------- + table : pyarrow.lib.Table + The table to be written into the ORC file + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. 
+ Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ diff --git a/python/stubs/pandas_compat.pyi b/python/stubs/pandas_compat.pyi new file mode 100644 index 00000000000..efbd05ac2fe --- /dev/null +++ b/python/stubs/pandas_compat.pyi @@ -0,0 +1,54 @@ +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... 
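A minimal usage sketch of the ORC annotations introduced above (illustrative only; the
file name, table contents, and compression choice are assumptions, not taken from the
stubs). It exercises orc.write_table, the typed ORCFile properties, and stripe-level
reads exactly as they are annotated in orc.pyi:

    # Sketch: exercises the pyarrow.orc API as annotated in orc.pyi.
    # "example.orc" and the table below are hypothetical.
    import pyarrow as pa
    import pyarrow.orc as orc

    table = pa.table({"n_legs": [2, 4, 100], "animal": ["Flamingo", "Dog", "Centipede"]})

    # write_table is annotated to accept StrPath | NativeFile | IO targets,
    # with a keyword-only compression literal.
    orc.write_table(table, "example.orc", compression="ZSTD")

    # ORCFile exposes typed metadata (nstripes -> int, schema -> Schema);
    # read_stripe(i) -> RecordBatch and read() / orc.read_table(...) -> Table.
    f = orc.ORCFile("example.orc")
    assert f.nstripes >= 1
    batch = f.read_stripe(0, columns=["animal"])
    roundtrip = orc.read_table("example.orc")
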
diff --git a/python/stubs/parquet/__init__.pyi b/python/stubs/parquet/__init__.pyi new file mode 100644 index 00000000000..4ef88705809 --- /dev/null +++ b/python/stubs/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * # noqa diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi new file mode 100644 index 00000000000..56b2c8447d9 --- /dev/null +++ b/python/stubs/parquet/core.pyi @@ -0,0 +1,2061 @@ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: + """ + Check if filters are well-formed and convert to an ``Expression``. + + Parameters + ---------- + filters : List[Tuple] or List[List[Tuple]] + + Notes + ----- + See internal ``pyarrow._DNF_filter_doc`` attribute for more details. + + Examples + -------- + + >>> filters_to_expression([("foo", "==", "bar")]) + + + Returns + ------- + pyarrow.compute.Expression + An Expression representing the filters + """ + +@deprecated("use filters_to_expression") +def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + +_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] + +class ParquetFile: + """ + Reader interface for a single Parquet file. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarrow.BufferReader. + metadata : FileMetaData, default None + Use existing metadata object, rather than reading from file. + common_metadata : FileMetaData, default None + Will be used in reads for pandas schema metadata if not found in the + main file's metadata, no other uses at the moment. + read_dictionary : list + List of column names to read directly as DictionaryArray. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. 
+ pre_buffer : bool, default False + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties, default None + File decryption properties for Parquet Modular Encryption. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Examples + -------- + + Generate an example PyArrow Table and write it to Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Create a ``ParquetFile`` object from the Parquet file: + + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the data: + + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Create a ParquetFile object with "animal" column as DictionaryArray: + + >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [ -- dictionary: + ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: + [0,1,2,3,4,5]] + """ + + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: + """ + Return the Parquet metadata. 
+ """ + @property + def schema(self) -> ParquetSchema: + """ + Return the Parquet schema, unconverted to Arrow types + """ + @property + def schema_arrow(self) -> Schema: + """ + Return the inferred Arrow schema, converted from the whole Parquet + file's schema + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the Arrow schema: + + >>> parquet_file.schema_arrow + n_legs: int64 + animal: string + """ + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups of the Parquet file. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.num_row_groups + 1 + """ + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + def read_row_group( + self, + i: int, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a single row group from a Parquet file. + + Parameters + ---------- + i : int + Index of the individual row group that we want to read. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row group as a table (of columns) + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_group(0) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def read_row_groups( + self, + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a multiple row groups from a Parquet file. + + Parameters + ---------- + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row groups as a table (of columns). 
+ + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_groups([0, 0]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] + """ + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: + """ + Read streaming batches from a Parquet file. + + Parameters + ---------- + batch_size : int, default 64K + Maximum number of records to yield per batch. Batches may be + smaller if there aren't enough rows in the file. + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : boolean, default True + Perform multi-threaded column reads. + use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Yields + ------ + pyarrow.RecordBatch + Contents of each batch as a record batch + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + >>> for i in parquet_file.iter_batches(): + ... print("RecordBatch") + ... print(i.to_pandas()) + RecordBatch + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + def read( + self, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a Table from Parquet format. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read a Table: + + >>> parquet_file.read(columns=["animal"]) + pyarrow.Table + animal: string + ---- + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: + """ + Read contents of file for the given columns and batch size. + + Notes + ----- + This function's primary purpose is benchmarking. + The scan is executed on a single thread. + + Parameters + ---------- + columns : list of integers, default None + Select columns to read, if None scan all columns. + batch_size : int, default 64K + Number of rows to read at a time internally. + + Returns + ------- + num_rows : int + Number of rows in file + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.scan_contents() + 6 + """ + +class ParquetWriter: + """ + Class for incrementally building a Parquet file for Arrow tables. + + Parameters + ---------- + where : path or file-like object + schema : pyarrow.Schema + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. 
+ If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_stream are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files `_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here `_, defaults to ``True``. 
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + writer_engine_version : unused + **options : dict + If options contains a key `metadata_collector` then the + corresponding value is assumed to be a list (or any object with + `.append` method) that will be filled with the file metadata instance + of the written file. + + Examples + -------- + Generate an example PyArrow Table and RecordBatch: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.record_batch( + ... [ + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... ], + ... names=["n_legs", "animal"], + ... ) + + create a ParquetWriter object: + + >>> import pyarrow.parquet as pq + >>> writer = pq.ParquetWriter("example.parquet", table.schema) + + and write the Table into the Parquet file: + + >>> writer.write_table(table) + >>> writer.close() + + >>> pq.read_table("example.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + create a ParquetWriter object for the RecordBatch: + + >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) + + and write the RecordBatch into the Parquet file: + + >>> writer2.write_batch(batch) + >>> writer2.close() + + >>> pq.read_table("example2.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: + """ + Write RecordBatch or Table to the Parquet file. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the input + table or batch length and 1024 * 1024. + """ + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: + """ + Write RecordBatch to the Parquet file. 
+ + Parameters + ---------- + batch : RecordBatch + row_group_size : int, default None + Maximum number of rows in written row group. If None, the + row group size will be the minimum of the RecordBatch + size and 1024 * 1024. If set larger than 64Mi then 64Mi + will be used instead. + """ + def write_table(self, table: Table, row_group_size: int | None = None) -> None: + """ + Write Table to the Parquet file. + + Parameters + ---------- + table : Table + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the Table size + and 1024 * 1024. If set larger than 64Mi then 64Mi will + be used instead. + + """ + def close(self) -> None: + """ + Close the connection to the Parquet file. + """ + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: + """ + Add key-value metadata to the file. + This will overwrite any existing metadata with the same key. + + Parameters + ---------- + key_value_metadata : dict + Keys and values must be string-like / coercible to bytes. + """ + +class ParquetDataset: + """ + Encapsulates details of reading a complete Parquet dataset possibly + consisting of multiple files and partitions in subdirectories. + + Parameters + ---------- + path_or_paths : str or List[str] + A directory name, single file name, or list of file names. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : pyarrow.parquet.Schema + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. 
code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. Set to False if you want to prioritize minimal memory usage + over maximum speed. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular resolution + (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 + timestamps will be inferred as timestamps in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + + Examples + -------- + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) + + create a ParquetDataset object from the dataset source: + + >>> dataset = pq.ParquetDataset("dataset_v2/") + + and read the data: + + >>> dataset.read().to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + create a ParquetDataset object with filter: + + >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) + >>> dataset.read().to_pandas() + n_legs animal year + 0 4 Dog 2021 + 1 4 Horse 2022 + """ + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: + """ + Schema of the Dataset. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_schema/") + + Read the schema: + + >>> dataset.schema + n_legs: int64 + animal: string + year: dictionary + """ + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read (multiple) Parquet files as a single pyarrow.Table. + + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. The partition fields + are not automatically included. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_read/") + + Read the dataset: + + >>> dataset.read(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[5],[2],[4,100],[2,4]] + """ + def read_pandas(self, **kwargs) -> Table: + """ + Read dataset including pandas metadata, if any. 
Other arguments passed + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` + + Examples + -------- + Generate an example parquet file: + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "table_V2.parquet") + >>> dataset = pq.ParquetDataset("table_V2.parquet") + + Read the dataset with pandas metadata: + + >>> dataset.read_pandas(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,2,4,4,5,100]] + + >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} + """ + @property + def fragments(self) -> list[ParquetFileFragment]: + """ + A list of the Dataset source fragments or pieces with absolute + file paths. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") + + List the fragments: + + >>> dataset.fragments + [ list[str]: + """ + A list of absolute Parquet file paths in the Dataset source. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_files/") + + List the files: + + >>> dataset.files + ['dataset_v2_files/year=2019/...-0.parquet', ... + """ + @property + def filesystem(self) -> FileSystem: + """ + The filesystem type of the Dataset source. + """ + @property + def partitioning(self) -> Partitioning: + """ + The partitioning of the Dataset source, if discovered. 
+ """ + +def read_table( + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + filesystem: SupportedFileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: + """ + Read a Table from Parquet format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. 
+ Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns) + + + Examples + -------- + + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) + + Read the data: + + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + + Read only a subset of columns: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] + + Read a subset of columns and read one column as DictionaryArray: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [ -- dictionary: + ["Brittle stars"] -- indices: + [0], -- dictionary: + ["Flamingo"] -- indices: + [0], -- dictionary: + ["Dog","Centipede"] -- indices: + [0,1], -- dictionary: + ["Parrot","Horse"] -- indices: + [0,1]] + + Read the table with filter: + + >>> pq.read_table( + ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] + ... ).to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + + Read data from a single Parquet file: + + >>> pq.write_table(table, "example.parquet") + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + """ + +def read_pandas( + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs +) -> Table: + """ + + Read a Table from Parquet format, also reading DataFrame + index values if known in the file metadata + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. 
The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + **kwargs + additional options for :func:`read_table` + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. 
The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns + """ + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: + """ + + Write a Table to Parquet format. + + Parameters + ---------- + table : pyarrow.Table + where : string or pyarrow.NativeFile + row_group_size : int + Maximum number of rows in each written row group. If None, the + row group size will be the minimum of the Table size and + 1024 * 1024. + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. 
+    coerce_timestamps : str, default None
+        Cast timestamps to a particular resolution. If omitted, defaults are chosen
+        depending on `version`. For ``version='1.0'`` and ``version='2.4'``,
+        nanoseconds are cast to microseconds ('us'), while for
+        ``version='2.6'`` (the default), they are written natively without loss
+        of resolution. Seconds are always cast to milliseconds ('ms') by default,
+        as Parquet does not have any temporal type with seconds resolution.
+        If the casting results in loss of data, it will raise an exception
+        unless ``allow_truncated_timestamps=True`` is given.
+        Valid values: {None, 'ms', 'us'}
+    allow_truncated_timestamps : bool, default False
+        Allow loss of data when coercing timestamps to a particular
+        resolution. E.g. if microsecond or nanosecond data is lost when coercing to
+        'ms', do not raise an exception. Passing ``allow_truncated_timestamps=True``
+        will NOT result in the truncation exception being ignored unless
+        ``coerce_timestamps`` is not None.
+    data_page_size : int, default None
+        Set a target threshold for the approximate encoded size of data
+        pages within a column chunk (in bytes). If None, use the default data page
+        size of 1MByte.
+    flavor : {'spark'}, default None
+        Sanitize schema or set other compatibility options to work with
+        various target systems.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred from `where` if path-like, else
+        `where` is already a file-like object so no filesystem is needed.
+    compression_level : int or dict, default None
+        Specify the compression level for a codec, either on a general basis or
+        per-column. If None is passed, arrow selects the compression level for
+        the compression codec in use. The compression level has a different
+        meaning for each codec, so you have to read the documentation of the
+        codec you are using.
+        An exception is thrown if the compression codec does not allow specifying
+        a compression level.
+    use_byte_stream_split : bool or list, default False
+        Specify if the byte_stream_split encoding should be used in general or
+        only for some columns. If both dictionary and byte_stream_split are
+        enabled, then dictionary is preferred.
+        The byte_stream_split encoding is valid for integer, floating-point
+        and fixed-size binary data types (including decimals); it should be
+        combined with a compression codec so as to achieve size reduction.
+    column_encoding : string or dict, default None
+        Specify the encoding scheme on a per column basis.
+        Can only be used when ``use_dictionary`` is set to False, and
+        cannot be used in combination with ``use_byte_stream_split``.
+        Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT',
+        'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}.
+        Certain encodings are only compatible with certain data types.
+        Please refer to the encodings section of `Reading and writing Parquet
+        files `_.
+    data_page_version : {"1.0", "2.0"}, default "1.0"
+        The serialized Parquet data page format version to write, defaults to
+        1.0. This does not impact the file schema logical types and Arrow to
+        Parquet type casting behavior; for that use the "version" option.
+    use_compliant_nested_type : bool, default True
+        Whether to write compliant Parquet nested type (lists) as defined
+        `here `_, defaults to ``True``.
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + + **kwargs : optional + Additional options for ParquetWriter + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write the Table into Parquet file: + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Defining row group size for the Parquet file: + + >>> pq.write_table(table, "example.parquet", row_group_size=3) + + Defining row group compression (default is Snappy): + + >>> pq.write_table(table, "example.parquet", compression="none") + + Defining row group compression and encoding per-column: + + >>> pq.write_table( + ... table, + ... "example.parquet", + ... compression={"n_legs": "snappy", "animal": "gzip"}, + ... use_dictionary=["n_legs", "animal"], + ... ) + + Defining column encoding per-column: + + >>> pq.write_table( + ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False + ... ) + """ + +def write_to_dataset( + table: Table, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: + """ + Wrapper around dataset.write_dataset for writing a Table to + Parquet format by partitions. + For each combination of partition columns and values, + a subdirectories are created in the following + manner: + + root_dir/ + group1=value1 + group2=value1 + .parquet + group2=value2 + .parquet + group1=valueN + group2=value1 + .parquet + group2=valueN + .parquet + + Parameters + ---------- + table : pyarrow.Table + root_path : str, pathlib.Path + The root directory of the dataset. + partition_cols : list, + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : Schema, optional + This Schema of the dataset. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the + ``pyarrow.dataset.partitioning()`` function or a list of field names. + When providing a list of field names, you can use + ``partitioning_flavor`` to drive which partitioning type should be + used. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to "guid-{i}.parquet". + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used determined by the number of available CPU cores. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute. 
+ + The path attribute will be a string containing the path to + the created file. + + The metadata attribute will be the parquet metadata of the file. + This metadata will have the file path attribute set and can be used + to build a _metadata file. The metadata attribute will be None if + the format is not parquet. + + Example visitor which simple collects the filenames created:: + + visited_paths = [] + + def file_visitor(written_file): + visited_paths.append(written_file.path) + + existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' + Controls how the dataset will handle data that already exists in + the destination. The default behaviour is 'overwrite_or_ignore'. + + 'overwrite_or_ignore' will ignore any existing data and will + overwrite files with the same name as an output file. Other + existing files will be ignored. This behavior, in combination + with a unique basename_template for each write, will allow for + an append workflow. + + 'error' will raise an error if any data exists in the destination. + + 'delete_matching' is useful when you are writing a partitioned + dataset. The first time each partition directory is encountered + the entire directory will be deleted. This allows you to overwrite + old partitions completely. + **kwargs : dict, + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. + Using `metadata_collector` in kwargs allows one to collect the + file metadata instances of dataset pieces. The file paths in the + ColumnChunkMetaData will be set relative to `root_path`. + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write it to a partitioned dataset: + + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) + >>> pq.ParquetDataset("dataset_name_3").files + ['dataset_name_3/year=2019/...-0.parquet', ... + + Write a single Parquet file into the root folder: + + >>> pq.write_to_dataset(table, root_path="dataset_name_4") + >>> pq.ParquetDataset("dataset_name_4/").files + ['dataset_name_4/...-0.parquet'] + """ + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: + """ + Write metadata-only Parquet file from schema. This can be used with + `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar + files. + + Parameters + ---------- + schema : pyarrow.Schema + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + **kwargs : dict, + Additional kwargs for ParquetWriter class. See docstring for + `ParquetWriter` for more information. + + Examples + -------- + Generate example data: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Write a dataset and collect metadata information. + + >>> metadata_collector = [] + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) + + Write the `_common_metadata` parquet file without row groups statistics. + + >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") + + Write the `_metadata` parquet file with row groups statistics. + + >>> pq.write_metadata( + ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector + ... ) + """ + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> FileMetaData: + """ + Read FileMetaData from footer of a single Parquet file. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + metadata : FileMetaData + The metadata of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_metadata("example.parquet") + + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + """ + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Schema: + """ + Read effective Arrow schema from Parquet file metadata. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. 
+ + Returns + ------- + schema : pyarrow.Schema + The schema of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_schema("example.parquet") + n_legs: int64 + animal: string + """ diff --git a/python/stubs/parquet/encryption.pyi b/python/stubs/parquet/encryption.pyi new file mode 100644 index 00000000000..5a77dae7ef7 --- /dev/null +++ b/python/stubs/parquet/encryption.pyi @@ -0,0 +1,15 @@ +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/stubs/substrait.pyi b/python/stubs/substrait.pyi new file mode 100644 index 00000000000..a56a8a5b40f --- /dev/null +++ b/python/stubs/substrait.pyi @@ -0,0 +1,21 @@ +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/stubs/types.pyi b/python/stubs/types.pyi new file mode 100644 index 00000000000..0cb4f6171d3 --- /dev/null +++ b/python/stubs/types.pyi @@ -0,0 +1,194 @@ +import sys + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + Uint32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | 
_Union +) + +def is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... 
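A usage sketch (illustrative only, not part of the stub file itself): because the predicates above are annotated with ``TypeIs[...]`` rather than plain ``bool``, a type checker that understands ``TypeIs`` narrows a generic ``DataType`` inside the guarded branch, so type-specific attributes resolve without casts. The ``describe`` helper below is hypothetical; the pyarrow calls are the public API::

    import pyarrow as pa
    import pyarrow.types as pa_types

    def describe(t: pa.DataType) -> str:
        # is_timestamp() is declared above as TypeIs[TimestampType], so the
        # checker narrows `t` here and `t.unit` / `t.tz` type-check.
        if pa_types.is_timestamp(t):
            return f"timestamp[{t.unit}, tz={t.tz}]"
        # is_decimal() narrows to the union of the decimal types, all of which
        # expose `precision` and `scale`.
        if pa_types.is_decimal(t):
            return f"decimal({t.precision}, {t.scale})"
        return str(t)

    describe(pa.timestamp("ms", tz="UTC"))  # 'timestamp[ms, tz=UTC]'
    describe(pa.decimal128(38, 9))          # 'decimal(38, 9)'

With plain ``bool`` return types the same attribute accesses would need ``cast`` or ``# type: ignore``; the ``TypeIs`` annotations are what let the narrowing flow through.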
+ +__all__ = [ + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] diff --git a/python/stubs/util.pyi b/python/stubs/util.pyi new file mode 100644 index 00000000000..c2ecf7d6b61 --- /dev/null +++ b/python/stubs/util.pyi @@ -0,0 +1,27 @@ +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... From f25ff2b0e13f323f859bb63de445553a8781076d Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sun, 20 Jul 2025 10:00:52 +0200 Subject: [PATCH 218/231] Add `ty` configuration and suppress error codes --- python/pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyproject.toml b/python/pyproject.toml index b573b8843e7..f839c9a1e9f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -97,3 +97,9 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '21.0.0a0' + +[tool.ty] +environment.root = [ + "./pyarrow", +] +rules = {call-non-callable = "ignore", invalid-argument-type = "ignore", invalid-assignment = "ignore", invalid-context-manager = "ignore", invalid-ignore-comment = "ignore", invalid-return-type = "ignore", invalid-type-form = "ignore", missing-argument = "ignore", possibly-unbound-import = "ignore", no-matching-overload = "ignore", non-subscriptable = "ignore", not-iterable = "ignore", possibly-unbound-attribute = "ignore", too-many-positional-arguments = "ignore", unknown-argument = "ignore", unresolved-attribute = "ignore", unresolved-global = "ignore", unresolved-import = "ignore", unresolved-reference = "ignore", unsupported-operator = "ignore"} From 00516a2d0b5776e1b0d4dc29df57bb2daf808c1e Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 11:40:11 +0200 Subject: [PATCH 219/231] One line per rule --- python/pyproject.toml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index f839c9a1e9f..49ff3ae3f86 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -102,4 +102,23 @@ fallback_version = '21.0.0a0' environment.root = [ "./pyarrow", ] -rules = {call-non-callable = "ignore", invalid-argument-type = "ignore", invalid-assignment = "ignore", invalid-context-manager = "ignore", invalid-ignore-comment = "ignore", invalid-return-type = "ignore", invalid-type-form = "ignore", missing-argument = "ignore", possibly-unbound-import = "ignore", no-matching-overload = "ignore", non-subscriptable = "ignore", not-iterable = "ignore", possibly-unbound-attribute = "ignore", too-many-positional-arguments = "ignore", unknown-argument = "ignore", unresolved-attribute = "ignore", unresolved-global = "ignore", unresolved-import = "ignore", unresolved-reference = "ignore", unsupported-operator = "ignore"} +rules.call-non-callable = "ignore" +rules.invalid-argument-type = "ignore" +rules.invalid-assignment = "ignore" +rules.invalid-context-manager = "ignore" +rules.invalid-ignore-comment = "ignore" +rules.invalid-return-type = "ignore" +rules.invalid-type-form = "ignore" +rules.missing-argument = "ignore" +rules.no-matching-overload = "ignore" +rules.non-subscriptable = "ignore" +rules.not-iterable = "ignore" +rules.possibly-unbound-attribute = "ignore" +rules.possibly-unbound-import = "ignore" +rules.too-many-positional-arguments = "ignore" +rules.unknown-argument = "ignore" +rules.unresolved-attribute = "ignore" +rules.unresolved-global = "ignore" +rules.unresolved-import = "ignore" +rules.unresolved-reference = "ignore" +rules.unsupported-operator = "ignore" From 1631f3916479ce9e1fd7df1194f61cb420962fd5 Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sun, 20 Jul 2025 12:17:06 +0200 Subject: [PATCH 220/231] Add licence header from original repo for all `.pyi` files --- python/stubs/__init__.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/array.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/benchmark.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/builder.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/compat.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/config.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/device.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/error.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/io.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/ipc.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/memory.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/pandas_shim.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/scalar.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/table.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/tensor.pyi | 25 ++++++++++++++++++ python/stubs/__lib_pxi/types.pyi | 25 ++++++++++++++++++ python/stubs/_azurefs.pyi | 25 ++++++++++++++++++ python/stubs/_compute.pyi | 25 ++++++++++++++++++ python/stubs/_csv.pyi | 25 ++++++++++++++++++ python/stubs/_cuda.pyi | 25 ++++++++++++++++++ python/stubs/_dataset.pyi | 25 ++++++++++++++++++ python/stubs/_dataset_orc.pyi | 25 ++++++++++++++++++ python/stubs/_dataset_parquet.pyi | 25 ++++++++++++++++++ python/stubs/_dataset_parquet_encryption.pyi | 25 ++++++++++++++++++ python/stubs/_feather.pyi | 25 ++++++++++++++++++ python/stubs/_flight.pyi | 25 ++++++++++++++++++ python/stubs/_fs.pyi | 25 ++++++++++++++++++ python/stubs/_gcsfs.pyi | 25 ++++++++++++++++++ python/stubs/_hdfs.pyi | 25 ++++++++++++++++++ python/stubs/_json.pyi | 25 ++++++++++++++++++ python/stubs/_orc.pyi | 25 ++++++++++++++++++ python/stubs/_parquet.pyi | 25 ++++++++++++++++++ python/stubs/_parquet_encryption.pyi | 25 ++++++++++++++++++ python/stubs/_s3fs.pyi | 25 ++++++++++++++++++ python/stubs/_stubs_typing.pyi | 25 ++++++++++++++++++ python/stubs/_substrait.pyi | 25 ++++++++++++++++++ python/stubs/acero.pyi | 25 ++++++++++++++++++ python/stubs/benchmark.pyi | 25 ++++++++++++++++++ python/stubs/cffi.pyi | 25 ++++++++++++++++++ python/stubs/compute.pyi | 25 ++++++++++++++++++ python/stubs/csv.pyi | 25 ++++++++++++++++++ python/stubs/cuda.pyi | 25 ++++++++++++++++++ python/stubs/dataset.pyi | 25 ++++++++++++++++++ python/stubs/feather.pyi | 25 ++++++++++++++++++ python/stubs/flight.pyi | 25 ++++++++++++++++++ python/stubs/fs.pyi | 25 ++++++++++++++++++ python/stubs/gandiva.pyi | 25 ++++++++++++++++++ python/stubs/interchange/buffer.pyi | 25 ++++++++++++++++++ python/stubs/interchange/column.pyi | 25 ++++++++++++++++++ python/stubs/interchange/dataframe.pyi | 25 ++++++++++++++++++ python/stubs/interchange/from_dataframe.pyi | 25 ++++++++++++++++++ python/stubs/ipc.pyi | 25 ++++++++++++++++++ python/stubs/json.pyi | 25 ++++++++++++++++++ python/stubs/lib.pyi | 25 ++++++++++++++++++ python/stubs/orc.pyi | 25 ++++++++++++++++++ python/stubs/pandas_compat.pyi | 25 ++++++++++++++++++ python/stubs/parquet/__init__.pyi | 27 +++++++++++++++++++- python/stubs/parquet/core.pyi | 25 ++++++++++++++++++ python/stubs/parquet/encryption.pyi | 25 ++++++++++++++++++ python/stubs/substrait.pyi | 25 ++++++++++++++++++ python/stubs/types.pyi | 25 ++++++++++++++++++ python/stubs/util.pyi | 25 ++++++++++++++++++ 62 files changed, 1551 insertions(+), 1 deletion(-) diff --git a/python/stubs/__init__.pyi b/python/stubs/__init__.pyi index 8a0d1e870c5..6567e3221c4 
100644 --- a/python/stubs/__init__.pyi +++ b/python/stubs/__init__.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # ruff: noqa: F401, I001, E402 __version__: str diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index ec1cda30a88..030ece2ab75 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/benchmark.pyi b/python/stubs/__lib_pxi/benchmark.pyi index 66981bf0f51..0d0f88cc201 100644 --- a/python/stubs/__lib_pxi/benchmark.pyi +++ b/python/stubs/__lib_pxi/benchmark.pyi @@ -1 +1,26 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/stubs/__lib_pxi/builder.pyi b/python/stubs/__lib_pxi/builder.pyi index 4a0e9ca4708..7a2a9d24827 100644 --- a/python/stubs/__lib_pxi/builder.pyi +++ b/python/stubs/__lib_pxi/builder.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Iterable from pyarrow.lib import MemoryPool, _Weakrefable diff --git a/python/stubs/__lib_pxi/compat.pyi b/python/stubs/__lib_pxi/compat.pyi index ae667be453e..0011c1507cb 100644 --- a/python/stubs/__lib_pxi/compat.pyi +++ b/python/stubs/__lib_pxi/compat.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + def encode_file_path(path: str | bytes) -> bytes: ... def tobytes(o: str | bytes) -> bytes: ... def frombytes(o: bytes, *, safe: bool = False): ... diff --git a/python/stubs/__lib_pxi/config.pyi b/python/stubs/__lib_pxi/config.pyi index 166e10c9734..aecf0088e4e 100644 --- a/python/stubs/__lib_pxi/config.pyi +++ b/python/stubs/__lib_pxi/config.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import NamedTuple class VersionInfo(NamedTuple): diff --git a/python/stubs/__lib_pxi/device.pyi b/python/stubs/__lib_pxi/device.pyi index d1b9f39eedd..9dd8d889476 100644 --- a/python/stubs/__lib_pxi/device.pyi +++ b/python/stubs/__lib_pxi/device.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import enum from pyarrow.lib import _Weakrefable diff --git a/python/stubs/__lib_pxi/error.pyi b/python/stubs/__lib_pxi/error.pyi index 981ed51e680..6e3fca3c5aa 100644 --- a/python/stubs/__lib_pxi/error.pyi +++ b/python/stubs/__lib_pxi/error.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi index d882fd79d57..a35d3b0c7c2 100644 --- a/python/stubs/__lib_pxi/io.pyi +++ b/python/stubs/__lib_pxi/io.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys from collections.abc import Callable diff --git a/python/stubs/__lib_pxi/ipc.pyi b/python/stubs/__lib_pxi/ipc.pyi index 3d72892061e..aa071e266d0 100644 --- a/python/stubs/__lib_pxi/ipc.pyi +++ b/python/stubs/__lib_pxi/ipc.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import enum import sys diff --git a/python/stubs/__lib_pxi/memory.pyi b/python/stubs/__lib_pxi/memory.pyi index 57a3bb4f1b3..a4fdaa66136 100644 --- a/python/stubs/__lib_pxi/memory.pyi +++ b/python/stubs/__lib_pxi/memory.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi index 0e80fae4ebf..cb7f2a590a4 100644 --- a/python/stubs/__lib_pxi/pandas_shim.pyi +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from types import ModuleType from typing import Any, Iterable, TypeGuard diff --git a/python/stubs/__lib_pxi/scalar.pyi b/python/stubs/__lib_pxi/scalar.pyi index 81ab5012067..ce0e6edccaf 100644 --- a/python/stubs/__lib_pxi/scalar.pyi +++ b/python/stubs/__lib_pxi/scalar.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import collections.abc import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index ad9d0392137..4bf090f1e8a 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/tensor.pyi b/python/stubs/__lib_pxi/tensor.pyi index d849abd0f1f..e6883a0dfcd 100644 --- a/python/stubs/__lib_pxi/tensor.pyi +++ b/python/stubs/__lib_pxi/tensor.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index 7fe6c36e332..f22c03faa4c 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt import sys diff --git a/python/stubs/_azurefs.pyi b/python/stubs/_azurefs.pyi index 317943ce20f..79bb7a2e8cd 100644 --- a/python/stubs/_azurefs.pyi +++ b/python/stubs/_azurefs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Literal from ._fs import FileSystem diff --git a/python/stubs/_compute.pyi b/python/stubs/_compute.pyi index 3d61ae42787..4c2cb434f84 100644 --- a/python/stubs/_compute.pyi +++ b/python/stubs/_compute.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import ( Any, Callable, diff --git a/python/stubs/_csv.pyi b/python/stubs/_csv.pyi index 2f49f8c9a6c..7fc06a62a3d 100644 --- a/python/stubs/_csv.pyi +++ b/python/stubs/_csv.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from dataclasses import dataclass, field from typing import IO, Any, Callable, Literal diff --git a/python/stubs/_cuda.pyi b/python/stubs/_cuda.pyi index ad52b2f380f..44ef7d13ff1 100644 --- a/python/stubs/_cuda.pyi +++ b/python/stubs/_cuda.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Any import cuda # type: ignore[import-not-found] diff --git a/python/stubs/_dataset.pyi b/python/stubs/_dataset.pyi index af864f9154b..deddde37086 100644 --- a/python/stubs/_dataset.pyi +++ b/python/stubs/_dataset.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/_dataset_orc.pyi b/python/stubs/_dataset_orc.pyi index 9c4ac04198f..453779cd15d 100644 --- a/python/stubs/_dataset_orc.pyi +++ b/python/stubs/_dataset_orc.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from ._dataset import FileFormat class OrcFileFormat(FileFormat): diff --git a/python/stubs/_dataset_parquet.pyi b/python/stubs/_dataset_parquet.pyi index cbcc17235f1..64f3ae0a5b0 100644 --- a/python/stubs/_dataset_parquet.pyi +++ b/python/stubs/_dataset_parquet.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from dataclasses import dataclass from typing import IO, Any, Iterable, TypedDict diff --git a/python/stubs/_dataset_parquet_encryption.pyi b/python/stubs/_dataset_parquet_encryption.pyi index 7623275b865..c2bd650db61 100644 --- a/python/stubs/_dataset_parquet_encryption.pyi +++ b/python/stubs/_dataset_parquet_encryption.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions from ._parquet import FileDecryptionProperties from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig diff --git a/python/stubs/_feather.pyi b/python/stubs/_feather.pyi index 8bb914ba45d..2ee7db77e45 100644 --- a/python/stubs/_feather.pyi +++ b/python/stubs/_feather.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import IO from _typeshed import StrPath diff --git a/python/stubs/_flight.pyi b/python/stubs/_flight.pyi index 4450c42df49..6802218e944 100644 --- a/python/stubs/_flight.pyi +++ b/python/stubs/_flight.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import asyncio import enum import sys diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 7670ef5230d..35f04222dd4 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt import enum import sys diff --git a/python/stubs/_gcsfs.pyi b/python/stubs/_gcsfs.pyi index 4fc7ea68e48..c2d554273cb 100644 --- a/python/stubs/_gcsfs.pyi +++ b/python/stubs/_gcsfs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt from ._fs import FileSystem diff --git a/python/stubs/_hdfs.pyi b/python/stubs/_hdfs.pyi index 200f669379b..ec5c9e8b9ad 100644 --- a/python/stubs/_hdfs.pyi +++ b/python/stubs/_hdfs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from _typeshed import StrPath from ._fs import FileSystem diff --git a/python/stubs/_json.pyi b/python/stubs/_json.pyi index 43d2ae83cd8..52dac59b7bb 100644 --- a/python/stubs/_json.pyi +++ b/python/stubs/_json.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import IO, Any, Literal from _typeshed import StrPath diff --git a/python/stubs/_orc.pyi b/python/stubs/_orc.pyi index 71bf0dde9ba..d80e1720d51 100644 --- a/python/stubs/_orc.pyi +++ b/python/stubs/_orc.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import IO, Literal from .lib import ( diff --git a/python/stubs/_parquet.pyi b/python/stubs/_parquet.pyi index a9187df0428..439fdec47b4 100644 --- a/python/stubs/_parquet.pyi +++ b/python/stubs/_parquet.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict from _typeshed import StrPath diff --git a/python/stubs/_parquet_encryption.pyi b/python/stubs/_parquet_encryption.pyi index c707edb844a..68b3eac87e5 100644 --- a/python/stubs/_parquet_encryption.pyi +++ b/python/stubs/_parquet_encryption.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt from typing import Callable diff --git a/python/stubs/_s3fs.pyi b/python/stubs/_s3fs.pyi index fc13c498bd9..64f6f37ab75 100644 --- a/python/stubs/_s3fs.pyi +++ b/python/stubs/_s3fs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import enum from typing import Literal, NotRequired, Required, TypedDict diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi index c259513f1ea..980d1aaa4bc 100644 --- a/python/stubs/_stubs_typing.pyi +++ b/python/stubs/_stubs_typing.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import datetime as dt from collections.abc import Sequence diff --git a/python/stubs/_substrait.pyi b/python/stubs/_substrait.pyi index ff226e9521b..309c08c1e07 100644 --- a/python/stubs/_substrait.pyi +++ b/python/stubs/_substrait.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Any, Callable from ._compute import Expression diff --git a/python/stubs/acero.pyi b/python/stubs/acero.pyi index 8a520bdc24a..cd4675e7010 100644 --- a/python/stubs/acero.pyi +++ b/python/stubs/acero.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/benchmark.pyi b/python/stubs/benchmark.pyi index 048973301dc..0e5141f3a19 100644 --- a/python/stubs/benchmark.pyi +++ b/python/stubs/benchmark.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow.lib import benchmark_PandasObjectIsNull __all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/stubs/cffi.pyi b/python/stubs/cffi.pyi index 2ae945c5974..6b437fb5a2f 100644 --- a/python/stubs/cffi.pyi +++ b/python/stubs/cffi.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import cffi c_source: str diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 8d8fc35b134..b7ba840981b 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # ruff: noqa: I001 from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence from collections.abc import Callable diff --git a/python/stubs/csv.pyi b/python/stubs/csv.pyi index 510229d7e72..c6ff540adb5 100644 --- a/python/stubs/csv.pyi +++ b/python/stubs/csv.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._csv import ( ISO8601, ConvertOptions, diff --git a/python/stubs/cuda.pyi b/python/stubs/cuda.pyi index e11baf7d4e7..491f9d76581 100644 --- a/python/stubs/cuda.pyi +++ b/python/stubs/cuda.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._cuda import ( BufferReader, BufferWriter, diff --git a/python/stubs/dataset.pyi b/python/stubs/dataset.pyi index 98f1a38aa85..b3002695e8c 100644 --- a/python/stubs/dataset.pyi +++ b/python/stubs/dataset.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload from _typeshed import StrPath diff --git a/python/stubs/feather.pyi b/python/stubs/feather.pyi index 9451ee15763..1e2bcb77ca5 100644 --- a/python/stubs/feather.pyi +++ b/python/stubs/feather.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import IO, Literal import pandas as pd diff --git a/python/stubs/flight.pyi b/python/stubs/flight.pyi index 9b806ccf305..90c76127b83 100644 --- a/python/stubs/flight.pyi +++ b/python/stubs/flight.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._flight import ( Action, ActionType, diff --git a/python/stubs/fs.pyi b/python/stubs/fs.pyi index 6bf75616c13..34788112092 100644 --- a/python/stubs/fs.pyi +++ b/python/stubs/fs.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._fs import ( # noqa FileSelector, FileType, diff --git a/python/stubs/gandiva.pyi b/python/stubs/gandiva.pyi index a344f885b29..d4f0cdffedc 100644 --- a/python/stubs/gandiva.pyi +++ b/python/stubs/gandiva.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Iterable, Literal from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable diff --git a/python/stubs/interchange/buffer.pyi b/python/stubs/interchange/buffer.pyi index 46673961a75..ecc4abe5e8c 100644 --- a/python/stubs/interchange/buffer.pyi +++ b/python/stubs/interchange/buffer.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import enum from pyarrow.lib import Buffer diff --git a/python/stubs/interchange/column.pyi b/python/stubs/interchange/column.pyi index e6662867b6b..f34d6c418b2 100644 --- a/python/stubs/interchange/column.pyi +++ b/python/stubs/interchange/column.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import enum from typing import Any, Iterable, TypeAlias, TypedDict diff --git a/python/stubs/interchange/dataframe.pyi b/python/stubs/interchange/dataframe.pyi index 526a58926a9..f857ee62f09 100644 --- a/python/stubs/interchange/dataframe.pyi +++ b/python/stubs/interchange/dataframe.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/interchange/from_dataframe.pyi b/python/stubs/interchange/from_dataframe.pyi index b04b6268975..900b2246e3c 100644 --- a/python/stubs/interchange/from_dataframe.pyi +++ b/python/stubs/interchange/from_dataframe.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Any, Protocol, TypeAlias from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table diff --git a/python/stubs/ipc.pyi b/python/stubs/ipc.pyi index c7f2af004d4..b4c0bf5220f 100644 --- a/python/stubs/ipc.pyi +++ b/python/stubs/ipc.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from io import IOBase import pandas as pd diff --git a/python/stubs/json.pyi b/python/stubs/json.pyi index db1d35e0b8b..3545c1e00ee 100644 --- a/python/stubs/json.pyi +++ b/python/stubs/json.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/stubs/lib.pyi b/python/stubs/lib.pyi index 1698b55520b..a00c434ea22 100644 --- a/python/stubs/lib.pyi +++ b/python/stubs/lib.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # ruff: noqa: F403 from typing import NamedTuple diff --git a/python/stubs/orc.pyi b/python/stubs/orc.pyi index 2eba8d40a11..f2659d2a12c 100644 --- a/python/stubs/orc.pyi +++ b/python/stubs/orc.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/pandas_compat.pyi b/python/stubs/pandas_compat.pyi index efbd05ac2fe..e9d7e350d88 100644 --- a/python/stubs/pandas_compat.pyi +++ b/python/stubs/pandas_compat.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from typing import Any, TypedDict, TypeVar import numpy as np diff --git a/python/stubs/parquet/__init__.pyi b/python/stubs/parquet/__init__.pyi index 4ef88705809..9de099c030d 100644 --- a/python/stubs/parquet/__init__.pyi +++ b/python/stubs/parquet/__init__.pyi @@ -1 +1,26 @@ -from .core import * # noqa +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .core import * # noqa diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi index 56b2c8447d9..403b139d606 100644 --- a/python/stubs/parquet/core.pyi +++ b/python/stubs/parquet/core.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys from pathlib import Path diff --git a/python/stubs/parquet/encryption.pyi b/python/stubs/parquet/encryption.pyi index 5a77dae7ef7..562a4905edc 100644 --- a/python/stubs/parquet/encryption.pyi +++ b/python/stubs/parquet/encryption.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._parquet_encryption import ( CryptoFactory, DecryptionConfiguration, diff --git a/python/stubs/substrait.pyi b/python/stubs/substrait.pyi index a56a8a5b40f..6903cdce914 100644 --- a/python/stubs/substrait.pyi +++ b/python/stubs/substrait.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pyarrow._substrait import ( BoundExpressions, SubstraitSchema, diff --git a/python/stubs/types.pyi b/python/stubs/types.pyi index 0cb4f6171d3..e3e840cfe8c 100644 --- a/python/stubs/types.pyi +++ b/python/stubs/types.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import sys from typing import Any diff --git a/python/stubs/util.pyi b/python/stubs/util.pyi index c2ecf7d6b61..b87daca0b54 100644 --- a/python/stubs/util.pyi +++ b/python/stubs/util.pyi @@ -1,3 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2024, ZhengYu, Xu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from collections.abc import Callable from os import PathLike from typing import Any, Protocol, Sequence, TypeVar From 6d3d972ef0bea885d6d0c1396f0b1ece0bece000 Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 12:27:47 +0200 Subject: [PATCH 221/231] Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. 
--- python/stubs/__init__.pyi | 25 ------------------ python/stubs/__lib_pxi/array.pyi | 25 ------------------ python/stubs/__lib_pxi/benchmark.pyi | 25 ------------------ python/stubs/__lib_pxi/builder.pyi | 25 ------------------ python/stubs/__lib_pxi/compat.pyi | 25 ------------------ python/stubs/__lib_pxi/config.pyi | 25 ------------------ python/stubs/__lib_pxi/device.pyi | 25 ------------------ python/stubs/__lib_pxi/error.pyi | 25 ------------------ python/stubs/__lib_pxi/io.pyi | 25 ------------------ python/stubs/__lib_pxi/ipc.pyi | 25 ------------------ python/stubs/__lib_pxi/memory.pyi | 25 ------------------ python/stubs/__lib_pxi/pandas_shim.pyi | 25 ------------------ python/stubs/__lib_pxi/scalar.pyi | 25 ------------------ python/stubs/__lib_pxi/table.pyi | 25 ------------------ python/stubs/__lib_pxi/tensor.pyi | 25 ------------------ python/stubs/__lib_pxi/types.pyi | 25 ------------------ python/stubs/_azurefs.pyi | 25 ------------------ python/stubs/_compute.pyi | 25 ------------------ python/stubs/_csv.pyi | 25 ------------------ python/stubs/_cuda.pyi | 25 ------------------ python/stubs/_dataset.pyi | 25 ------------------ python/stubs/_dataset_orc.pyi | 25 ------------------ python/stubs/_dataset_parquet.pyi | 25 ------------------ python/stubs/_dataset_parquet_encryption.pyi | 25 ------------------ python/stubs/_feather.pyi | 25 ------------------ python/stubs/_flight.pyi | 25 ------------------ python/stubs/_fs.pyi | 25 ------------------ python/stubs/_gcsfs.pyi | 25 ------------------ python/stubs/_hdfs.pyi | 25 ------------------ python/stubs/_json.pyi | 25 ------------------ python/stubs/_orc.pyi | 25 ------------------ python/stubs/_parquet.pyi | 25 ------------------ python/stubs/_parquet_encryption.pyi | 25 ------------------ python/stubs/_s3fs.pyi | 25 ------------------ python/stubs/_stubs_typing.pyi | 25 ------------------ python/stubs/_substrait.pyi | 25 ------------------ python/stubs/acero.pyi | 25 ------------------ python/stubs/benchmark.pyi | 25 ------------------ python/stubs/cffi.pyi | 25 ------------------ python/stubs/compute.pyi | 25 ------------------ python/stubs/csv.pyi | 25 ------------------ python/stubs/cuda.pyi | 25 ------------------ python/stubs/dataset.pyi | 25 ------------------ python/stubs/feather.pyi | 25 ------------------ python/stubs/flight.pyi | 25 ------------------ python/stubs/fs.pyi | 25 ------------------ python/stubs/gandiva.pyi | 25 ------------------ python/stubs/interchange/buffer.pyi | 25 ------------------ python/stubs/interchange/column.pyi | 25 ------------------ python/stubs/interchange/dataframe.pyi | 25 ------------------ python/stubs/interchange/from_dataframe.pyi | 25 ------------------ python/stubs/ipc.pyi | 25 ------------------ python/stubs/json.pyi | 25 ------------------ python/stubs/lib.pyi | 25 ------------------ python/stubs/orc.pyi | 25 ------------------ python/stubs/pandas_compat.pyi | 25 ------------------ python/stubs/parquet/__init__.pyi | 27 +------------------- python/stubs/parquet/core.pyi | 25 ------------------ python/stubs/parquet/encryption.pyi | 25 ------------------ python/stubs/substrait.pyi | 25 ------------------ python/stubs/types.pyi | 25 ------------------ python/stubs/util.pyi | 25 ------------------ 62 files changed, 1 insertion(+), 1551 deletions(-) diff --git a/python/stubs/__init__.pyi b/python/stubs/__init__.pyi index 6567e3221c4..8a0d1e870c5 100644 --- a/python/stubs/__init__.pyi +++ b/python/stubs/__init__.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 
2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # ruff: noqa: F401, I001, E402 __version__: str diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index 030ece2ab75..ec1cda30a88 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/benchmark.pyi b/python/stubs/__lib_pxi/benchmark.pyi index 0d0f88cc201..66981bf0f51 100644 --- a/python/stubs/__lib_pxi/benchmark.pyi +++ b/python/stubs/__lib_pxi/benchmark.pyi @@ -1,26 +1 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/stubs/__lib_pxi/builder.pyi b/python/stubs/__lib_pxi/builder.pyi index 7a2a9d24827..4a0e9ca4708 100644 --- a/python/stubs/__lib_pxi/builder.pyi +++ b/python/stubs/__lib_pxi/builder.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Iterable from pyarrow.lib import MemoryPool, _Weakrefable diff --git a/python/stubs/__lib_pxi/compat.pyi b/python/stubs/__lib_pxi/compat.pyi index 0011c1507cb..ae667be453e 100644 --- a/python/stubs/__lib_pxi/compat.pyi +++ b/python/stubs/__lib_pxi/compat.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - def encode_file_path(path: str | bytes) -> bytes: ... def tobytes(o: str | bytes) -> bytes: ... def frombytes(o: bytes, *, safe: bool = False): ... diff --git a/python/stubs/__lib_pxi/config.pyi b/python/stubs/__lib_pxi/config.pyi index aecf0088e4e..166e10c9734 100644 --- a/python/stubs/__lib_pxi/config.pyi +++ b/python/stubs/__lib_pxi/config.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import NamedTuple class VersionInfo(NamedTuple): diff --git a/python/stubs/__lib_pxi/device.pyi b/python/stubs/__lib_pxi/device.pyi index 9dd8d889476..d1b9f39eedd 100644 --- a/python/stubs/__lib_pxi/device.pyi +++ b/python/stubs/__lib_pxi/device.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import enum from pyarrow.lib import _Weakrefable diff --git a/python/stubs/__lib_pxi/error.pyi b/python/stubs/__lib_pxi/error.pyi index 6e3fca3c5aa..981ed51e680 100644 --- a/python/stubs/__lib_pxi/error.pyi +++ b/python/stubs/__lib_pxi/error.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi index a35d3b0c7c2..d882fd79d57 100644 --- a/python/stubs/__lib_pxi/io.pyi +++ b/python/stubs/__lib_pxi/io.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys from collections.abc import Callable diff --git a/python/stubs/__lib_pxi/ipc.pyi b/python/stubs/__lib_pxi/ipc.pyi index aa071e266d0..3d72892061e 100644 --- a/python/stubs/__lib_pxi/ipc.pyi +++ b/python/stubs/__lib_pxi/ipc.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import enum import sys diff --git a/python/stubs/__lib_pxi/memory.pyi b/python/stubs/__lib_pxi/memory.pyi index a4fdaa66136..57a3bb4f1b3 100644 --- a/python/stubs/__lib_pxi/memory.pyi +++ b/python/stubs/__lib_pxi/memory.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow.lib import _Weakrefable class MemoryPool(_Weakrefable): diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi index cb7f2a590a4..0e80fae4ebf 100644 --- a/python/stubs/__lib_pxi/pandas_shim.pyi +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from types import ModuleType from typing import Any, Iterable, TypeGuard diff --git a/python/stubs/__lib_pxi/scalar.pyi b/python/stubs/__lib_pxi/scalar.pyi index ce0e6edccaf..81ab5012067 100644 --- a/python/stubs/__lib_pxi/scalar.pyi +++ b/python/stubs/__lib_pxi/scalar.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import collections.abc import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index 4bf090f1e8a..ad9d0392137 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt import sys diff --git a/python/stubs/__lib_pxi/tensor.pyi b/python/stubs/__lib_pxi/tensor.pyi index e6883a0dfcd..d849abd0f1f 100644 --- a/python/stubs/__lib_pxi/tensor.pyi +++ b/python/stubs/__lib_pxi/tensor.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index f22c03faa4c..7fe6c36e332 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt import sys diff --git a/python/stubs/_azurefs.pyi b/python/stubs/_azurefs.pyi index 79bb7a2e8cd..317943ce20f 100644 --- a/python/stubs/_azurefs.pyi +++ b/python/stubs/_azurefs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Literal from ._fs import FileSystem diff --git a/python/stubs/_compute.pyi b/python/stubs/_compute.pyi index 4c2cb434f84..3d61ae42787 100644 --- a/python/stubs/_compute.pyi +++ b/python/stubs/_compute.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import ( Any, Callable, diff --git a/python/stubs/_csv.pyi b/python/stubs/_csv.pyi index 7fc06a62a3d..2f49f8c9a6c 100644 --- a/python/stubs/_csv.pyi +++ b/python/stubs/_csv.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from dataclasses import dataclass, field from typing import IO, Any, Callable, Literal diff --git a/python/stubs/_cuda.pyi b/python/stubs/_cuda.pyi index 44ef7d13ff1..ad52b2f380f 100644 --- a/python/stubs/_cuda.pyi +++ b/python/stubs/_cuda.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Any import cuda # type: ignore[import-not-found] diff --git a/python/stubs/_dataset.pyi b/python/stubs/_dataset.pyi index deddde37086..af864f9154b 100644 --- a/python/stubs/_dataset.pyi +++ b/python/stubs/_dataset.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/_dataset_orc.pyi b/python/stubs/_dataset_orc.pyi index 453779cd15d..9c4ac04198f 100644 --- a/python/stubs/_dataset_orc.pyi +++ b/python/stubs/_dataset_orc.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from ._dataset import FileFormat class OrcFileFormat(FileFormat): diff --git a/python/stubs/_dataset_parquet.pyi b/python/stubs/_dataset_parquet.pyi index 64f3ae0a5b0..cbcc17235f1 100644 --- a/python/stubs/_dataset_parquet.pyi +++ b/python/stubs/_dataset_parquet.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from dataclasses import dataclass from typing import IO, Any, Iterable, TypedDict diff --git a/python/stubs/_dataset_parquet_encryption.pyi b/python/stubs/_dataset_parquet_encryption.pyi index c2bd650db61..7623275b865 100644 --- a/python/stubs/_dataset_parquet_encryption.pyi +++ b/python/stubs/_dataset_parquet_encryption.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions from ._parquet import FileDecryptionProperties from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig diff --git a/python/stubs/_feather.pyi b/python/stubs/_feather.pyi index 2ee7db77e45..8bb914ba45d 100644 --- a/python/stubs/_feather.pyi +++ b/python/stubs/_feather.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import IO from _typeshed import StrPath diff --git a/python/stubs/_flight.pyi b/python/stubs/_flight.pyi index 6802218e944..4450c42df49 100644 --- a/python/stubs/_flight.pyi +++ b/python/stubs/_flight.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import asyncio import enum import sys diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 35f04222dd4..7670ef5230d 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt import enum import sys diff --git a/python/stubs/_gcsfs.pyi b/python/stubs/_gcsfs.pyi index c2d554273cb..4fc7ea68e48 100644 --- a/python/stubs/_gcsfs.pyi +++ b/python/stubs/_gcsfs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt from ._fs import FileSystem diff --git a/python/stubs/_hdfs.pyi b/python/stubs/_hdfs.pyi index ec5c9e8b9ad..200f669379b 100644 --- a/python/stubs/_hdfs.pyi +++ b/python/stubs/_hdfs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from _typeshed import StrPath from ._fs import FileSystem diff --git a/python/stubs/_json.pyi b/python/stubs/_json.pyi index 52dac59b7bb..43d2ae83cd8 100644 --- a/python/stubs/_json.pyi +++ b/python/stubs/_json.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import IO, Any, Literal from _typeshed import StrPath diff --git a/python/stubs/_orc.pyi b/python/stubs/_orc.pyi index d80e1720d51..71bf0dde9ba 100644 --- a/python/stubs/_orc.pyi +++ b/python/stubs/_orc.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import IO, Literal from .lib import ( diff --git a/python/stubs/_parquet.pyi b/python/stubs/_parquet.pyi index 439fdec47b4..a9187df0428 100644 --- a/python/stubs/_parquet.pyi +++ b/python/stubs/_parquet.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict from _typeshed import StrPath diff --git a/python/stubs/_parquet_encryption.pyi b/python/stubs/_parquet_encryption.pyi index 68b3eac87e5..c707edb844a 100644 --- a/python/stubs/_parquet_encryption.pyi +++ b/python/stubs/_parquet_encryption.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt from typing import Callable diff --git a/python/stubs/_s3fs.pyi b/python/stubs/_s3fs.pyi index 64f6f37ab75..fc13c498bd9 100644 --- a/python/stubs/_s3fs.pyi +++ b/python/stubs/_s3fs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import enum from typing import Literal, NotRequired, Required, TypedDict diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi index 980d1aaa4bc..c259513f1ea 100644 --- a/python/stubs/_stubs_typing.pyi +++ b/python/stubs/_stubs_typing.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import datetime as dt from collections.abc import Sequence diff --git a/python/stubs/_substrait.pyi b/python/stubs/_substrait.pyi index 309c08c1e07..ff226e9521b 100644 --- a/python/stubs/_substrait.pyi +++ b/python/stubs/_substrait.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Any, Callable from ._compute import Expression diff --git a/python/stubs/acero.pyi b/python/stubs/acero.pyi index cd4675e7010..8a520bdc24a 100644 --- a/python/stubs/acero.pyi +++ b/python/stubs/acero.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/benchmark.pyi b/python/stubs/benchmark.pyi index 0e5141f3a19..048973301dc 100644 --- a/python/stubs/benchmark.pyi +++ b/python/stubs/benchmark.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow.lib import benchmark_PandasObjectIsNull __all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/stubs/cffi.pyi b/python/stubs/cffi.pyi index 6b437fb5a2f..2ae945c5974 100644 --- a/python/stubs/cffi.pyi +++ b/python/stubs/cffi.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import cffi c_source: str diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index b7ba840981b..8d8fc35b134 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # ruff: noqa: I001 from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence from collections.abc import Callable diff --git a/python/stubs/csv.pyi b/python/stubs/csv.pyi index c6ff540adb5..510229d7e72 100644 --- a/python/stubs/csv.pyi +++ b/python/stubs/csv.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._csv import ( ISO8601, ConvertOptions, diff --git a/python/stubs/cuda.pyi b/python/stubs/cuda.pyi index 491f9d76581..e11baf7d4e7 100644 --- a/python/stubs/cuda.pyi +++ b/python/stubs/cuda.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._cuda import ( BufferReader, BufferWriter, diff --git a/python/stubs/dataset.pyi b/python/stubs/dataset.pyi index b3002695e8c..98f1a38aa85 100644 --- a/python/stubs/dataset.pyi +++ b/python/stubs/dataset.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload from _typeshed import StrPath diff --git a/python/stubs/feather.pyi b/python/stubs/feather.pyi index 1e2bcb77ca5..9451ee15763 100644 --- a/python/stubs/feather.pyi +++ b/python/stubs/feather.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import IO, Literal import pandas as pd diff --git a/python/stubs/flight.pyi b/python/stubs/flight.pyi index 90c76127b83..9b806ccf305 100644 --- a/python/stubs/flight.pyi +++ b/python/stubs/flight.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._flight import ( Action, ActionType, diff --git a/python/stubs/fs.pyi b/python/stubs/fs.pyi index 34788112092..6bf75616c13 100644 --- a/python/stubs/fs.pyi +++ b/python/stubs/fs.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._fs import ( # noqa FileSelector, FileType, diff --git a/python/stubs/gandiva.pyi b/python/stubs/gandiva.pyi index d4f0cdffedc..a344f885b29 100644 --- a/python/stubs/gandiva.pyi +++ b/python/stubs/gandiva.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Iterable, Literal from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable diff --git a/python/stubs/interchange/buffer.pyi b/python/stubs/interchange/buffer.pyi index ecc4abe5e8c..46673961a75 100644 --- a/python/stubs/interchange/buffer.pyi +++ b/python/stubs/interchange/buffer.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import enum from pyarrow.lib import Buffer diff --git a/python/stubs/interchange/column.pyi b/python/stubs/interchange/column.pyi index f34d6c418b2..e6662867b6b 100644 --- a/python/stubs/interchange/column.pyi +++ b/python/stubs/interchange/column.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import enum from typing import Any, Iterable, TypeAlias, TypedDict diff --git a/python/stubs/interchange/dataframe.pyi b/python/stubs/interchange/dataframe.pyi index f857ee62f09..526a58926a9 100644 --- a/python/stubs/interchange/dataframe.pyi +++ b/python/stubs/interchange/dataframe.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/interchange/from_dataframe.pyi b/python/stubs/interchange/from_dataframe.pyi index 900b2246e3c..b04b6268975 100644 --- a/python/stubs/interchange/from_dataframe.pyi +++ b/python/stubs/interchange/from_dataframe.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Any, Protocol, TypeAlias from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table diff --git a/python/stubs/ipc.pyi b/python/stubs/ipc.pyi index b4c0bf5220f..c7f2af004d4 100644 --- a/python/stubs/ipc.pyi +++ b/python/stubs/ipc.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from io import IOBase import pandas as pd diff --git a/python/stubs/json.pyi b/python/stubs/json.pyi index 3545c1e00ee..db1d35e0b8b 100644 --- a/python/stubs/json.pyi +++ b/python/stubs/json.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/stubs/lib.pyi b/python/stubs/lib.pyi index a00c434ea22..1698b55520b 100644 --- a/python/stubs/lib.pyi +++ b/python/stubs/lib.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # ruff: noqa: F403 from typing import NamedTuple diff --git a/python/stubs/orc.pyi b/python/stubs/orc.pyi index f2659d2a12c..2eba8d40a11 100644 --- a/python/stubs/orc.pyi +++ b/python/stubs/orc.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys if sys.version_info >= (3, 11): diff --git a/python/stubs/pandas_compat.pyi b/python/stubs/pandas_compat.pyi index e9d7e350d88..efbd05ac2fe 100644 --- a/python/stubs/pandas_compat.pyi +++ b/python/stubs/pandas_compat.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from typing import Any, TypedDict, TypeVar import numpy as np diff --git a/python/stubs/parquet/__init__.pyi b/python/stubs/parquet/__init__.pyi index 9de099c030d..4ef88705809 100644 --- a/python/stubs/parquet/__init__.pyi +++ b/python/stubs/parquet/__init__.pyi @@ -1,26 +1 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from .core import * # noqa +from .core import * # noqa diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi index 403b139d606..56b2c8447d9 100644 --- a/python/stubs/parquet/core.pyi +++ b/python/stubs/parquet/core.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys from pathlib import Path diff --git a/python/stubs/parquet/encryption.pyi b/python/stubs/parquet/encryption.pyi index 562a4905edc..5a77dae7ef7 100644 --- a/python/stubs/parquet/encryption.pyi +++ b/python/stubs/parquet/encryption.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._parquet_encryption import ( CryptoFactory, DecryptionConfiguration, diff --git a/python/stubs/substrait.pyi b/python/stubs/substrait.pyi index 6903cdce914..a56a8a5b40f 100644 --- a/python/stubs/substrait.pyi +++ b/python/stubs/substrait.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from pyarrow._substrait import ( BoundExpressions, SubstraitSchema, diff --git a/python/stubs/types.pyi b/python/stubs/types.pyi index e3e840cfe8c..0cb4f6171d3 100644 --- a/python/stubs/types.pyi +++ b/python/stubs/types.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import sys from typing import Any diff --git a/python/stubs/util.pyi b/python/stubs/util.pyi index b87daca0b54..c2ecf7d6b61 100644 --- a/python/stubs/util.pyi +++ b/python/stubs/util.pyi @@ -1,28 +1,3 @@ -# BSD 2-Clause License -# -# Copyright (c) 2024, ZhengYu, Xu -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - from collections.abc import Callable from os import PathLike from typing import Any, Protocol, Sequence, TypeVar From b919150657fc724618551ae2f67e8afd6691a1ac Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sun, 20 Jul 2025 12:34:32 +0200 Subject: [PATCH 222/231] Prepare for licence merging --- .github/FUNDING.yml | 3 - .github/workflows/lint.yaml | 37 - .github/workflows/release.yaml | 26 - .gitignore | 138 - .pre-commit-config.yaml | 33 - CODE_OF_CONDUCT.md | 128 - README.md | 0 pixi.lock | 1870 ---- pyarrow-stubs/__init__.pyi | 656 -- pyarrow-stubs/__lib_pxi/__init__.pyi | 0 pyarrow-stubs/__lib_pxi/array.pyi | 4274 --------- pyarrow-stubs/__lib_pxi/benchmark.pyi | 1 - pyarrow-stubs/__lib_pxi/builder.pyi | 89 - pyarrow-stubs/__lib_pxi/compat.pyi | 5 - pyarrow-stubs/__lib_pxi/config.pyi | 41 - pyarrow-stubs/__lib_pxi/device.pyi | 88 - pyarrow-stubs/__lib_pxi/error.pyi | 53 - pyarrow-stubs/__lib_pxi/io.pyi | 1474 ---- pyarrow-stubs/__lib_pxi/ipc.pyi | 705 -- pyarrow-stubs/__lib_pxi/memory.pyi | 174 - pyarrow-stubs/__lib_pxi/pandas_shim.pyi | 51 - pyarrow-stubs/__lib_pxi/scalar.pyi | 1017 --- pyarrow-stubs/__lib_pxi/table.pyi | 5609 ------------ pyarrow-stubs/__lib_pxi/tensor.pyi | 688 -- pyarrow-stubs/__lib_pxi/types.pyi | 4413 ---------- pyarrow-stubs/_azurefs.pyi | 74 - pyarrow-stubs/_compute.pyi | 1721 ---- pyarrow-stubs/_csv.pyi | 641 -- pyarrow-stubs/_cuda.pyi | 556 -- pyarrow-stubs/_dataset.pyi | 2299 ----- pyarrow-stubs/_dataset_orc.pyi | 6 - pyarrow-stubs/_dataset_parquet.pyi | 314 - pyarrow-stubs/_dataset_parquet_encryption.pyi | 85 - pyarrow-stubs/_feather.pyi | 29 - pyarrow-stubs/_flight.pyi | 1380 --- pyarrow-stubs/_fs.pyi | 1005 --- pyarrow-stubs/_gcsfs.pyi | 83 - pyarrow-stubs/_hdfs.pyi | 75 - pyarrow-stubs/_json.pyi | 169 - pyarrow-stubs/_orc.pyi | 56 - pyarrow-stubs/_parquet.pyi | 445 - pyarrow-stubs/_parquet_encryption.pyi | 67 - pyarrow-stubs/_s3fs.pyi | 74 - pyarrow-stubs/_stubs_typing.pyi | 80 - pyarrow-stubs/_substrait.pyi | 39 - pyarrow-stubs/acero.pyi | 85 - pyarrow-stubs/benchmark.pyi | 3 - pyarrow-stubs/cffi.pyi | 4 - pyarrow-stubs/compute.pyi | 7779 ----------------- pyarrow-stubs/csv.pyi | 27 - pyarrow-stubs/cuda.pyi | 25 - pyarrow-stubs/dataset.pyi | 229 - pyarrow-stubs/feather.pyi | 50 - pyarrow-stubs/flight.pyi | 95 - pyarrow-stubs/fs.pyi | 77 - pyarrow-stubs/gandiva.pyi | 65 - pyarrow-stubs/interchange/__init__.pyi | 0 pyarrow-stubs/interchange/buffer.pyi | 58 - pyarrow-stubs/interchange/column.pyi | 252 - pyarrow-stubs/interchange/dataframe.pyi | 102 - pyarrow-stubs/interchange/from_dataframe.pyi | 244 - pyarrow-stubs/ipc.pyi | 123 - pyarrow-stubs/json.pyi | 3 - pyarrow-stubs/lib.pyi | 106 - pyarrow-stubs/orc.pyi | 279 - pyarrow-stubs/pandas_compat.pyi | 54 - pyarrow-stubs/parquet/__init__.pyi | 1 - pyarrow-stubs/parquet/core.pyi | 2061 ----- pyarrow-stubs/parquet/encryption.pyi | 15 - pyarrow-stubs/substrait.pyi | 21 - pyarrow-stubs/types.pyi | 194 - pyarrow-stubs/util.pyi | 27 - pyproject.toml | 99 - LICENSE => python/stubs/LICENSE | 0 taplo.toml | 5 - 75 files changed, 42854 deletions(-) delete mode 100644 .github/FUNDING.yml delete mode 100644 .github/workflows/lint.yaml delete mode 100644 .github/workflows/release.yaml delete mode 100644 .gitignore delete mode 100644 .pre-commit-config.yaml delete mode 100644 CODE_OF_CONDUCT.md delete mode 100644 README.md delete mode 100644 pixi.lock delete mode 100644 pyarrow-stubs/__init__.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/__init__.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/array.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/benchmark.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/builder.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/compat.pyi delete mode 100644 
pyarrow-stubs/__lib_pxi/config.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/device.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/error.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/io.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/ipc.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/memory.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/pandas_shim.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/scalar.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/table.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/tensor.pyi delete mode 100644 pyarrow-stubs/__lib_pxi/types.pyi delete mode 100644 pyarrow-stubs/_azurefs.pyi delete mode 100644 pyarrow-stubs/_compute.pyi delete mode 100644 pyarrow-stubs/_csv.pyi delete mode 100644 pyarrow-stubs/_cuda.pyi delete mode 100644 pyarrow-stubs/_dataset.pyi delete mode 100644 pyarrow-stubs/_dataset_orc.pyi delete mode 100644 pyarrow-stubs/_dataset_parquet.pyi delete mode 100644 pyarrow-stubs/_dataset_parquet_encryption.pyi delete mode 100644 pyarrow-stubs/_feather.pyi delete mode 100644 pyarrow-stubs/_flight.pyi delete mode 100644 pyarrow-stubs/_fs.pyi delete mode 100644 pyarrow-stubs/_gcsfs.pyi delete mode 100644 pyarrow-stubs/_hdfs.pyi delete mode 100644 pyarrow-stubs/_json.pyi delete mode 100644 pyarrow-stubs/_orc.pyi delete mode 100644 pyarrow-stubs/_parquet.pyi delete mode 100644 pyarrow-stubs/_parquet_encryption.pyi delete mode 100644 pyarrow-stubs/_s3fs.pyi delete mode 100644 pyarrow-stubs/_stubs_typing.pyi delete mode 100644 pyarrow-stubs/_substrait.pyi delete mode 100644 pyarrow-stubs/acero.pyi delete mode 100644 pyarrow-stubs/benchmark.pyi delete mode 100644 pyarrow-stubs/cffi.pyi delete mode 100644 pyarrow-stubs/compute.pyi delete mode 100644 pyarrow-stubs/csv.pyi delete mode 100644 pyarrow-stubs/cuda.pyi delete mode 100644 pyarrow-stubs/dataset.pyi delete mode 100644 pyarrow-stubs/feather.pyi delete mode 100644 pyarrow-stubs/flight.pyi delete mode 100644 pyarrow-stubs/fs.pyi delete mode 100644 pyarrow-stubs/gandiva.pyi delete mode 100644 pyarrow-stubs/interchange/__init__.pyi delete mode 100644 pyarrow-stubs/interchange/buffer.pyi delete mode 100644 pyarrow-stubs/interchange/column.pyi delete mode 100644 pyarrow-stubs/interchange/dataframe.pyi delete mode 100644 pyarrow-stubs/interchange/from_dataframe.pyi delete mode 100644 pyarrow-stubs/ipc.pyi delete mode 100644 pyarrow-stubs/json.pyi delete mode 100644 pyarrow-stubs/lib.pyi delete mode 100644 pyarrow-stubs/orc.pyi delete mode 100644 pyarrow-stubs/pandas_compat.pyi delete mode 100644 pyarrow-stubs/parquet/__init__.pyi delete mode 100644 pyarrow-stubs/parquet/core.pyi delete mode 100644 pyarrow-stubs/parquet/encryption.pyi delete mode 100644 pyarrow-stubs/substrait.pyi delete mode 100644 pyarrow-stubs/types.pyi delete mode 100644 pyarrow-stubs/util.pyi delete mode 100644 pyproject.toml rename LICENSE => python/stubs/LICENSE (100%) delete mode 100644 taplo.toml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index e78fdc9b020..00000000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1,3 +0,0 @@ -# These are supported funding model platforms - -github: [zen-xu] diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml deleted file mode 100644 index 7f437d4532c..00000000000 --- a/.github/workflows/lint.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint - -on: - push: - branches: - - main - pull_request: - types: - - opened - - synchronize - -jobs: - taplo: - name: taplo - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - uses: 
uncenter/setup-taplo@v1 - with: - version: "0.9.3" - - run: taplo fmt --check - - pyright: - name: pyright - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: "3.11" - cache: pip - - run: | - python -m venv .venv - source .venv/bin/activate - pip install pandas numpy scipy sparse - - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH - - uses: jakebailey/pyright-action@v2 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index ec40186f30d..00000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,26 +0,0 @@ -name: Release - -on: - push: - tags: - - "*" - -jobs: - release: - name: "release ${{github.ref_name}}" - runs-on: ubuntu-latest - environment: - name: Release - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - name: Install hatch - run: | - python -m pip install hatch - - name: Build dist - run: hatch build - - name: Publish on PyPI - run: hatch publish -u __token__ -a ${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore deleted file mode 100644 index e3f1b4ea3b2..00000000000 --- a/.gitignore +++ /dev/null @@ -1,138 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -.pixi/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# PyCharm project settings -.idea/ - -# VSCode project settings -.vscode/ - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ -/poetry.lock -.idea/**/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index fa58a732dfd..00000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,33 +0,0 @@ -ci: - autofix_prs: false - skip: [pyright] - -default_language_version: - python: python3.11 - node: 23.9.0 - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-merge-conflict - - id: check-case-conflict - - id: check-toml - - id: check-yaml - - id: check-ast - - id: debug-statements - - id: check-docstring-first - - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.3 - hooks: - - id: ruff - args: [--fix] - - id: ruff-format - - - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.403 - hooks: - - id: pyright diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index fd680cbde25..00000000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,128 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, religion, or sexual identity -and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the - overall community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or - advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email - address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, -or harmful. 
- -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at -. -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series -of actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. Violating these terms may lead to a temporary or -permanent ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within -the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.0, available at -. - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see the FAQ at -. Translations are available at -. 
diff --git a/README.md b/README.md deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/pixi.lock b/pixi.lock deleted file mode 100644 index 033527552a9..00000000000 --- a/pixi.lock +++ /dev/null @@ -1,1870 +0,0 @@ -version: 6 -environments: - default: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.2-hee588c1_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.12-h9e4cc4f_0_cpython.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: ./ - osx-64: - - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.7.0-h240833e_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.6-h281671d_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.1-hd471939_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.49.2-hdb6dae5_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda - - 
conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.5.0-hc426f3f_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.12-h9ccd52b_0_cpython.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - 
- pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: ./ - osx-arm64: - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.7.0-h286801f_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.6-h1da3d7d_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.1-h39f12f2_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.49.2-h3f77e49_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.5.0-h81ee809_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.12-hc22306f_0_cpython.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: ./ - win-64: - - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.0-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.6-h537db12_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.49.2-h67fdade_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.5.0-ha4e3fda_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.12-h3f84c4b_0_cpython.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - - pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: ./ -packages: -- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 - md5: d7c89558ba9fa0495403155b64376d81 - license: None - purls: [] - size: 2562 - timestamp: 1578324546067 -- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - build_number: 16 - sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 - md5: 73aaf86a425cc6e73fcf236a5a46396d - depends: - - _libgcc_mutex 0.1 conda_forge - - libgomp >=7.5.0 - constrains: - - openmp_impl 9999 - license: BSD-3-Clause - license_family: BSD - purls: [] - size: 23621 - timestamp: 1650670423406 -- pypi: https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl - name: asttokens - version: 2.4.1 - sha256: 051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 - requires_dist: - - six>=1.12.0 - - typing ; python_full_version < '3.5' - - astroid>=1,<2 ; python_full_version < '3' and extra == 'astroid' - - astroid>=2,<4 ; python_full_version >= '3' and extra == 'astroid' - - pytest ; extra == 'test' - - astroid>=1,<2 ; python_full_version < '3' and extra == 'test' - - astroid>=2,<4 ; python_full_version >= '3' and extra == 'test' -- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - sha256: 5ced96500d945fb286c9c838e54fa759aa04a7129c59800f0846b4335cee770d - md5: 62ee74e96c5ebb0af99386de58cf9553 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc-ng >=12 - license: bzip2-1.0.6 - license_family: BSD - purls: [] - size: 252783 - timestamp: 1720974456583 -- conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda - sha256: cad153608b81fb24fc8c509357daa9ae4e49dfc535b2cb49b91e23dbd68fc3c5 - md5: 7ed4301d437b59045be7e051a0308211 - depends: - - __osx >=10.13 - license: bzip2-1.0.6 - license_family: BSD - purls: [] - size: 134188 - timestamp: 1720974491916 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h99b78c6_7.conda - sha256: adfa71f158cbd872a36394c56c3568e6034aa55c623634b37a4836bd036e6b91 - md5: fc6948412dbbbe9a4c9ddbbcfe0a79ab - depends: - - __osx >=11.0 - license: bzip2-1.0.6 - license_family: BSD - purls: [] - size: 122909 - timestamp: 1720974522888 -- conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda - sha256: 35a5dad92e88fdd7fc405e864ec239486f4f31eec229e31686e61a140a8e573b - md5: 276e7ffe9ffe39688abc665ef0f45596 - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: bzip2-1.0.6 - license_family: BSD - purls: [] - size: 54927 - timestamp: 1720974860185 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - sha256: afee721baa6d988e27fef1832f68d6f32ac8cc99cdf6015732224c2841a09cea - md5: c27d1c142233b5bc9ca570c6e2e0c244 - license: ISC - purls: [] - size: 159003 - timestamp: 1725018903918 -- conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.8.30-h8857fd0_0.conda - sha256: 593f302d0f44c2c771e1614ee6d56fffdc7d616e6f187669c8b0e34ffce3e1ae - md5: b7e5424e7f06547a903d28e4651dbb21 - license: ISC - purls: [] - size: 158665 - timestamp: 1725019059295 -- conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.8.30-hf0a4a13_0.conda - sha256: 2db1733f4b644575dbbdd7994a8f338e6ef937f5ebdb74acd557e9dda0211709 - md5: 40dec13fd8348dbe303e57be74bd3d35 - license: ISC - purls: [] - size: 158482 - timestamp: 1725019034582 -- conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.8.30-h56e8100_0.conda - sha256: 0fcac3a7ffcc556649e034a1802aedf795e64227eaa7194d207b01eaf26454c4 - md5: 4c4fd67c18619be5aa65dc5b6c72e490 - license: ISC - purls: [] - size: 158773 - timestamp: 1725019107649 -- pypi: https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl - name: cfgv - version: 3.4.0 - sha256: b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl - name: colorama - version: 0.4.6 - sha256: 4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' -- pypi: https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl - name: decorator - version: 5.1.1 - sha256: b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 - requires_python: '>=3.5' -- pypi: https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl - name: distlib - version: 0.3.9 - sha256: 47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 -- pypi: https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl - name: executing - version: 2.1.0 - sha256: 8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf - requires_dist: - - asttokens>=2.1.0 ; extra == 'tests' - - ipython ; extra == 'tests' - - pytest ; extra == 'tests' - - coverage ; extra == 'tests' - - coverage-enable-subprocess ; extra == 'tests' - - littleutils ; extra == 'tests' - - rich ; python_full_version >= '3.11' and extra == 'tests' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl - name: filelock - version: 3.16.1 - sha256: 2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 - requires_dist: - - furo>=2024.8.6 ; extra == 'docs' - - sphinx-autodoc-typehints>=2.4.1 ; extra == 'docs' - - sphinx>=8.0.2 ; extra == 'docs' - - covdefaults>=2.3 ; extra == 'testing' - - coverage>=7.6.1 ; extra == 'testing' - - diff-cover>=9.2 ; extra == 'testing' - - pytest-asyncio>=0.24 ; extra == 'testing' - - pytest-cov>=5 ; extra == 'testing' - - pytest-mock>=3.14 ; extra == 'testing' - - pytest-timeout>=2.3.1 ; extra == 'testing' - - pytest>=8.3.3 ; extra == 'testing' - - virtualenv>=20.26.4 ; extra == 'testing' - - typing-extensions>=4.12.2 ; python_full_version < '3.11' and extra == 'typing' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl - name: fsspec - version: 2024.10.0 - sha256: 03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 - requires_dist: - - adlfs ; extra == 'abfs' - - adlfs ; extra == 'adl' - - 
pyarrow>=1 ; extra == 'arrow' - - dask ; extra == 'dask' - - distributed ; extra == 'dask' - - pre-commit ; extra == 'dev' - - ruff ; extra == 'dev' - - numpydoc ; extra == 'doc' - - sphinx ; extra == 'doc' - - sphinx-design ; extra == 'doc' - - sphinx-rtd-theme ; extra == 'doc' - - yarl ; extra == 'doc' - - dropbox ; extra == 'dropbox' - - dropboxdrivefs ; extra == 'dropbox' - - requests ; extra == 'dropbox' - - adlfs ; extra == 'full' - - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'full' - - dask ; extra == 'full' - - distributed ; extra == 'full' - - dropbox ; extra == 'full' - - dropboxdrivefs ; extra == 'full' - - fusepy ; extra == 'full' - - gcsfs ; extra == 'full' - - libarchive-c ; extra == 'full' - - ocifs ; extra == 'full' - - panel ; extra == 'full' - - paramiko ; extra == 'full' - - pyarrow>=1 ; extra == 'full' - - pygit2 ; extra == 'full' - - requests ; extra == 'full' - - s3fs ; extra == 'full' - - smbprotocol ; extra == 'full' - - tqdm ; extra == 'full' - - fusepy ; extra == 'fuse' - - gcsfs ; extra == 'gcs' - - pygit2 ; extra == 'git' - - requests ; extra == 'github' - - gcsfs ; extra == 'gs' - - panel ; extra == 'gui' - - pyarrow>=1 ; extra == 'hdfs' - - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'http' - - libarchive-c ; extra == 'libarchive' - - ocifs ; extra == 'oci' - - s3fs ; extra == 's3' - - paramiko ; extra == 'sftp' - - smbprotocol ; extra == 'smb' - - paramiko ; extra == 'ssh' - - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test' - - numpy ; extra == 'test' - - pytest ; extra == 'test' - - pytest-asyncio!=0.22.0 ; extra == 'test' - - pytest-benchmark ; extra == 'test' - - pytest-cov ; extra == 'test' - - pytest-mock ; extra == 'test' - - pytest-recording ; extra == 'test' - - pytest-rerunfailures ; extra == 'test' - - requests ; extra == 'test' - - aiobotocore>=2.5.4,<3.0.0 ; extra == 'test-downstream' - - dask-expr ; extra == 'test-downstream' - - dask[dataframe,test] ; extra == 'test-downstream' - - moto[server]>4,<5 ; extra == 'test-downstream' - - pytest-timeout ; extra == 'test-downstream' - - xarray ; extra == 'test-downstream' - - adlfs ; extra == 'test-full' - - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test-full' - - cloudpickle ; extra == 'test-full' - - dask ; extra == 'test-full' - - distributed ; extra == 'test-full' - - dropbox ; extra == 'test-full' - - dropboxdrivefs ; extra == 'test-full' - - fastparquet ; extra == 'test-full' - - fusepy ; extra == 'test-full' - - gcsfs ; extra == 'test-full' - - jinja2 ; extra == 'test-full' - - kerchunk ; extra == 'test-full' - - libarchive-c ; extra == 'test-full' - - lz4 ; extra == 'test-full' - - notebook ; extra == 'test-full' - - numpy ; extra == 'test-full' - - ocifs ; extra == 'test-full' - - pandas ; extra == 'test-full' - - panel ; extra == 'test-full' - - paramiko ; extra == 'test-full' - - pyarrow ; extra == 'test-full' - - pyarrow>=1 ; extra == 'test-full' - - pyftpdlib ; extra == 'test-full' - - pygit2 ; extra == 'test-full' - - pytest ; extra == 'test-full' - - pytest-asyncio!=0.22.0 ; extra == 'test-full' - - pytest-benchmark ; extra == 'test-full' - - pytest-cov ; extra == 'test-full' - - pytest-mock ; extra == 'test-full' - - pytest-recording ; extra == 'test-full' - - pytest-rerunfailures ; extra == 'test-full' - - python-snappy ; extra == 'test-full' - - requests ; extra == 'test-full' - - smbprotocol ; extra == 'test-full' - - tqdm ; extra == 'test-full' - - urllib3 ; extra == 'test-full' - - zarr ; extra == 'test-full' - - zstandard ; extra == 'test-full' - - tqdm ; extra == 'tqdm' - requires_python: '>=3.8' 
-- pypi: https://files.pythonhosted.org/packages/0c/8b/90e80904fdc24ce33f6fc6f35ebd2232fe731a8528a22008458cf197bc4d/hatchling-1.25.0-py3-none-any.whl - name: hatchling - version: 1.25.0 - sha256: b47948e45d4d973034584dd4cb39c14b6a70227cf287ab7ec0ad7983408a882c - requires_dist: - - packaging>=23.2 - - pathspec>=0.10.1 - - pluggy>=1.0.0 - - tomli>=1.2.2 ; python_full_version < '3.11' - - trove-classifiers - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl - name: identify - version: 2.6.1 - sha256: 53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0 - requires_dist: - - ukkonen ; extra == 'license' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/f4/3a/5d8680279ada9571de8469220069d27024ee47624af534e537c9ff49a450/ipython-8.28.0-py3-none-any.whl - name: ipython - version: 8.28.0 - sha256: 530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35 - requires_dist: - - decorator - - jedi>=0.16 - - matplotlib-inline - - prompt-toolkit>=3.0.41,<3.1.0 - - pygments>=2.4.0 - - stack-data - - traitlets>=5.13.0 - - exceptiongroup ; python_full_version < '3.11' - - typing-extensions>=4.6 ; python_full_version < '3.12' - - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' - - colorama ; sys_platform == 'win32' - - ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole] ; extra == 'all' - - ipython[test,test-extra] ; extra == 'all' - - black ; extra == 'black' - - docrepr ; extra == 'doc' - - exceptiongroup ; extra == 'doc' - - intersphinx-registry ; extra == 'doc' - - ipykernel ; extra == 'doc' - - ipython[test] ; extra == 'doc' - - matplotlib ; extra == 'doc' - - setuptools>=18.5 ; extra == 'doc' - - sphinx-rtd-theme ; extra == 'doc' - - sphinx>=1.3 ; extra == 'doc' - - sphinxcontrib-jquery ; extra == 'doc' - - typing-extensions ; extra == 'doc' - - tomli ; python_full_version < '3.11' and extra == 'doc' - - ipykernel ; extra == 'kernel' - - matplotlib ; extra == 'matplotlib' - - nbconvert ; extra == 'nbconvert' - - nbformat ; extra == 'nbformat' - - ipywidgets ; extra == 'notebook' - - notebook ; extra == 'notebook' - - ipyparallel ; extra == 'parallel' - - qtconsole ; extra == 'qtconsole' - - pytest ; extra == 'test' - - pytest-asyncio<0.22 ; extra == 'test' - - testpath ; extra == 'test' - - pickleshare ; extra == 'test' - - packaging ; extra == 'test' - - ipython[test] ; extra == 'test-extra' - - curio ; extra == 'test-extra' - - matplotlib!=3.2.0 ; extra == 'test-extra' - - nbformat ; extra == 'test-extra' - - numpy>=1.23 ; extra == 'test-extra' - - pandas ; extra == 'test-extra' - - trio ; extra == 'test-extra' - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl - name: jedi - version: 0.19.1 - sha256: e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0 - requires_dist: - - parso>=0.8.3,<0.9.0 - - jinja2==2.11.3 ; extra == 'docs' - - markupsafe==1.1.1 ; extra == 'docs' - - pygments==2.8.1 ; extra == 'docs' - - alabaster==0.7.12 ; extra == 'docs' - - babel==2.9.1 ; extra == 'docs' - - chardet==4.0.0 ; extra == 'docs' - - commonmark==0.8.1 ; extra == 'docs' - - docutils==0.17.1 ; extra == 'docs' - - future==0.18.2 ; extra == 'docs' - - idna==2.10 ; extra == 'docs' - - imagesize==1.2.0 ; extra == 'docs' - - mock==1.0.1 ; extra == 'docs' - - 
packaging==20.9 ; extra == 'docs' - - pyparsing==2.4.7 ; extra == 'docs' - - pytz==2021.1 ; extra == 'docs' - - readthedocs-sphinx-ext==2.1.4 ; extra == 'docs' - - recommonmark==0.5.0 ; extra == 'docs' - - requests==2.25.1 ; extra == 'docs' - - six==1.15.0 ; extra == 'docs' - - snowballstemmer==2.1.0 ; extra == 'docs' - - sphinx-rtd-theme==0.4.3 ; extra == 'docs' - - sphinx==1.8.5 ; extra == 'docs' - - sphinxcontrib-serializinghtml==1.1.4 ; extra == 'docs' - - sphinxcontrib-websupport==1.2.4 ; extra == 'docs' - - urllib3==1.26.4 ; extra == 'docs' - - flake8==5.0.4 ; extra == 'qa' - - mypy==0.971 ; extra == 'qa' - - types-setuptools==67.2.0.1 ; extra == 'qa' - - django ; extra == 'testing' - - attrs ; extra == 'testing' - - colorama ; extra == 'testing' - - docopt ; extra == 'testing' - - pytest<7.0.0 ; extra == 'testing' - requires_python: '>=3.6' -- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_1.conda - sha256: 0c21387f9a411e3d1f7f2969026bacfece133c8f1e72faea9cde29c0c19e1f3a - md5: 83e1364586ceb8d0739fbc85b5c95837 - depends: - - __glibc >=2.17,<3.0.a0 - constrains: - - binutils_impl_linux-64 2.43 - license: GPL-3.0-only - license_family: GPL - purls: [] - size: 669616 - timestamp: 1727304687962 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda - sha256: 33ab03438aee65d6aa667cf7d90c91e5e7d734c19a67aa4c7040742c0a13d505 - md5: db0bfbe7dd197b68ad5f30333bae6ce0 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - constrains: - - expat 2.7.0.* - license: MIT - license_family: MIT - purls: [] - size: 74427 - timestamp: 1743431794976 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.7.0-h240833e_0.conda - sha256: 976f2e23ad2bb2b8e92c99bfa2ead3ad557b17a129b170f7e2dfcf233193dd7e - md5: 026d0a1056ba2a3dbbea6d4b08188676 - depends: - - __osx >=10.13 - constrains: - - expat 2.7.0.* - license: MIT - license_family: MIT - purls: [] - size: 71894 - timestamp: 1743431912423 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.7.0-h286801f_0.conda - sha256: ee550e44765a7bbcb2a0216c063dcd53ac914a7be5386dd0554bd06e6be61840 - md5: 6934bbb74380e045741eb8637641a65b - depends: - - __osx >=11.0 - constrains: - - expat 2.7.0.* - license: MIT - license_family: MIT - purls: [] - size: 65714 - timestamp: 1743431789879 -- conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.0-he0c23c2_0.conda - sha256: 1a227c094a4e06bd54e8c2f3ec40c17ff99dcf3037d812294f842210aa66dbeb - md5: b6f5352fdb525662f4169a0431d2dd7a - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - constrains: - - expat 2.7.0.* - license: MIT - license_family: MIT - purls: [] - size: 140896 - timestamp: 1743432122520 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - sha256: 764432d32db45466e87f10621db5b74363a9f847d2b8b1f9743746cd160f06ab - md5: ede4673863426c0883c0063d853bbd85 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - license: MIT - license_family: MIT - purls: [] - size: 57433 - timestamp: 1743434498161 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.6-h281671d_1.conda - sha256: 6394b1bc67c64a21a5cc73d1736d1d4193a64515152e861785c44d2cfc49edf3 - md5: 4ca9ea59839a9ca8df84170fab4ceb41 - depends: - - __osx >=10.13 - license: MIT - license_family: MIT - purls: [] - size: 51216 - timestamp: 1743434595269 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.6-h1da3d7d_1.conda - sha256: 
c6a530924a9b14e193ea9adfe92843de2a806d1b7dbfd341546ece9653129e60 - md5: c215a60c2935b517dcda8cad4705734d - depends: - - __osx >=11.0 - license: MIT - license_family: MIT - purls: [] - size: 39839 - timestamp: 1743434670405 -- conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.6-h537db12_1.conda - sha256: d3b0b8812eab553d3464bbd68204f007f1ebadf96ce30eb0cbc5159f72e353f5 - md5: 85d8fa5e55ed8f93f874b3b23ed54ec6 - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: MIT - license_family: MIT - purls: [] - size: 44978 - timestamp: 1743435053850 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda - sha256: 53eb8a79365e58849e7b1a068d31f4f9e718dc938d6f2c03e960345739a03569 - md5: 3cb76c3f10d3bc7f1105b2fc9db984df - depends: - - _libgcc_mutex 0.1 conda_forge - - _openmp_mutex >=4.5 - constrains: - - libgomp 14.2.0 h77fa898_1 - - libgcc-ng ==14.2.0=*_1 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - purls: [] - size: 848745 - timestamp: 1729027721139 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda - sha256: 3a76969c80e9af8b6e7a55090088bc41da4cffcde9e2c71b17f44d37b7cb87f7 - md5: e39480b9ca41323497b05492a63bc35b - depends: - - libgcc 14.2.0 h77fa898_1 - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - purls: [] - size: 54142 - timestamp: 1729027726517 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda - sha256: 1911c29975ec99b6b906904040c855772ccb265a1c79d5d75c8ceec4ed89cd63 - md5: cc3573974587f12dda90d96e3e55a702 - depends: - - _libgcc_mutex 0.1 conda_forge - license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - purls: [] - size: 460992 - timestamp: 1729027639220 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda - sha256: eeff241bddc8f1b87567dd6507c9f441f7f472c27f0860a07628260c000ef27c - md5: a76fd702c93cd2dfd89eff30a5fd45a8 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - constrains: - - xz 5.8.1.* - - xz ==5.8.1=*_1 - license: 0BSD - purls: [] - size: 112845 - timestamp: 1746531470399 -- conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.1-hd471939_1.conda - sha256: 20a4c5291f3e338548013623bb1dc8ee2fba5dbac8f77acaddd730ed2a7d29b6 - md5: f87e8821e0e38a4140a7ed4f52530053 - depends: - - __osx >=10.13 - constrains: - - xz 5.8.1.* - - xz ==5.8.1=*_1 - license: 0BSD - purls: [] - size: 104814 - timestamp: 1746531577001 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.1-h39f12f2_1.conda - sha256: 5ab62c179229640c34491a7de806ad4ab7bec47ea2b5fc2136e3b8cf5ef26a57 - md5: 4e8ef3d79c97c9021b34d682c24c2044 - depends: - - __osx >=11.0 - constrains: - - xz 5.8.1.* - - xz ==5.8.1=*_1 - license: 0BSD - purls: [] - size: 92218 - timestamp: 1746531818330 -- conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_1.conda - sha256: adbf6c7bde70536ada734a81b8b5aa23654f2b95445204404622e0cc40e921a0 - md5: 14a1042c163181e143a7522dfb8ad6ab - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - constrains: - - xz 5.8.1.* - - xz ==5.8.1=*_1 - license: 0BSD - purls: [] - size: 104699 - timestamp: 1746531718026 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda - sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 - md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 - depends: - - libgcc-ng >=12 - license: LGPL-2.1-only - 
license_family: GPL - purls: [] - size: 33408 - timestamp: 1697359010159 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.2-hee588c1_0.conda - sha256: 525d4a0e24843f90b3ff1ed733f0a2e408aa6dd18b9d4f15465595e078e104a2 - md5: 93048463501053a00739215ea3f36324 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - libzlib >=1.3.1,<2.0a0 - license: Unlicense - purls: [] - size: 916313 - timestamp: 1746637007836 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.49.2-hdb6dae5_0.conda - sha256: 8fd9562478b4d1dc90ab2bcad5289ee2b5a971ca8ad87e6b137ce0ca53bf801d - md5: 9377ba1ade655ea3fc831b456f4a2351 - depends: - - __osx >=10.13 - - libzlib >=1.3.1,<2.0a0 - license: Unlicense - purls: [] - size: 977388 - timestamp: 1746637093883 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.49.2-h3f77e49_0.conda - sha256: d89f979497cf56eccb099b6ab9558da7bba1f1ba264f50af554e0ea293d9dcf9 - md5: 85f443033cd5b3df82b5cabf79bddb09 - depends: - - __osx >=11.0 - - libzlib >=1.3.1,<2.0a0 - license: Unlicense - purls: [] - size: 899462 - timestamp: 1746637228408 -- conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.49.2-h67fdade_0.conda - sha256: 1612baa49124ec1972b085ab9d6bf1855c5f38e8f16e8d8f96c193d6e11688b2 - md5: a3900c97ba9e03332e9a911fe63f7d64 - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: Unlicense - purls: [] - size: 1081123 - timestamp: 1746637406471 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 - md5: 40b61aab5c7ba9ff276c41cfffe6b80b - depends: - - libgcc-ng >=12 - license: BSD-3-Clause - license_family: BSD - purls: [] - size: 33601 - timestamp: 1680112270483 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c - md5: 5aa797f8787fe7a17d1b0821485b5adc - depends: - - libgcc-ng >=12 - license: LGPL-2.1-or-later - purls: [] - size: 100393 - timestamp: 1702724383534 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 - md5: edb0dca6bc32e4f4789199455a1dbeb8 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - constrains: - - zlib 1.3.1 *_2 - license: Zlib - license_family: Other - purls: [] - size: 60963 - timestamp: 1727963148474 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda - sha256: 8412f96504fc5993a63edf1e211d042a1fd5b1d51dedec755d2058948fcced09 - md5: 003a54a4e32b02f7355b50a837e699da - depends: - - __osx >=10.13 - constrains: - - zlib 1.3.1 *_2 - license: Zlib - license_family: Other - purls: [] - size: 57133 - timestamp: 1727963183990 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda - sha256: ce34669eadaba351cd54910743e6a2261b67009624dbc7daeeafdef93616711b - md5: 369964e85dc26bfe78f41399b366c435 - depends: - - __osx >=11.0 - constrains: - - zlib 1.3.1 *_2 - license: Zlib - license_family: Other - purls: [] - size: 46438 - timestamp: 1727963202283 -- conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda - sha256: ba945c6493449bed0e6e29883c4943817f7c79cbff52b83360f7b341277c6402 - md5: 41fbfac52c601159df6c01f875de31b9 - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - constrains: - - zlib 1.3.1 *_2 - 
license: Zlib - license_family: Other - purls: [] - size: 55476 - timestamp: 1727963768015 -- pypi: https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl - name: llvmlite - version: 0.44.0 - sha256: d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: llvmlite - version: 0.44.0 - sha256: c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl - name: llvmlite - version: 0.44.0 - sha256: eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl - name: llvmlite - version: 0.44.0 - sha256: ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - name: matplotlib-inline - version: 0.1.7 - sha256: df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca - requires_dist: - - traitlets - requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda - sha256: 6a1d5d8634c1a07913f1c525db6455918cbc589d745fac46d9d6e30340c8731a - md5: 70caf8bb6cf39a0b6b7efc885f51c0fe - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc-ng >=12 - license: X11 AND BSD-3-Clause - purls: [] - size: 889086 - timestamp: 1724658547447 -- conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda - sha256: b0b3180039ef19502525a2abd5833c00f9624af830fd391f851934d57bffb9af - md5: e102bbf8a6ceeaf429deab8032fc8977 - depends: - - __osx >=10.13 - license: X11 AND BSD-3-Clause - purls: [] - size: 822066 - timestamp: 1724658603042 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-h7bae524_1.conda - sha256: 27d0b9ff78ad46e1f3a6c96c479ab44beda5f96def88e2fe626e0a49429d8afc - md5: cb2b0ea909b97b3d70cd3921d1445e1a - depends: - - __osx >=11.0 - license: X11 AND BSD-3-Clause - purls: [] - size: 802321 - timestamp: 1724658775723 -- pypi: https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl - name: nodeenv - version: 1.9.1 - sha256: ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' -- pypi: https://files.pythonhosted.org/packages/03/b1/c07f24a759d7c9de5a7a56cdc60feb50739cdd4198822b077099698dcf35/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_11_0_arm64.whl - name: nodejs-wheel-binaries - version: 20.18.0 - sha256: f95fb0989dfc54fd6932850e589000a8d6fc902527cebe7afd747696561d94b8 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/24/65/03e263c82c2513a1f165ee7669e677ebbb95b90c141a8407fc5f79acbbd4/nodejs_wheel_binaries-20.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: nodejs-wheel-binaries - 
version: 20.18.0 - sha256: 33b138288dbeb9aafc6d54f43fbca6545b37e8fd9cbb8f68275ff2a47d4fed07 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/52/bd/3a87efc6c746487b9996515adf477a908f33dbd47b5a0865e4e0e1c8b11e/nodejs_wheel_binaries-20.18.0-py2.py3-none-macosx_10_15_x86_64.whl - name: nodejs-wheel-binaries - version: 20.18.0 - sha256: 74273eab1c2423c04d034d3f707f517da32d3a2b20ca244b5667f3a4e38003ac - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/d0/90/921823227b4d49b9dadf9f38d072b5f28f883b0f83e697489de0f9c24674/nodejs_wheel_binaries-20.18.0-py2.py3-none-win_amd64.whl - name: nodejs-wheel-binaries - version: 20.18.0 - sha256: 51c0cecb429a111351a54346909e672a57b96233a363c79cc0a2bbdbfa397304 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl - name: numba - version: 0.61.2 - sha256: 76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1 - requires_dist: - - llvmlite>=0.44.0.dev0,<0.45 - - numpy>=1.24,<2.3 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl - name: numba - version: 0.61.2 - sha256: efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - requires_dist: - - llvmlite>=0.44.0.dev0,<0.45 - - numpy>=1.24,<2.3 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl - name: numba - version: 0.61.2 - sha256: 49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b - requires_dist: - - llvmlite>=0.44.0.dev0,<0.45 - - numpy>=1.24,<2.3 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - name: numba - version: 0.61.2 - sha256: 3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60 - requires_dist: - - llvmlite>=0.44.0.dev0,<0.45 - - numpy>=1.24,<2.3 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl - name: numpy - version: 2.1.2 - sha256: faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: numpy - version: 2.1.2 - sha256: e2b49c3c0804e8ecb05d59af8386ec2f74877f7ca8fd9c1e00be2672e4d399b1 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl - name: numpy - version: 2.1.2 - sha256: b42a1a511c81cc78cbc4539675713bbcf9d9c3913386243ceff0e9429ca892fe - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl - name: numpy - version: 2.1.2 - sha256: f1eb068ead09f4994dec71c24b2844f1e4e4e013b9629f812f292f04bd1510d9 - requires_python: '>=3.10' -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - sha256: b4491077c494dbf0b5eaa6d87738c22f2154e9277e5293175ec187634bd808a0 - md5: de356753cfdbffcde5bb1e86e3aa6cd0 - depends: - - __glibc >=2.17,<3.0.a0 - - ca-certificates - - libgcc >=13 - license: Apache-2.0 - license_family: Apache - purls: [] - size: 3117410 - timestamp: 1746223723843 -- conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.5.0-hc426f3f_1.conda - sha256: bcac94cb82a458b4e3164da8d9bced08cc8c3da2bc3bd7330711a3689c1464a5 - md5: 919faa07b9647beb99a0e7404596a465 - depends: - - __osx >=10.13 - - ca-certificates - license: Apache-2.0 - license_family: Apache - purls: [] - size: 2739181 - timestamp: 1746224401118 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.5.0-h81ee809_1.conda - sha256: 73d366c1597a10bcd5f3604b5f0734b31c23225536e03782c6a13f9be9d01bff - md5: 5c7aef00ef60738a14e0e612cfc5bcde - depends: - - __osx >=11.0 - - ca-certificates - license: Apache-2.0 - license_family: Apache - purls: [] - size: 3064197 - timestamp: 1746223530698 -- conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.5.0-ha4e3fda_1.conda - sha256: 02846553d2a4c9bde850c60824d0f02803eb9c9b674d5c1a8cce25bc387e748f - md5: 72c07e46b6766bb057018a9a74861b89 - depends: - - ca-certificates - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: Apache-2.0 - license_family: Apache - purls: [] - size: 9025176 - timestamp: 1746227349882 -- pypi: https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl - name: packaging - version: '24.1' - sha256: 5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/a3/be/d9ba3109c4c19a78e125f63074c4e436e447f30ece15f0ef1865e7178233/pandas_stubs-2.2.3.241009-py3-none-any.whl - name: pandas-stubs - version: 2.2.3.241009 - sha256: 3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa - requires_dist: - - numpy>=1.23.5 - - types-pytz>=2022.1.1 - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - name: parso - version: 0.8.4 - sha256: a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18 - requires_dist: - - flake8==5.0.4 ; extra == 'qa' - - mypy==0.971 ; extra == 'qa' - - types-setuptools==67.2.0.1 ; extra == 'qa' - - docopt ; extra == 'testing' - - pytest ; extra == 'testing' - requires_python: '>=3.6' -- pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - name: pathspec - version: 0.12.1 - sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - name: pexpect - version: 4.9.0 - sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 - requires_dist: - - ptyprocess>=0.5 -- conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.2-pyh8b19718_1.conda - sha256: d820e5358bcb117fa6286e55d4550c60b0332443df62121df839eab2d11c890b - md5: 6c78fbb8ddfd64bcb55b5cbafd2d2c43 - depends: - - python >=3.8,<3.13.0a0 - - setuptools - - wheel - license: MIT - license_family: MIT - purls: - - pkg:pypi/pip?source=hash-mapping - size: 
1237976 - timestamp: 1724954490262 -- pypi: https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl - name: platformdirs - version: 4.3.6 - sha256: 73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb - requires_dist: - - furo>=2024.8.6 ; extra == 'docs' - - proselint>=0.14 ; extra == 'docs' - - sphinx-autodoc-typehints>=2.4 ; extra == 'docs' - - sphinx>=8.0.2 ; extra == 'docs' - - appdirs==1.4.4 ; extra == 'test' - - covdefaults>=2.3 ; extra == 'test' - - pytest-cov>=5 ; extra == 'test' - - pytest-mock>=3.14 ; extra == 'test' - - pytest>=8.3.2 ; extra == 'test' - - mypy>=1.11.2 ; extra == 'type' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl - name: pluggy - version: 1.5.0 - sha256: 44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 - requires_dist: - - pre-commit ; extra == 'dev' - - tox ; extra == 'dev' - - pytest ; extra == 'testing' - - pytest-benchmark ; extra == 'testing' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl - name: pre-commit - version: 4.0.1 - sha256: efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878 - requires_dist: - - cfgv>=2.0.0 - - identify>=1.0.0 - - nodeenv>=0.11.1 - - pyyaml>=5.1 - - virtualenv>=20.10.0 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl - name: prompt-toolkit - version: 3.0.48 - sha256: f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e - requires_dist: - - wcwidth - requires_python: '>=3.7.0' -- pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - name: ptyprocess - version: 0.7.0 - sha256: 4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 -- pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - name: pure-eval - version: 0.2.3 - sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 - requires_dist: - - pytest ; extra == 'tests' -- pypi: https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl - name: pyarrow - version: 20.0.0 - sha256: a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62 - requires_dist: - - pytest ; extra == 'test' - - hypothesis ; extra == 'test' - - cffi ; extra == 'test' - - pytz ; extra == 'test' - - pandas ; extra == 'test' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl - name: pyarrow - version: 20.0.0 - sha256: 24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0 - requires_dist: - - pytest ; extra == 'test' - - hypothesis ; extra == 'test' - - cffi ; extra == 'test' - - pytz ; extra == 'test' - - pandas ; extra == 'test' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl - 
name: pyarrow - version: 20.0.0 - sha256: 3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc - requires_dist: - - pytest ; extra == 'test' - - hypothesis ; extra == 'test' - - cffi ; extra == 'test' - - pytz ; extra == 'test' - - pandas ; extra == 'test' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl - name: pyarrow - version: 20.0.0 - sha256: 95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb - requires_dist: - - pytest ; extra == 'test' - - hypothesis ; extra == 'test' - - cffi ; extra == 'test' - - pytz ; extra == 'test' - - pandas ; extra == 'test' - requires_python: '>=3.9' -- pypi: ./ - name: pyarrow-stubs - version: 20.0.0.20250716 - sha256: a69c85a5072346ec9e350e151f522b6b522b1083b6e85c5adb3fb51975ac8c56 - requires_dist: - - pyarrow>=20 - requires_python: '>=3.9,<4' - editable: true -- pypi: https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl - name: pygments - version: 2.18.0 - sha256: b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a - requires_dist: - - colorama>=0.4.6 ; extra == 'windows-terminal' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/e3/39/877484412a1079003a7645375b487bd7c422692f4e5b7c2030dea3e83043/pyright-1.1.385-py3-none-any.whl - name: pyright - version: 1.1.385 - sha256: e5b9a1b8d492e13004d822af94d07d235f2c7c158457293b51ab2214c8c5b375 - requires_dist: - - nodeenv>=1.6.0 - - typing-extensions>=4.1 - - twine>=3.4.1 ; extra == 'all' - - nodejs-wheel-binaries ; extra == 'all' - - twine>=3.4.1 ; extra == 'dev' - - nodejs-wheel-binaries ; extra == 'nodejs' - requires_python: '>=3.7' -- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.12-h9e4cc4f_0_cpython.conda - sha256: 028a03968eb101a681fa4966b2c52e93c8db1e934861f8d108224f51ba2c1bc9 - md5: b61d4fbf583b8393d9d00ec106ad3658 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - - libexpat >=2.7.0,<3.0a0 - - libffi >=3.4.6,<3.5.0a0 - - libgcc >=13 - - liblzma >=5.8.1,<6.0a0 - - libnsl >=2.0.1,<2.1.0a0 - - libsqlite >=3.49.1,<4.0a0 - - libuuid >=2.38.1,<3.0a0 - - libxcrypt >=4.4.36 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.0,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - constrains: - - python_abi 3.11.* *_cp311 - license: Python-2.0 - purls: [] - size: 30545496 - timestamp: 1744325586785 -- conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.12-h9ccd52b_0_cpython.conda - sha256: fcd4b8a9a206940321d1d6569ddac2e99f359f0d5864e48140374a85aed5c27f - md5: cfa36957cba60dca8e79a974d09b6a2c - depends: - - __osx >=10.13 - - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.7.0,<3.0a0 - - libffi >=3.4.6,<3.5.0a0 - - liblzma >=5.8.1,<6.0a0 - - libsqlite >=3.49.1,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.0,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - constrains: - - python_abi 3.11.* *_cp311 - license: Python-2.0 - purls: [] - size: 15467842 - timestamp: 1744324543915 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.11.12-hc22306f_0_cpython.conda - sha256: ea91eb5bc7160cbc6f8110702f9250c87e378ff1dc83ab8daa8ae7832fb5d0de - md5: 6ab5f6a9e85f1b1848b6518e7eea63ee - depends: - - __osx >=11.0 - - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.7.0,<3.0a0 - - 
libffi >=3.4.6,<3.5.0a0 - - liblzma >=5.8.1,<6.0a0 - - libsqlite >=3.49.1,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.0,<4.0a0 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - constrains: - - python_abi 3.11.* *_cp311 - license: Python-2.0 - purls: [] - size: 13584762 - timestamp: 1744323773319 -- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.12-h3f84c4b_0_cpython.conda - sha256: 41e1c07eecff9436b9bb27724822229b2da6073af8461ede6c81b508c0677c56 - md5: c1f91331274f591340e2f50e737dfbe9 - depends: - - bzip2 >=1.0.8,<2.0a0 - - libexpat >=2.7.0,<3.0a0 - - libffi >=3.4.6,<3.5.0a0 - - liblzma >=5.8.1,<6.0a0 - - libsqlite >=3.49.1,<4.0a0 - - libzlib >=1.3.1,<2.0a0 - - openssl >=3.5.0,<4.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - constrains: - - python_abi 3.11.* *_cp311 - license: Python-2.0 - purls: [] - size: 18299489 - timestamp: 1744323460367 -- pypi: https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: pyyaml - version: 6.0.2 - sha256: 3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl - name: pyyaml - version: 6.0.2 - sha256: 1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl - name: pyyaml - version: 6.0.2 - sha256: e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl - name: pyyaml - version: 6.0.2 - sha256: cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 - requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda - sha256: 5435cf39d039387fbdc977b0a762357ea909a7694d9528ab40f005e9208744d7 - md5: 47d31b792659ce70f470b5c82fdfb7a4 - depends: - - libgcc-ng >=12 - - ncurses >=6.3,<7.0a0 - license: GPL-3.0-only - license_family: GPL - purls: [] - size: 281456 - timestamp: 1679532220005 -- conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda - sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568 - md5: f17f77f2acf4d344734bda76829ce14e - depends: - - ncurses >=6.3,<7.0a0 - license: GPL-3.0-only - license_family: GPL - purls: [] - size: 255870 - timestamp: 1679532707590 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda - sha256: a1dfa679ac3f6007362386576a704ad2d0d7a02e98f5d0b115f207a2da63e884 - md5: 8cbb776a2f641b943d413b3e19df71f4 - depends: - - ncurses >=6.3,<7.0a0 - license: GPL-3.0-only - license_family: GPL - purls: [] - size: 250351 - timestamp: 1679532511311 -- pypi: https://files.pythonhosted.org/packages/39/9f/c5ee2b40d377354dabcc23cff47eb299de4b4d06d345068f8f8cc1eadac8/ruff-0.7.0-py3-none-win_amd64.whl - name: ruff - version: 0.7.0 - sha256: ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2 - requires_python: '>=3.7' -- pypi: 
https://files.pythonhosted.org/packages/46/96/464058dd1d980014fb5aa0a1254e78799efb3096fc7a4823cd66a1621276/ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: ruff - version: 0.7.0 - sha256: d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/57/1d/e5cc149ecc46e4f203403a79ccd170fad52d316f98b87d0f63b1945567db/ruff-0.7.0-py3-none-macosx_11_0_arm64.whl - name: ruff - version: 0.7.0 - sha256: 214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/cd/94/da0ba5f956d04c90dd899209904210600009dcda039ce840d83eb4298c7d/ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl - name: ruff - version: 0.7.0 - sha256: 496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/93/6b/701776d4bd6bdd9b629c387b5140f006185bd8ddea16788a44434376b98f/scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: scipy - version: 1.14.1 - sha256: fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 - requires_dist: - - numpy>=1.23.5,<2.3 - - pytest ; extra == 'test' - - pytest-cov ; extra == 'test' - - pytest-timeout ; extra == 'test' - - pytest-xdist ; extra == 'test' - - asv ; extra == 'test' - - mpmath ; extra == 'test' - - gmpy2 ; extra == 'test' - - threadpoolctl ; extra == 'test' - - scikit-umfpack ; extra == 'test' - - pooch ; extra == 'test' - - hypothesis>=6.30 ; extra == 'test' - - array-api-strict>=2.0 ; extra == 'test' - - cython ; extra == 'test' - - meson ; extra == 'test' - - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - - sphinx-design>=0.4.0 ; extra == 'doc' - - matplotlib>=3.5 ; extra == 'doc' - - numpydoc ; extra == 'doc' - - jupytext ; extra == 'doc' - - myst-nb ; extra == 'doc' - - pooch ; extra == 'doc' - - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' - - jupyterlite-pyodide-kernel ; extra == 'doc' - - mypy==1.10.0 ; extra == 'dev' - - typing-extensions ; extra == 'dev' - - types-psutil ; extra == 'dev' - - pycodestyle ; extra == 'dev' - - ruff>=0.0.292 ; extra == 'dev' - - cython-lint>=0.12.2 ; extra == 'dev' - - rich-click ; extra == 'dev' - - doit>=0.36.0 ; extra == 'dev' - - pydevtool ; extra == 'dev' - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/a7/c5/02ac82f9bb8f70818099df7e86c3ad28dae64e1347b421d8e3adf26acab6/scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl - name: scipy - version: 1.14.1 - sha256: c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 - requires_dist: - - numpy>=1.23.5,<2.3 - - pytest ; extra == 'test' - - pytest-cov ; extra == 'test' - - pytest-timeout ; extra == 'test' - - pytest-xdist ; extra == 'test' - - asv ; extra == 'test' - - mpmath ; extra == 'test' - - gmpy2 ; extra == 'test' - - threadpoolctl ; extra == 'test' - - scikit-umfpack ; extra == 'test' - - pooch ; extra == 'test' - - hypothesis>=6.30 ; extra == 'test' - - array-api-strict>=2.0 ; extra == 'test' - - cython ; extra == 'test' - - meson ; extra == 'test' - - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - - sphinx-design>=0.4.0 ; extra == 'doc' - - matplotlib>=3.5 ; extra == 'doc' - - numpydoc ; extra == 'doc' - - jupytext ; extra == 'doc' - - myst-nb ; extra == 'doc' 
- - pooch ; extra == 'doc' - - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' - - jupyterlite-pyodide-kernel ; extra == 'doc' - - mypy==1.10.0 ; extra == 'dev' - - typing-extensions ; extra == 'dev' - - types-psutil ; extra == 'dev' - - pycodestyle ; extra == 'dev' - - ruff>=0.0.292 ; extra == 'dev' - - cython-lint>=0.12.2 ; extra == 'dev' - - rich-click ; extra == 'dev' - - doit>=0.36.0 ; extra == 'dev' - - pydevtool ; extra == 'dev' - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/b2/ab/070ccfabe870d9f105b04aee1e2860520460ef7ca0213172abfe871463b9/scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl - name: scipy - version: 1.14.1 - sha256: 2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 - requires_dist: - - numpy>=1.23.5,<2.3 - - pytest ; extra == 'test' - - pytest-cov ; extra == 'test' - - pytest-timeout ; extra == 'test' - - pytest-xdist ; extra == 'test' - - asv ; extra == 'test' - - mpmath ; extra == 'test' - - gmpy2 ; extra == 'test' - - threadpoolctl ; extra == 'test' - - scikit-umfpack ; extra == 'test' - - pooch ; extra == 'test' - - hypothesis>=6.30 ; extra == 'test' - - array-api-strict>=2.0 ; extra == 'test' - - cython ; extra == 'test' - - meson ; extra == 'test' - - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - - sphinx-design>=0.4.0 ; extra == 'doc' - - matplotlib>=3.5 ; extra == 'doc' - - numpydoc ; extra == 'doc' - - jupytext ; extra == 'doc' - - myst-nb ; extra == 'doc' - - pooch ; extra == 'doc' - - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' - - jupyterlite-pyodide-kernel ; extra == 'doc' - - mypy==1.10.0 ; extra == 'dev' - - typing-extensions ; extra == 'dev' - - types-psutil ; extra == 'dev' - - pycodestyle ; extra == 'dev' - - ruff>=0.0.292 ; extra == 'dev' - - cython-lint>=0.12.2 ; extra == 'dev' - - rich-click ; extra == 'dev' - - doit>=0.36.0 ; extra == 'dev' - - pydevtool ; extra == 'dev' - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl - name: scipy - version: 1.14.1 - sha256: 716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 - requires_dist: - - numpy>=1.23.5,<2.3 - - pytest ; extra == 'test' - - pytest-cov ; extra == 'test' - - pytest-timeout ; extra == 'test' - - pytest-xdist ; extra == 'test' - - asv ; extra == 'test' - - mpmath ; extra == 'test' - - gmpy2 ; extra == 'test' - - threadpoolctl ; extra == 'test' - - scikit-umfpack ; extra == 'test' - - pooch ; extra == 'test' - - hypothesis>=6.30 ; extra == 'test' - - array-api-strict>=2.0 ; extra == 'test' - - cython ; extra == 'test' - - meson ; extra == 'test' - - ninja ; sys_platform != 'emscripten' and extra == 'test' - - sphinx>=5.0.0,<=7.3.7 ; extra == 'doc' - - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' - - sphinx-design>=0.4.0 ; extra == 'doc' - - matplotlib>=3.5 ; extra == 'doc' - - numpydoc ; extra == 'doc' - - jupytext ; extra == 'doc' - - myst-nb ; extra == 'doc' - - pooch ; extra == 'doc' - - jupyterlite-sphinx>=0.13.1 ; extra == 'doc' - - jupyterlite-pyodide-kernel ; extra == 'doc' - - mypy==1.10.0 ; extra == 'dev' - - typing-extensions ; extra == 'dev' - - types-psutil ; extra == 'dev' - - pycodestyle ; extra == 'dev' - - ruff>=0.0.292 ; extra == 'dev' - - cython-lint>=0.12.2 ; extra == 'dev' - - rich-click ; extra == 'dev' - - doit>=0.36.0 ; extra == 'dev' - - pydevtool ; extra == 'dev' - requires_python: 
'>=3.10' -- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.1.0-pyhd8ed1ab_0.conda - sha256: 6725235722095c547edd24275053c615158d6163f396550840aebd6e209e4738 - md5: d5cd48392c67fb6849ba459c2c2b671f - depends: - - python >=3.8 - license: MIT - license_family: MIT - purls: - - pkg:pypi/setuptools?source=hash-mapping - size: 777462 - timestamp: 1727249510532 -- pypi: https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl - name: six - version: 1.16.0 - sha256: 8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' -- pypi: https://files.pythonhosted.org/packages/ca/4a/e59e0968ad52460bb997221fdf5a77d1385b0258f37bfbc84675977b0a62/sparse-0.16.0-py2.py3-none-any.whl - name: sparse - version: 0.16.0 - sha256: 25d4463cf36315ee16a19b6951f1d6b8e9128a07dafd58f846eb6dfb4cd5b9d8 - requires_dist: - - numpy>=1.17 - - numba>=0.49 - - mkdocs-material ; extra == 'docs' - - mkdocstrings[python] ; extra == 'docs' - - mkdocs-gen-files ; extra == 'docs' - - mkdocs-literate-nav ; extra == 'docs' - - mkdocs-section-index ; extra == 'docs' - - mkdocs-jupyter ; extra == 'docs' - - sparse[extras] ; extra == 'docs' - - dask[array] ; extra == 'extras' - - sparse[finch] ; extra == 'extras' - - scipy ; extra == 'extras' - - scikit-learn ; extra == 'extras' - - networkx ; extra == 'extras' - - sparse[extras] ; extra == 'tests' - - pytest>=3.5 ; extra == 'tests' - - pytest-cov ; extra == 'tests' - - pytest-xdist ; extra == 'tests' - - pre-commit ; extra == 'tests' - - pytest-codspeed ; extra == 'tests' - - sparse[tests] ; extra == 'tox' - - tox ; extra == 'tox' - - sparse[tests] ; extra == 'notebooks' - - nbmake ; extra == 'notebooks' - - matplotlib ; extra == 'notebooks' - - sparse[docs,mlir,notebooks,tox] ; extra == 'all' - - matrepr ; extra == 'all' - - finch-tensor>=0.2.10 ; extra == 'finch' - - finch-mlir>=0.0.2 ; extra == 'mlir' - requires_python: '>=3.10' -- pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - name: stack-data - version: 0.6.3 - sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 - requires_dist: - - executing>=1.2.0 - - asttokens>=2.1.0 - - pure-eval - - pytest ; extra == 'tests' - - typeguard ; extra == 'tests' - - pygments ; extra == 'tests' - - littleutils ; extra == 'tests' - - cython ; extra == 'tests' -- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e - md5: d453b98d9c83e71da0741bb0ff4d76bc - depends: - - libgcc-ng >=12 - - libzlib >=1.2.13,<2.0.0a0 - license: TCL - license_family: BSD - purls: [] - size: 3318875 - timestamp: 1699202167581 -- conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda - sha256: 30412b2e9de4ff82d8c2a7e5d06a15f4f4fef1809a72138b6ccb53a33b26faf5 - md5: bf830ba5afc507c6232d4ef0fb1a882d - depends: - - libzlib >=1.2.13,<2.0.0a0 - license: TCL - license_family: BSD - purls: [] - size: 3270220 - timestamp: 1699202389792 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - sha256: 72457ad031b4c048e5891f3f6cb27a53cb479db68a52d965f796910e71a403a8 - md5: b50a57ba89c32b62428b71a875291c9b - depends: - - libzlib >=1.2.13,<2.0.0a0 - license: TCL - license_family: BSD - purls: [] - size: 3145523 - timestamp: 1699202432999 -- conda: 
https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda - sha256: 2c4e914f521ccb2718946645108c9bd3fc3216ba69aea20c2c3cedbd8db32bb1 - md5: fc048363eb8f03cd1737600a5d08aafe - depends: - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 - license: TCL - license_family: BSD - purls: [] - size: 3503410 - timestamp: 1699202577803 -- pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - name: traitlets - version: 5.14.3 - sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f - requires_dist: - - myst-parser ; extra == 'docs' - - pydata-sphinx-theme ; extra == 'docs' - - sphinx ; extra == 'docs' - - argcomplete>=3.0.3 ; extra == 'test' - - mypy>=1.7.0 ; extra == 'test' - - pre-commit ; extra == 'test' - - pytest-mock ; extra == 'test' - - pytest-mypy-testing ; extra == 'test' - - pytest>=7.0,<8.2 ; extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/75/a0/dd773135ca0f7227e8257555fd2f7a0c88672bfd111a400361f10c09face/trove_classifiers-2024.10.16-py3-none-any.whl - name: trove-classifiers - version: 2024.10.16 - sha256: 9b02a4cb49bd2e85c13e728ee461f4f332d6334736b18d61254c964643687144 -- pypi: https://files.pythonhosted.org/packages/69/7a/98f5d2493a652cec05d3b09be59202d202004a41fca9c70d224782611365/types_cffi-1.16.0.20240331-py3-none-any.whl - name: types-cffi - version: 1.16.0.20240331 - sha256: a363e5ea54a4eb6a4a105d800685fde596bc318089b025b27dee09849fe41ff0 - requires_dist: - - types-setuptools - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/86/60/2a2977ce0f91255bbb668350b127a801a06ad37c326a2e5bfd52f03e0784/types_pytz-2024.2.0.20241003-py3-none-any.whl - name: types-pytz - version: 2024.2.0.20241003 - sha256: 3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/ad/00/a90c00f3af9f6c41788959afc440d54b9677ebc8d9e5dba0ec4914d7a997/types_setuptools-75.2.0.20241019-py3-none-any.whl - name: types-setuptools - version: 75.2.0.20241019 - sha256: 2e48ff3acd4919471e80d5e3f049cce5c177e108d5d36d2d4cee3fa4d4104258 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl - name: typing-extensions - version: 4.12.2 - sha256: 04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d - requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda - sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf - md5: 8ac3367aafb1cc0a068483c580af8015 - license: LicenseRef-Public-Domain - purls: [] - size: 122354 - timestamp: 1728047496079 -- conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda - sha256: db8dead3dd30fb1a032737554ce91e2819b43496a0db09927edf01c32b577450 - md5: 6797b005cd0f439c4c5c9ac565783700 - constrains: - - vs2015_runtime >=14.29.30037 - license: LicenseRef-MicrosoftWindowsSDK10 - purls: [] - size: 559710 - timestamp: 1728377334097 -- conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-ha32ba9b_22.conda - sha256: 2a47c5bd8bec045959afada7063feacd074ad66b170c1ea92dd139b389fcf8fd - md5: 311c9ba1dfdd2895a8cb08346ff26259 - depends: - - vc14_runtime >=14.38.33135 - track_features: - - vc14 - license: BSD-3-Clause - license_family: BSD - purls: [] - size: 17447 - 
timestamp: 1728400826998 -- conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-hcc2c482_22.conda - sha256: 4c669c65007f88a7cdd560192f7e6d5679d191ac71610db724e18b2410964d64 - md5: ce23a4b980ee0556a118ed96550ff3f3 - depends: - - ucrt >=10.0.20348.0 - constrains: - - vs2015_runtime 14.40.33810.* *_22 - license: LicenseRef-MicrosoftVisualCpp2015-2022Runtime - license_family: Proprietary - purls: [] - size: 750719 - timestamp: 1728401055788 -- pypi: https://files.pythonhosted.org/packages/c8/15/828ec11907aee2349a9342fa71fba4ba7f3af938162a382dd7da339dea16/virtualenv-20.27.0-py3-none-any.whl - name: virtualenv - version: 20.27.0 - sha256: 44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655 - requires_dist: - - distlib>=0.3.7,<1 - - filelock>=3.12.2,<4 - - importlib-metadata>=6.6 ; python_full_version < '3.8' - - platformdirs>=3.9.1,<5 - - furo>=2023.7.26 ; extra == 'docs' - - proselint>=0.13 ; extra == 'docs' - - sphinx>=7.1.2,!=7.3 ; extra == 'docs' - - sphinx-argparse>=0.4 ; extra == 'docs' - - sphinxcontrib-towncrier>=0.2.1a0 ; extra == 'docs' - - towncrier>=23.6 ; extra == 'docs' - - covdefaults>=2.3 ; extra == 'test' - - coverage-enable-subprocess>=1 ; extra == 'test' - - coverage>=7.2.7 ; extra == 'test' - - flaky>=3.7 ; extra == 'test' - - packaging>=23.1 ; extra == 'test' - - pytest-env>=0.8.2 ; extra == 'test' - - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') - - pytest-mock>=3.11.1 ; extra == 'test' - - pytest-randomly>=3.12 ; extra == 'test' - - pytest-timeout>=2.1 ; extra == 'test' - - pytest>=7.4 ; extra == 'test' - - setuptools>=68 ; extra == 'test' - - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - name: wcwidth - version: 0.2.13 - sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 - requires_dist: - - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' -- conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.44.0-pyhd8ed1ab_0.conda - sha256: d828764736babb4322b8102094de38074dedfc71f5ff405c9dfee89191c14ebc - md5: d44e3b085abcaef02983c6305b84b584 - depends: - - python >=3.8 - license: MIT - license_family: MIT - purls: - - pkg:pypi/wheel?source=hash-mapping - size: 58585 - timestamp: 1722797131787 diff --git a/pyarrow-stubs/__init__.pyi b/pyarrow-stubs/__init__.pyi deleted file mode 100644 index 8a0d1e870c5..00000000000 --- a/pyarrow-stubs/__init__.pyi +++ /dev/null @@ -1,656 +0,0 @@ -# ruff: noqa: F401, I001, E402 -__version__: str - -import pyarrow.lib as _lib - -_gc_enabled: bool - -from pyarrow.lib import ( - BuildInfo, - RuntimeInfo, - set_timezone_db_path, - MonthDayNano, - VersionInfo, - cpp_build_info, - cpp_version, - cpp_version_info, - runtime_info, - cpu_count, - set_cpu_count, - enable_signal_handlers, - io_thread_count, - set_io_thread_count, -) - -def show_versions() -> None: ... -def show_info() -> None: ... -def _module_is_available(module: str) -> bool: ... -def _filesystem_is_available(fs: str) -> bool: ... 
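(Editor's note, not part of the original patch: the stub lines just above annotate PyArrow's runtime helpers such as cpu_count, set_cpu_count, io_thread_count, set_io_thread_count and show_versions. As a minimal, illustrative sketch of how those annotated entry points are typically used, assuming a standard pyarrow installation:)

import pyarrow as pa

# Print build and dependency information (annotated above as returning None).
pa.show_versions()

# Inspect and tune the CPU-bound and I/O thread pools; the stubs type these
# as plain int getters/setters.
print("compute threads:", pa.cpu_count())
print("io threads:", pa.io_thread_count())
pa.set_cpu_count(4)
pa.set_io_thread_count(8)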
- -from pyarrow.lib import ( - null, - bool_, - int8, - int16, - int32, - int64, - uint8, - uint16, - uint32, - uint64, - time32, - time64, - timestamp, - date32, - date64, - duration, - month_day_nano_interval, - float16, - float32, - float64, - binary, - string, - utf8, - binary_view, - string_view, - large_binary, - large_string, - large_utf8, - decimal32, - decimal64, - decimal128, - decimal256, - list_, - large_list, - list_view, - large_list_view, - map_, - struct, - union, - sparse_union, - dense_union, - dictionary, - run_end_encoded, - json_, - uuid, - fixed_shape_tensor, - bool8, - opaque, - field, - type_for_alias, - DataType, - DictionaryType, - StructType, - ListType, - LargeListType, - FixedSizeListType, - ListViewType, - LargeListViewType, - MapType, - UnionType, - SparseUnionType, - DenseUnionType, - TimestampType, - Time32Type, - Time64Type, - DurationType, - FixedSizeBinaryType, - Decimal32Type, - Decimal64Type, - Decimal128Type, - Decimal256Type, - BaseExtensionType, - ExtensionType, - RunEndEncodedType, - FixedShapeTensorType, - Bool8Type, - UuidType, - JsonType, - OpaqueType, - PyExtensionType, - UnknownExtensionType, - register_extension_type, - unregister_extension_type, - DictionaryMemo, - KeyValueMetadata, - Field, - Schema, - schema, - unify_schemas, - Array, - Tensor, - array, - chunked_array, - record_batch, - nulls, - repeat, - SparseCOOTensor, - SparseCSRMatrix, - SparseCSCMatrix, - SparseCSFTensor, - infer_type, - from_numpy_dtype, - NullArray, - NumericArray, - IntegerArray, - FloatingPointArray, - BooleanArray, - Int8Array, - UInt8Array, - Int16Array, - UInt16Array, - Int32Array, - UInt32Array, - Int64Array, - UInt64Array, - HalfFloatArray, - FloatArray, - DoubleArray, - ListArray, - LargeListArray, - FixedSizeListArray, - ListViewArray, - LargeListViewArray, - MapArray, - UnionArray, - BinaryArray, - StringArray, - LargeBinaryArray, - LargeStringArray, - BinaryViewArray, - StringViewArray, - FixedSizeBinaryArray, - DictionaryArray, - Date32Array, - Date64Array, - TimestampArray, - Time32Array, - Time64Array, - DurationArray, - MonthDayNanoIntervalArray, - Decimal32Array, - Decimal64Array, - Decimal128Array, - Decimal256Array, - StructArray, - ExtensionArray, - RunEndEncodedArray, - FixedShapeTensorArray, - Bool8Array, - UuidArray, - JsonArray, - OpaqueArray, - scalar, - NA, - _NULL as NULL, - Scalar, - NullScalar, - BooleanScalar, - Int8Scalar, - Int16Scalar, - Int32Scalar, - Int64Scalar, - UInt8Scalar, - UInt16Scalar, - UInt32Scalar, - UInt64Scalar, - HalfFloatScalar, - FloatScalar, - DoubleScalar, - Decimal32Scalar, - Decimal64Scalar, - Decimal128Scalar, - Decimal256Scalar, - ListScalar, - LargeListScalar, - FixedSizeListScalar, - ListViewScalar, - LargeListViewScalar, - Date32Scalar, - Date64Scalar, - Time32Scalar, - Time64Scalar, - TimestampScalar, - DurationScalar, - MonthDayNanoIntervalScalar, - BinaryScalar, - LargeBinaryScalar, - BinaryViewScalar, - StringScalar, - LargeStringScalar, - StringViewScalar, - FixedSizeBinaryScalar, - DictionaryScalar, - MapScalar, - StructScalar, - UnionScalar, - RunEndEncodedScalar, - ExtensionScalar, - Bool8Scalar, - UuidScalar, - JsonScalar, - OpaqueScalar, -) - -# Buffers, allocation -from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager - -from pyarrow.lib import ( - Buffer, - ResizableBuffer, - foreign_buffer, - py_buffer, - Codec, - compress, - decompress, - allocate_buffer, -) - -from pyarrow.lib import ( - MemoryPool, - LoggingMemoryPool, - ProxyMemoryPool, - 
total_allocated_bytes, - set_memory_pool, - default_memory_pool, - system_memory_pool, - jemalloc_memory_pool, - mimalloc_memory_pool, - logging_memory_pool, - proxy_memory_pool, - log_memory_allocations, - jemalloc_set_decay_ms, - supported_memory_backends, -) - -# I/O -from pyarrow.lib import ( - NativeFile, - PythonFile, - BufferedInputStream, - BufferedOutputStream, - CacheOptions, - CompressedInputStream, - CompressedOutputStream, - TransformInputStream, - transcoding_input_stream, - FixedSizeBufferWriter, - BufferReader, - BufferOutputStream, - OSFile, - MemoryMappedFile, - memory_map, - create_memory_map, - MockOutputStream, - input_stream, - output_stream, - have_libhdfs, -) - -from pyarrow.lib import ( - ChunkedArray, - RecordBatch, - Table, - table, - concat_arrays, - concat_tables, - TableGroupBy, - RecordBatchReader, -) - -# Exceptions -from pyarrow.lib import ( - ArrowCancelled, - ArrowCapacityError, - ArrowException, - ArrowKeyError, - ArrowIndexError, - ArrowInvalid, - ArrowIOError, - ArrowMemoryError, - ArrowNotImplementedError, - ArrowTypeError, - ArrowSerializationError, -) - -from pyarrow.ipc import serialize_pandas, deserialize_pandas -import pyarrow.ipc as ipc - -import pyarrow.types as types - -# ---------------------------------------------------------------------- -# Deprecations - -from pyarrow.util import _deprecate_api, _deprecate_class - -from pyarrow.ipc import ( - Message, - MessageReader, - MetadataVersion, - RecordBatchFileReader, - RecordBatchFileWriter, - RecordBatchStreamReader, - RecordBatchStreamWriter, -) - -# ---------------------------------------------------------------------- -# Returning absolute path to the pyarrow include directory (if bundled, e.g. in -# wheels) -def get_include() -> str: ... -def _get_pkg_config_executable() -> str: ... -def _has_pkg_config(pkgname: str) -> bool: ... -def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... -def get_libraries() -> list[str]: ... -def create_library_symlinks() -> None: ... -def get_library_dirs() -> list[str]: ... 
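(Editor's note, not part of the original patch: the declarations just above — get_include, get_libraries, get_library_dirs — annotate PyArrow's build-introspection helpers. A hedged sketch of their common use when compiling a native extension against the bundled Arrow libraries; the printed values are examples, not guaranteed contents:)

import pyarrow as pa

# Locate the bundled Arrow headers and shared libraries, e.g. to feed into a
# setuptools Extension(include_dirs=..., libraries=..., library_dirs=...).
include_dir = pa.get_include()        # absolute path to the Arrow headers
libraries = pa.get_libraries()        # library names to link against
library_dirs = pa.get_library_dirs()  # directories containing the shared libraries

print(include_dir)
print(libraries, library_dirs)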
- -__all__ = [ - "__version__", - "_lib", - "_gc_enabled", - "BuildInfo", - "RuntimeInfo", - "set_timezone_db_path", - "MonthDayNano", - "VersionInfo", - "cpp_build_info", - "cpp_version", - "cpp_version_info", - "runtime_info", - "cpu_count", - "set_cpu_count", - "enable_signal_handlers", - "io_thread_count", - "set_io_thread_count", - "show_versions", - "show_info", - "_module_is_available", - "_filesystem_is_available", - "null", - "bool_", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "time32", - "time64", - "timestamp", - "date32", - "date64", - "duration", - "month_day_nano_interval", - "float16", - "float32", - "float64", - "binary", - "string", - "utf8", - "binary_view", - "string_view", - "large_binary", - "large_string", - "large_utf8", - "decimal32", - "decimal64", - "decimal128", - "decimal256", - "list_", - "large_list", - "list_view", - "large_list_view", - "map_", - "struct", - "union", - "sparse_union", - "dense_union", - "dictionary", - "run_end_encoded", - "json_", - "uuid", - "fixed_shape_tensor", - "bool8", - "opaque", - "field", - "type_for_alias", - "DataType", - "DictionaryType", - "StructType", - "ListType", - "LargeListType", - "FixedSizeListType", - "ListViewType", - "LargeListViewType", - "MapType", - "UnionType", - "SparseUnionType", - "DenseUnionType", - "TimestampType", - "Time32Type", - "Time64Type", - "DurationType", - "FixedSizeBinaryType", - "Decimal32Type", - "Decimal64Type", - "Decimal128Type", - "Decimal256Type", - "BaseExtensionType", - "ExtensionType", - "RunEndEncodedType", - "FixedShapeTensorType", - "Bool8Type", - "UuidType", - "JsonType", - "OpaqueType", - "PyExtensionType", - "UnknownExtensionType", - "register_extension_type", - "unregister_extension_type", - "DictionaryMemo", - "KeyValueMetadata", - "Field", - "Schema", - "schema", - "unify_schemas", - "Array", - "Tensor", - "array", - "chunked_array", - "record_batch", - "nulls", - "repeat", - "SparseCOOTensor", - "SparseCSRMatrix", - "SparseCSCMatrix", - "SparseCSFTensor", - "infer_type", - "from_numpy_dtype", - "NullArray", - "NumericArray", - "IntegerArray", - "FloatingPointArray", - "BooleanArray", - "Int8Array", - "UInt8Array", - "Int16Array", - "UInt16Array", - "Int32Array", - "UInt32Array", - "Int64Array", - "UInt64Array", - "HalfFloatArray", - "FloatArray", - "DoubleArray", - "ListArray", - "LargeListArray", - "FixedSizeListArray", - "ListViewArray", - "LargeListViewArray", - "MapArray", - "UnionArray", - "BinaryArray", - "StringArray", - "LargeBinaryArray", - "LargeStringArray", - "BinaryViewArray", - "StringViewArray", - "FixedSizeBinaryArray", - "DictionaryArray", - "Date32Array", - "Date64Array", - "TimestampArray", - "Time32Array", - "Time64Array", - "DurationArray", - "MonthDayNanoIntervalArray", - "Decimal32Array", - "Decimal64Array", - "Decimal128Array", - "Decimal256Array", - "StructArray", - "ExtensionArray", - "Bool8Array", - "UuidArray", - "JsonArray", - "OpaqueArray", - "RunEndEncodedArray", - "FixedShapeTensorArray", - "scalar", - "NA", - "NULL", - "Scalar", - "NullScalar", - "BooleanScalar", - "Int8Scalar", - "Int16Scalar", - "Int32Scalar", - "Int64Scalar", - "UInt8Scalar", - "UInt16Scalar", - "UInt32Scalar", - "UInt64Scalar", - "HalfFloatScalar", - "FloatScalar", - "DoubleScalar", - "Decimal32Scalar", - "Decimal64Scalar", - "Decimal128Scalar", - "Decimal256Scalar", - "ListScalar", - "LargeListScalar", - "FixedSizeListScalar", - "ListViewScalar", - "LargeListViewScalar", - "Date32Scalar", - "Date64Scalar", - "Time32Scalar", - 
"Time64Scalar", - "TimestampScalar", - "DurationScalar", - "MonthDayNanoIntervalScalar", - "BinaryScalar", - "LargeBinaryScalar", - "BinaryViewScalar", - "StringScalar", - "LargeStringScalar", - "StringViewScalar", - "FixedSizeBinaryScalar", - "DictionaryScalar", - "MapScalar", - "StructScalar", - "UnionScalar", - "RunEndEncodedScalar", - "ExtensionScalar", - "Bool8Scalar", - "UuidScalar", - "JsonScalar", - "OpaqueScalar", - "DeviceAllocationType", - "Device", - "MemoryManager", - "default_cpu_memory_manager", - "Buffer", - "ResizableBuffer", - "foreign_buffer", - "py_buffer", - "Codec", - "compress", - "decompress", - "allocate_buffer", - "MemoryPool", - "LoggingMemoryPool", - "ProxyMemoryPool", - "total_allocated_bytes", - "set_memory_pool", - "default_memory_pool", - "system_memory_pool", - "jemalloc_memory_pool", - "mimalloc_memory_pool", - "logging_memory_pool", - "proxy_memory_pool", - "log_memory_allocations", - "jemalloc_set_decay_ms", - "supported_memory_backends", - "NativeFile", - "PythonFile", - "BufferedInputStream", - "BufferedOutputStream", - "CacheOptions", - "CompressedInputStream", - "CompressedOutputStream", - "TransformInputStream", - "transcoding_input_stream", - "FixedSizeBufferWriter", - "BufferReader", - "BufferOutputStream", - "OSFile", - "MemoryMappedFile", - "memory_map", - "create_memory_map", - "MockOutputStream", - "input_stream", - "output_stream", - "have_libhdfs", - "ChunkedArray", - "RecordBatch", - "Table", - "table", - "concat_arrays", - "concat_tables", - "TableGroupBy", - "RecordBatchReader", - "ArrowCancelled", - "ArrowCapacityError", - "ArrowException", - "ArrowKeyError", - "ArrowIndexError", - "ArrowInvalid", - "ArrowIOError", - "ArrowMemoryError", - "ArrowNotImplementedError", - "ArrowTypeError", - "ArrowSerializationError", - "serialize_pandas", - "deserialize_pandas", - "ipc", - "types", - "_deprecate_api", - "_deprecate_class", - "Message", - "MessageReader", - "MetadataVersion", - "RecordBatchFileReader", - "RecordBatchFileWriter", - "RecordBatchStreamReader", - "RecordBatchStreamWriter", - "get_include", - "_get_pkg_config_executable", - "_has_pkg_config", - "_read_pkg_config_variable", - "get_libraries", - "create_library_symlinks", - "get_library_dirs", -] diff --git a/pyarrow-stubs/__lib_pxi/__init__.pyi b/pyarrow-stubs/__lib_pxi/__init__.pyi deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/pyarrow-stubs/__lib_pxi/array.pyi b/pyarrow-stubs/__lib_pxi/array.pyi deleted file mode 100644 index ec1cda30a88..00000000000 --- a/pyarrow-stubs/__lib_pxi/array.pyi +++ /dev/null @@ -1,4274 +0,0 @@ -import datetime as dt -import sys - -from collections.abc import Callable -from decimal import Decimal - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import ( - Any, - Generic, - Iterable, - Iterator, - Literal, - TypeVar, - overload, -) - -import numpy as np -import pandas as pd - -from pandas.core.dtypes.base import ExtensionDtype -from pyarrow._compute import CastOptions -from pyarrow._stubs_typing import ( - ArrayLike, - Indices, - Mask, - Order, - SupportArrowArray, - SupportArrowDeviceArray, -) -from pyarrow.lib import ( - Buffer, - Device, - MemoryManager, - MemoryPool, - MonthDayNano, - Tensor, - _Weakrefable, -) -from typing_extensions import deprecated - -from . 
import scalar, types -from .device import DeviceAllocationType -from .scalar import NullableCollection, Scalar -from .types import ( - DataType, - Field, - MapType, - _AsPyType, - _BasicDataType, - _BasicValueT, - _DataTypeT, - _IndexT, - _RunEndType, - _Size, -) - -@overload -def array( - values: NullableCollection[bool], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BooleanArray: ... -@overload -def array( - values: NullableCollection[int], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int64Array: ... -@overload -def array( - values: NullableCollection[float], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DoubleArray: ... -@overload -def array( - values: NullableCollection[Decimal], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Decimal128Array: ... -@overload -def array( - values: NullableCollection[dict[str, Any]], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... -@overload -def array( - values: NullableCollection[dt.date], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date32Array: ... -@overload -def array( - values: NullableCollection[dt.time], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["us"]]: ... -@overload -def array( - values: NullableCollection[dt.timedelta], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["us"]]: ... -@overload -def array( - values: NullableCollection[MonthDayNano], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def array( - values: NullableCollection[str], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def array( - values: NullableCollection[bytes], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def array( - values: NullableCollection[list[Any]], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> ListArray[Any]: ... 
-@overload -def array( - values: NullableCollection[_ScalarT], - type: None = None, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[_ScalarT]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["null"] | types.NullType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> NullArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"] | types.BoolType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BooleanArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i1", "int8"] | types.Int8Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int8Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i2", "int16"] | types.Int16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int16Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i4", "int32"] | types.Int32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int32Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i8", "int64"] | types.Int64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Int64Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.UInt8Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt8Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.UInt16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt16Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u4", "uint32"] | types.Uint32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt32Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.UInt64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> UInt64Array: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> HalfFloatArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"] | types.Float32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> FloatArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"] | types.Float64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DoubleArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"] | types.StringType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary"] | types.BinaryType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_binary"] | types.LargeBinaryType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary_view"] | types.BinaryViewType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string_view"] | types.StringViewType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"] | types.Date32Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date32Array: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"] | types.Date64Type, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Date64Array: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time32Array[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time32Array[Literal["ms"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["us"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Time64Array[Literal["ns"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["ms"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[Literal["us"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["s"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["ms"]]: ... 
-@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["us"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> DurationArray[Literal["ns"]]: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def array( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, - mask: Mask | None = None, - size: int | None = None, - from_pandas: bool | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, -) -> Array[Scalar[_DataTypeT]]: ... -def array(*args, **kawrgs): - """ - Create pyarrow.Array instance from a Python object. - - Parameters - ---------- - obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array - If both type and size are specified may be a single use iterable. If - not strongly-typed, Arrow type will be inferred for resulting array. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) - can be passed as well. - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred from - the data. - mask : array[bool], optional - Indicate which values are null (True) or not null (False). - size : int64, optional - Size of the elements. If the input is larger than size bail at this - length. For iterators, if size is larger than the input iterator this - will be treated as a "max size", but will involve an initial allocation - of size followed by a resize to the actual size (so if you know the - exact size specifying it correctly will give you better performance). - from_pandas : bool, default None - Use pandas's semantics for inferring nulls from values in - ndarray-like data. If passed, the mask tasks precedence, but - if a value is unmasked (not-null), but still null according to - pandas semantics, then it is null. Defaults to False if not - passed explicitly by user, or True if a pandas object is - passed in. - safe : bool, default True - Check for overflows or other unsafe conversions. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray - A ChunkedArray instead of an Array is returned if: - - - the object data overflowed binary storage. - - the object's ``__arrow_array__`` protocol method returned a chunked - array. - - Notes - ----- - Timezone will be preserved in the returned array for timezone-aware data, - else no timezone will be returned for naive timestamps. - Internally, UTC values are stored for timezone-aware data with the - timezone set in the data type. 
- - Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by - default converted as MonthDayNanoIntervalArray. relativedelta leapdays - are ignored as are all absolute fields on both objects. datetime.timedelta - can also be converted to MonthDayNanoIntervalArray but this requires - passing MonthDayNanoIntervalType explicitly. - - Converting to dictionary array will promote to a wider integer type for - indices if the number of distinct values cannot be represented, even if - the index type was explicitly set. This means that if there are more than - 127 values the returned dictionary array's index type will be at least - pa.int16() even if pa.int8() was passed to the function. Note that an - explicit index type will not be demoted even if it is wider than required. - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> pa.array(pd.Series([1, 2])) - - [ - 1, - 2 - ] - - >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) - - ... - -- dictionary: - [ - "a", - "b" - ] - -- indices: - [ - 0, - 1, - 0 - ] - - >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) - - [ - 1, - null - ] - - >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) - >>> arr.type.index_type - DataType(int16) - """ - -@overload -def asarray(values: NullableCollection[bool]) -> BooleanArray: ... -@overload -def asarray(values: NullableCollection[int]) -> Int64Array: ... -@overload -def asarray(values: NullableCollection[float]) -> DoubleArray: ... -@overload -def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... -@overload -def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... -@overload -def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... -@overload -def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... -@overload -def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... -@overload -def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... -@overload -def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["null"] | types.NullType, -) -> NullArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["bool", "boolean"] | types.BoolType, -) -> BooleanArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i1", "int8"] | types.Int8Type, -) -> Int8Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i2", "int16"] | types.Int16Type, -) -> Int16Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i4", "int32"] | types.Int32Type, -) -> Int32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["i8", "int64"] | types.Int64Type, -) -> Int64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u1", "uint8"] | types.UInt8Type, -) -> UInt8Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u2", "uint16"] | types.UInt16Type, -) -> UInt16Array: ... 
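(Editor's note, not part of the original patch: the long run of overloads in this deleted stub exists so that static type checkers can narrow the result of pa.array / pa.asarray to a concrete Array subclass, based either on the element type of the input or on the explicit type argument. A small illustrative sketch of that narrowing under typical usage — the comments describe the stubbed return types, not runtime behaviour changes:)

import datetime as dt
import pyarrow as pa

ints = pa.array([1, 2, None])               # inferred -> Int64Array in the stubs
text = pa.array(["a", "b"], type="string")  # "string" alias overload -> StringArray
when = pa.array([dt.datetime(2024, 1, 1)],
                type=pa.timestamp("ms"))    # -> TimestampArray[Literal["ms"]]

print(ints.type, text.type, when.type)      # int64 string timestamp[ms]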
-@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u4", "uint32"] | types.Uint32Type, -) -> UInt32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["u8", "uint64"] | types.UInt64Type, -) -> UInt64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, -) -> HalfFloatArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f4", "float", "float32"] | types.Float32Type, -) -> FloatArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["f8", "double", "float64"] | types.Float64Type, -) -> DoubleArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string", "str", "utf8"] | types.StringType, -) -> StringArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary"] | types.BinaryType, -) -> BinaryArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, -) -> LargeStringArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["large_binary"] | types.LargeBinaryType, -) -> LargeBinaryArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["binary_view"] | types.BinaryViewType, -) -> BinaryViewArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["string_view"] | types.StringViewType, -) -> StringViewArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date32", "date32[day]"] | types.Date32Type, -) -> Date32Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["date64", "date64[ms]"] | types.Date64Type, -) -> Date64Array: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], -) -> Time32Array[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], -) -> Time32Array[Literal["ms"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], -) -> Time64Array[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], -) -> Time64Array[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], -) -> TimestampArray[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], -) -> TimestampArray[Literal["ms"]]: ... 
-@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], -) -> TimestampArray[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], -) -> TimestampArray[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], -) -> DurationArray[Literal["s"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], -) -> DurationArray[Literal["ms"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], -) -> DurationArray[Literal["us"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], -) -> DurationArray[Literal["ns"]]: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, -) -> MonthDayNanoIntervalArray: ... -@overload -def asarray( - values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, - type: _DataTypeT, -) -> Array[Scalar[_DataTypeT]]: ... -def asarray(*args, **kwargs): - """ - Convert to pyarrow.Array, inferring type if not provided. - - Parameters - ---------- - values : array-like - This can be a sequence, numpy.ndarray, pyarrow.Array or - pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be - a ChunkedArray, otherwise the output will be a Array. - type : string or DataType - Explicitly construct the array with this type. Attempt to cast if - indicated type is different. - - Returns - ------- - arr : Array or ChunkedArray - """ - -@overload -def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... -@overload -def nulls( - size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None -) -> NullArray: ... -@overload -def nulls( - size: int, type: types.BoolType, memory_pool: MemoryPool | None = None -) -> BooleanArray: ... -@overload -def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... -@overload -def nulls( - size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None -) -> Int16Array: ... -@overload -def nulls( - size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None -) -> Int32Array: ... -@overload -def nulls( - size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None -) -> Int64Array: ... -@overload -def nulls( - size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None -) -> UInt8Array: ... -@overload -def nulls( - size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None -) -> UInt16Array: ... -@overload -def nulls( - size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None -) -> UInt32Array: ... -@overload -def nulls( - size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None -) -> UInt64Array: ... -@overload -def nulls( - size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None -) -> HalfFloatArray: ... 
-@overload -def nulls( - size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None -) -> FloatArray: ... -@overload -def nulls( - size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None -) -> DoubleArray: ... -@overload -def nulls( - size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def nulls( - size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None -) -> Decimal256Array: ... -@overload -def nulls( - size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None -) -> Date32Array: ... -@overload -def nulls( - size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None -) -> Date64Array: ... -@overload -def nulls( - size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None -) -> Time32Array[types._Time32Unit]: ... -@overload -def nulls( - size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None -) -> Time64Array[types._Time64Unit]: ... -@overload -def nulls( - size: int, - type: types.TimestampType[types._Unit, types._Tz], - memory_pool: MemoryPool | None = None, -) -> TimestampArray[types._Unit, types._Tz]: ... -@overload -def nulls( - size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None -) -> DurationArray[types._Unit]: ... -@overload -def nulls( - size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None -) -> MonthDayNanoIntervalArray: ... -@overload -def nulls( - size: int, - type: types.BinaryType, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def nulls( - size: int, - type: types.LargeBinaryType, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def nulls( - size: int, - type: types.FixedSizeBinaryType, - memory_pool: MemoryPool | None = None, -) -> FixedSizeBinaryArray: ... -@overload -def nulls( - size: int, - type: types.StringType, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def nulls( - size: int, - type: types.LargeStringType, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def nulls( - size: int, - type: types.BinaryViewType, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def nulls( - size: int, - type: types.StringViewType, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def nulls( - size: int, - type: types.LargeListType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> LargeListArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.ListViewType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> ListViewArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.LargeListViewType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> LargeListViewArray[_DataTypeT]: ... -@overload -def nulls( - size: int, - type: types.FixedSizeListType[_DataTypeT, _Size], - memory_pool: MemoryPool | None = None, -) -> FixedSizeListArray[_DataTypeT, _Size]: ... -@overload -def nulls( - size: int, - type: types.ListType[_DataTypeT], - memory_pool: MemoryPool | None = None, -) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... 
-@overload -def nulls( - size: int, - type: types.StructType, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... -@overload -def nulls( - size: int, - type: types.MapType[_MapKeyT, _MapItemT], - memory_pool: MemoryPool | None = None, -) -> MapArray[_MapKeyT, _MapItemT]: ... -@overload -def nulls( - size: int, - type: types.DictionaryType[_IndexT, _BasicValueT], - memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _BasicValueT]: ... -@overload -def nulls( - size: int, - type: types.RunEndEncodedType[_RunEndType, _BasicValueT], - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... -@overload -def nulls( - size: int, - type: types.UnionType, - memory_pool: MemoryPool | None = None, -) -> UnionArray: ... -@overload -def nulls( - size: int, - type: types.FixedShapeTensorType[types._ValueT], - memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray[Any]: ... -@overload -def nulls( - size: int, - type: types.Bool8Type, - memory_pool: MemoryPool | None = None, -) -> Bool8Array: ... -@overload -def nulls( - size: int, - type: types.UuidType, - memory_pool: MemoryPool | None = None, -) -> UuidArray[Any]: ... -@overload -def nulls( - size: int, - type: types.JsonType, - memory_pool: MemoryPool | None = None, -) -> JsonArray[Any]: ... -@overload -def nulls( - size: int, - type: types.OpaqueType, - memory_pool: MemoryPool | None = None, -) -> OpaqueArray[Any]: ... -@overload -def nulls( - size: int, - type: types.ExtensionType, - memory_pool: MemoryPool | None = None, -) -> ExtensionArray[Any]: ... -def nulls(*args, **kwargs): - """ - Create a strongly-typed Array instance with all elements null. - - Parameters - ---------- - size : int - Array length. - type : pyarrow.DataType, default None - Explicit type for the array. By default use NullType. - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - - Returns - ------- - arr : Array - - Examples - -------- - >>> import pyarrow as pa - >>> pa.nulls(10) - - 10 nulls - - >>> pa.nulls(3, pa.uint32()) - - [ - null, - null, - null - ] - """ - -@overload -def repeat( - value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None -) -> NullArray: ... -@overload -def repeat( # type: ignore[overload-overlap] - value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None -) -> BooleanArray: ... -@overload -def repeat( - value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int8Array: ... -@overload -def repeat( - value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int16Array: ... -@overload -def repeat( - value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int32Array: ... -@overload -def repeat( - value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Int64Array: ... -@overload -def repeat( - value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt8Array: ... -@overload -def repeat( - value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt16Array: ... -@overload -def repeat( - value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt32Array: ... -@overload -def repeat( - value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> UInt64Array: ... 
-@overload -def repeat( - value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None -) -> HalfFloatArray: ... -@overload -def repeat( - value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None -) -> FloatArray: ... -@overload -def repeat( - value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None -) -> DoubleArray: ... -@overload -def repeat( - value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal32Array: ... -@overload -def repeat( - value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal64Array: ... -@overload -def repeat( - value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal128Array: ... -@overload -def repeat( - value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Decimal256Array: ... -@overload -def repeat( - value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Date32Array: ... -@overload -def repeat( - value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None -) -> Date64Array: ... -@overload -def repeat( - value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None -) -> Time32Array[types._Time32Unit]: ... -@overload -def repeat( - value: dt.time | scalar.Time64Scalar[types._Time64Unit], - size: int, - memory_pool: MemoryPool | None = None, -) -> Time64Array[types._Time64Unit]: ... -@overload -def repeat( - value: scalar.TimestampScalar[types._Unit, types._Tz], - size: int, - memory_pool: MemoryPool | None = None, -) -> TimestampArray[types._Unit, types._Tz]: ... -@overload -def repeat( - value: dt.timedelta | scalar.DurationScalar[types._Unit], - size: int, - memory_pool: MemoryPool | None = None, -) -> DurationArray[types._Unit]: ... -@overload -def repeat( # pyright: ignore[reportOverlappingOverload] - value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalArray: ... -@overload -def repeat( - value: bytes | scalar.BinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> BinaryArray: ... -@overload -def repeat( - value: scalar.LargeBinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryArray: ... -@overload -def repeat( - value: scalar.FixedSizeBinaryScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedSizeBinaryArray: ... -@overload -def repeat( - value: str | scalar.StringScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StringArray: ... -@overload -def repeat( - value: scalar.LargeStringScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeStringArray: ... -@overload -def repeat( - value: scalar.BinaryViewScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> BinaryViewArray: ... -@overload -def repeat( - value: scalar.StringViewScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StringViewArray: ... -@overload -def repeat( - value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... -@overload -def repeat( - value: scalar.FixedSizeListScalar[_DataTypeT, _Size], - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedSizeListArray[_DataTypeT, _Size]: ... 
-@overload -def repeat( - value: scalar.LargeListScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeListArray[_DataTypeT]: ... -@overload -def repeat( - value: scalar.ListViewScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> ListViewArray[_DataTypeT]: ... -@overload -def repeat( - value: scalar.LargeListViewScalar[_DataTypeT], - size: int, - memory_pool: MemoryPool | None = None, -) -> LargeListViewArray[_DataTypeT]: ... -@overload -def repeat( - value: dict[str, Any] | scalar.StructScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> StructArray: ... -@overload -def repeat( - value: scalar.MapScalar[_MapKeyT, _MapItemT], - size: int, - memory_pool: MemoryPool | None = None, -) -> MapArray[_MapKeyT, _MapItemT]: ... -@overload -def repeat( - value: scalar.DictionaryScalar[_IndexT, _BasicValueT], - size: int, - memory_pool: MemoryPool | None = None, -) -> DictionaryArray[_IndexT, _BasicValueT]: ... -@overload -def repeat( - value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], - size: int, - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... -@overload -def repeat( - value: scalar.UnionScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> UnionArray: ... -@overload -def repeat( - value: scalar.FixedShapeTensorScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> FixedShapeTensorArray[Any]: ... -@overload -def repeat( - value: scalar.Bool8Scalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> Bool8Array: ... -@overload -def repeat( - value: scalar.UuidScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> UuidArray[Any]: ... -@overload -def repeat( - value: scalar.JsonScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> JsonArray[Any]: ... -@overload -def repeat( - value: scalar.OpaqueScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> OpaqueArray[Any]: ... -@overload -def repeat( - value: scalar.ExtensionScalar, - size: int, - memory_pool: MemoryPool | None = None, -) -> ExtensionArray[Any]: ... -def repeat(*args, **kwargs): - """ - Create an Array instance whose slots are the given scalar. - - Parameters - ---------- - value : Scalar-like object - Either a pyarrow.Scalar or any python object coercible to a Scalar. - size : int - Number of times to repeat the scalar in the output Array. - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - - Returns - ------- - arr : Array - - Examples - -------- - >>> import pyarrow as pa - >>> pa.repeat(10, 3) - - [ - 10, - 10, - 10 - ] - - >>> pa.repeat([1, 2], 2) - - [ - [ - 1, - 2 - ], - [ - 1, - 2 - ] - ] - - >>> pa.repeat("string", 3) - - [ - "string", - "string", - "string" - ] - - >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) - - -- is_valid: all not null - -- child 0 type: int64 - [ - 1, - 1 - ] - -- child 1 type: list - [ - [ - 1, - 2 - ], - [ - 1, - 2 - ] - ] - """ - -def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: - """ - Attempt to infer Arrow data type that can hold the passed Python - sequence type in an Array object - - Parameters - ---------- - values : array-like - Sequence to infer type from. - mask : ndarray (bool type), optional - Optional exclusion mask where True marks null, False non-null. - from_pandas : bool, default False - Use pandas's NA/null sentinel values for type inference. 
- - Returns - ------- - type : DataType - """ - -class ArrayStatistics(_Weakrefable): - """ - The class for statistics of an array. - """ - @property - def null_count(self) -> int: - """ - The number of nulls. - """ - @property - def distinct_count(self) -> int: - """ - The number of distinct values. - """ - @property - def min(self) -> Any: - """ - The minimum value. - """ - @property - def is_min_exact(self) -> bool: - """ - Whether the minimum value is an exact value or not. - """ - @property - def max(self) -> Any: - """ - The maximum value. - """ - - @property - def is_max_exact(self) -> bool: - """ - Whether the maximum value is an exact value or not. - """ - -_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) - -class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): - def to_pandas( - self, - memory_pool: MemoryPool | None = None, - categories: list | None = None, - strings_to_categorical: bool = False, - zero_copy_only: bool = False, - integer_object_nulls: bool = False, - date_as_object: bool = True, - timestamp_as_object: bool = False, - use_threads: bool = True, - deduplicate_objects: bool = True, - ignore_metadata: bool = False, - safe: bool = True, - split_blocks: bool = False, - self_destruct: bool = False, - maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, - types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, - coerce_temporal_nanoseconds: bool = False, - ) -> _ConvertAs: - """ - Convert to a pandas-compatible NumPy array or DataFrame, as appropriate - - Parameters - ---------- - memory_pool : MemoryPool, default None - Arrow MemoryPool to use for allocations. Uses the default memory - pool if not passed. - categories : list, default empty - List of fields that should be returned as pandas.Categorical. Only - applies to table-like data structures. - strings_to_categorical : bool, default False - Encode string (UTF8) and binary types to pandas.Categorical. - zero_copy_only : bool, default False - Raise an ArrowException if this function call would require copying - the underlying data. - integer_object_nulls : bool, default False - Cast integers with nulls to objects - date_as_object : bool, default True - Cast dates to objects. If False, convert to datetime64 dtype with - the equivalent time unit (if supported). Note: in pandas version - < 2.0, only datetime64[ns] conversion is supported. - timestamp_as_object : bool, default False - Cast non-nanosecond timestamps (np.datetime64) to objects. This is - useful in pandas version 1.x if you have timestamps that don't fit - in the normal date range of nanosecond timestamps (1678 CE-2262 CE). - Non-nanosecond timestamps are supported in pandas version 2.0. - If False, all timestamps are converted to datetime64 dtype. - use_threads : bool, default True - Whether to parallelize the conversion using multiple threads. - deduplicate_objects : bool, default True - Do not create multiple copies Python objects when created, to save - on memory use. Conversion will be slower. - ignore_metadata : bool, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - safe : bool, default True - For certain data types, a cast is needed in order to store the - data in a pandas DataFrame or Series (e.g. timestamps are always - stored as nanoseconds in pandas). This option controls whether it - is a safe cast or not. 
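(Usage sketch for ``infer_type``; the commented results are the usual inferences on a recent pyarrow and are illustrative only.)

import pyarrow as pa

pa.infer_type([1, 2, None])    # int64
pa.infer_type(["a", "b"])      # string
pa.infer_type([{"x": 1.0}])    # struct<x: double>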
- split_blocks : bool, default False - If True, generate one internal "block" for each column when - creating a pandas.DataFrame from a RecordBatch or Table. While this - can temporarily reduce memory note that various pandas operations - can trigger "consolidation" which may balloon memory use. - self_destruct : bool, default False - EXPERIMENTAL: If True, attempt to deallocate the originating Arrow - memory while converting the Arrow object to pandas. If you use the - object after calling to_pandas with this option it will crash your - program. - - Note that you may not see always memory usage improvements. For - example, if multiple columns share an underlying allocation, - memory can't be freed until all columns are converted. - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. - types_mapper : function, default None - A function mapping a pyarrow DataType to a pandas ExtensionDtype. - This can be used to override the default pandas type for conversion - of built-in pyarrow types or in absence of pandas_metadata in the - Table schema. The function receives a pyarrow DataType and is - expected to return a pandas ExtensionDtype or ``None`` if the - default conversion should be used for that type. If you have - a dictionary mapping, you can pass ``dict.get`` as function. - coerce_temporal_nanoseconds : bool, default False - Only applicable to pandas version >= 2.0. - A legacy option to coerce date32, date64, duration, and timestamp - time units to nanoseconds when converting to pandas. This is the - default behavior in pandas version 1.x. Set this option to True if - you'd like to use this coercion when using pandas version >= 2.0 - for backwards compatibility (not recommended otherwise). - - Returns - ------- - pandas.Series or pandas.DataFrame depending on type of object - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - - Convert a Table to pandas DataFrame: - - >>> table = pa.table( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... 
) - >>> table.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - >>> isinstance(table.to_pandas(), pd.DataFrame) - True - - Convert a RecordBatch to pandas DataFrame: - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) - >>> batch - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - >>> isinstance(batch.to_pandas(), pd.DataFrame) - True - - Convert a Chunked Array to pandas Series: - - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_pandas() - 0 2 - 1 2 - 2 4 - 3 4 - 4 5 - 5 100 - dtype: int64 - >>> isinstance(n_legs.to_pandas(), pd.Series) - True - """ - -_CastAs = TypeVar("_CastAs", bound=DataType) -_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) -_ScalarT = TypeVar("_ScalarT", bound=Scalar) - -class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): - """ - The base class for all Arrow arrays. - """ - - def diff(self, other: Self) -> str: - """ - Compare contents of this array against another one. - - Return a string containing the result of diffing this array - (on the left side) against the other array (on the right side). - - Parameters - ---------- - other : Array - The other array to compare this array with. - - Returns - ------- - diff : str - A human-readable printout of the differences. - - Examples - -------- - >>> import pyarrow as pa - >>> left = pa.array(["one", "two", "three"]) - >>> right = pa.array(["two", None, "two-and-a-half", "three"]) - >>> print(left.diff(right)) # doctest: +SKIP - - @@ -0, +0 @@ - -"one" - @@ -2, +1 @@ - +null - +"two-and-a-half" - """ - def cast( - self, - target_type: _CastAs, - safe: bool = True, - options: CastOptions | None = None, - memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_CastAs]]: - """ - Cast array values to another data type - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, default None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Returns - ------- - cast : Array - """ - def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: - """ - Return zero-copy "view" of array as another data type. - - The data types must have compatible columnar buffer layouts - - Parameters - ---------- - target_type : DataType - Type to construct view as. - - Returns - ------- - view : Array - """ - def sum(self, **kwargs) -> _Scalar_co: - """ - Sum the values in a numerical array. - - See :func:`pyarrow.compute.sum` for full usage. - - Parameters - ---------- - **kwargs : dict, optional - Options to pass to :func:`pyarrow.compute.sum`. - - Returns - ------- - sum : Scalar - A scalar containing the sum value. - """ - @property - def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... - def unique(self) -> Self: - """ - Compute distinct elements in array. 
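(Illustrative sketch of the core ``Array`` methods annotated above: cast, view, sum, and diff. Results in comments assume a recent pyarrow.)

import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int32())
arr.cast(pa.int64())       # checked conversion to another data type
arr.view(pa.uint32())      # zero-copy reinterpretation of the same buffers
arr.sum()                  # scalar sum of the values
print(arr.diff(pa.array([1, 3], type=pa.int32())))  # textual element-wise diff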
- - Returns - ------- - unique : Array - An array of the same data type, with deduplicated elements. - """ - def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: - """ - Compute dictionary-encoded representation of array. - - See :func:`pyarrow.compute.dictionary_encode` for full usage. - - Parameters - ---------- - null_encoding : str, default "mask" - How to handle null entries. - - Returns - ------- - encoded : DictionaryArray - A dictionary-encoded version of this array. - """ - def value_count(self) -> StructArray: - """ - Compute counts of unique elements in array. - - Returns - ------- - StructArray - An array of structs - """ - @overload - @staticmethod - def from_pandas( - obj: pd.Series | np.ndarray | ArrayLike, - *, - mask: Mask | None = None, - type: _DataTypeT, - safe: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Array[Scalar[_DataTypeT]]: ... - @overload - @staticmethod - def from_pandas( - obj: pd.Series | np.ndarray | ArrayLike, - *, - mask: Mask | None = None, - safe: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Array[Scalar]: ... - @staticmethod - def from_pandas(*args, **kwargs): - """ - Convert pandas.Series to an Arrow Array. - - This method uses Pandas semantics about what values indicate - nulls. See pyarrow.array for more general conversion from arrays or - sequences to Arrow arrays. - - Parameters - ---------- - obj : ndarray, pandas.Series, array-like - mask : array (boolean), optional - Indicate which values are null (True) or not null (False). - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred - from the data. - safe : bool, default True - Check for overflows or other unsafe conversions. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Notes - ----- - Localized timestamps will currently be returned as UTC (pandas's native - representation). Timezone-naive data will be implicitly interpreted as - UTC. - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray - ChunkedArray is returned if object data overflows binary buffer. - """ - @staticmethod - def from_buffers( - type: _DataTypeT, - length: int, - buffers: list[Buffer], - null_count: int = -1, - offset=0, - children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, - ) -> Array[Scalar[_DataTypeT]]: - """ - Construct an Array from a sequence of buffers. - - The concrete type returned depends on the datatype. - - Parameters - ---------- - type : DataType - The value type of the array. - length : int - The number of values in the array. - buffers : List[Buffer] - The buffers backing this array. - null_count : int, default -1 - The number of null entries in the array. Negative value means that - the null count is not known. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - children : List[Array], default None - Nested type children with length matching type.num_fields. - - Returns - ------- - array : Array - """ - @property - def null_count(self) -> int: ... - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the array. - - In other words, the sum of bytes from all buffer - ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will be counted multiple times. 
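(Small sketch of ``Array.from_pandas``, which the overloads above type either with an explicit ``type`` or with inference from the data.)

import pandas as pd
import pyarrow as pa

s = pd.Series([1.0, None, 3.0])
pa.Array.from_pandas(s)                      # nulls taken from pandas NA semantics
pa.Array.from_pandas(s, type=pa.float32())   # coerce to an explicit Arrow type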
- - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the array. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - """ - def __sizeof__(self) -> int: ... - def __iter__(self) -> Iterator[_Scalar_co]: ... - def to_string( - self, - *, - indent: int = 2, - top_level_indent: int = 0, - window: int = 10, - container_window: int = 2, - skip_new_lines: bool = False, - ) -> str: - """ - Render a "pretty-printed" string representation of the Array. - - Note: for data on a non-CPU device, the full array is copied to CPU - memory. - - Parameters - ---------- - indent : int, default 2 - How much to indent the internal items in the string to - the right, by default ``2``. - top_level_indent : int, default 0 - How much to indent right the entire content of the array, - by default ``0``. - window : int - How many primitive items to preview at the begin and end - of the array when the array is bigger than the window. - The other items will be ellipsed. - container_window : int - How many container items (such as a list in a list array) - to preview at the begin and end of the array when the array - is bigger than the window. - skip_new_lines : bool - If the array should be rendered as a single line of text - or if each element should be on its own line. - """ - format = to_string - def equals(self, other: Self) -> bool: ... - def __len__(self) -> int: ... - def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: - """ - Return BooleanArray indicating the null values. - - Parameters - ---------- - nan_is_null : bool (optional, default False) - Whether floating-point NaN values should also be considered null. - - Returns - ------- - array : boolean Array - """ - def is_nan(self) -> BooleanArray: - """ - Return BooleanArray indicating the NaN values. - - Returns - ------- - array : boolean Array - """ - def is_valid(self) -> BooleanArray: - """ - Return BooleanArray indicating the non-null values. - """ - def fill_null( - self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType - ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: - """ - See :func:`pyarrow.compute.fill_null` for usage. - - Parameters - ---------- - fill_value : any - The replacement value for null entries. - - Returns - ------- - result : Array - A new array with nulls replaced by the given value. - """ - @overload - def __getitem__(self, key: int) -> _Scalar_co: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - def __getitem__(self, key): - """ - Slice or return value at given index - - Parameters - ---------- - key : integer or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - value : Scalar (index) or Array (slice) - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this array. - - Parameters - ---------- - offset : int, default 0 - Offset from start of array to slice. - length : int, default None - Length of slice (default is until end of Array starting from - offset). - - Returns - ------- - sliced : Array - An array with the same datatype, containing the sliced values. 
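(Sketch of the null-handling and slicing helpers documented above; commented outputs are the expected values.)

import pyarrow as pa

arr = pa.array([1, None, 3])
arr.is_null()      # BooleanArray: [false, true, false]
arr.fill_null(0)   # [1, 0, 3]
arr[0]             # integer index -> Scalar
arr.slice(1)       # zero-copy slice: [null, 3]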
- """ - def take(self, indices: Indices) -> Self: - """ - Select values from an array. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the array whose values will be returned. - - Returns - ------- - taken : Array - An array with the same datatype, containing the taken values. - """ - def drop_null(self) -> Self: - """ - Remove missing values from an array. - """ - def filter( - self, - mask: Mask, - *, - null_selection_behavior: Literal["drop", "emit_null"] = "drop", - ) -> Self: - """ - Select values from an array. - - See :func:`pyarrow.compute.filter` for full usage. - - Parameters - ---------- - mask : Array or array-like - The boolean mask to filter the array with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled. - - Returns - ------- - filtered : Array - An array of the same type, with only the elements selected by - the boolean mask. - """ - @overload - def index( - self: Array[_ScalarT], - value: _ScalarT, - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> scalar.Int64Scalar: ... - @overload - def index( - self: Array[Scalar[_BasicDataType[_AsPyType]]], - value: _AsPyType, - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> scalar.Int64Scalar: ... - def index(self, *args, **kwargs): - """ - Find the first index of a value. - - See :func:`pyarrow.compute.index` for full usage. - - Parameters - ---------- - value : Scalar or object - The value to look for in the array. - start : int, optional - The start index where to look for `value`. - end : int, optional - The end index where to look for `value`. - memory_pool : MemoryPool, optional - A memory pool for potential memory allocations. - - Returns - ------- - index : Int64Scalar - The index of the value in the array (-1 if not found). - """ - def sort(self, order: Order = "ascending", **kwargs) -> Self: - """ - Sort the Array - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : Array - """ - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: - """ - Return a NumPy view or copy of this array. - - By default, tries to return a view of this array. This is only - supported for primitive arrays with the same memory layout as NumPy - (i.e. integers, floating point, ..) and without any nulls. - - For the extension arrays, this method simply delegates to the - underlying storage array. - - Parameters - ---------- - zero_copy_only : bool, default True - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable : bool, default False - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. 
- - Returns - ------- - array : numpy.ndarray - """ - def to_pylist( - self: Array[Scalar[_BasicDataType[_AsPyType]]], - *, - map_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: - """ - Convert to a list of native Python objects. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - lst : list - """ - tolist = to_pylist - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - @property - def offset(self) -> int: - """ - A relative position into another array's data. - - The purpose is to enable zero-copy slicing. This value defaults to zero - but must be applied on all operations with the physical storage - buffers. - """ - def buffers(self) -> list[Buffer | None]: - """ - Return a list of Buffer objects pointing to this array's physical - storage. - - To correctly interpret these buffers, you need to also apply the offset - multiplied with the size of the stored data type. - """ - def copy_to(self, destination: MemoryManager | Device) -> Self: - """ - Construct a copy of the array with all buffers on destination - device. - - This method recursively copies the array's buffers and those of its - children onto the destination MemoryManager device and returns the - new Array. - - Parameters - ---------- - destination : pyarrow.MemoryManager or pyarrow.Device - The destination device to copy the array to. - - Returns - ------- - Array - """ - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the array type - is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: - """ - Import Array from a C ArrowArray struct, given its pointer - and the imported array type. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArray struct. - type: DataType or int - Either a DataType object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_array__(self, requested_schema=None) -> Any: - """ - Get a pair of PyCapsules containing a C ArrowArray representation of the object. 
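(Sketch of the conversion and introspection helpers: to_pylist, validate, buffers, nbytes. Comments describe the expected behaviour.)

import pyarrow as pa

arr = pa.array([1, None, 3])
arr.to_pylist()          # [1, None, 3]
arr.validate(full=True)  # O(n) checks; raises ArrowInvalid if the array is malformed
arr.buffers()            # [validity bitmap, data buffer] for an int64 array
arr.nbytes               # bytes referenced by this array's slots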
- - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the array to this data type. - If None, the array will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowArray, - respectively. - """ - @classmethod - def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowDeviceArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the array type - is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: - """ - Import Array from a C ArrowDeviceArray struct, given its pointer - and the imported array type. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - type: DataType or int - Either a DataType object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - - def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: - """ - Get a pair of PyCapsules containing a C ArrowDeviceArray representation - of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the array to this data type. - If None, the array will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - kwargs - Currently no additional keyword arguments are supported, but - this method will accept any keyword with a value of ``None`` - for compatibility with future keywords. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, - respectively. - """ - @classmethod - def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... - def __dlpack__(self, stream: int | None = None) -> Any: - """Export a primitive array as a DLPack capsule. - - Parameters - ---------- - stream : int, optional - A Python integer representing a pointer to a stream. Currently not supported. - Stream is provided by the consumer to the producer to instruct the producer - to ensure that operations can safely be performed on the array. - - Returns - ------- - capsule : PyCapsule - A DLPack capsule for the array, pointing to a DLManagedTensor. - """ - def __dlpack_device__(self) -> tuple[int, int]: - """ - Return the DLPack device tuple this arrays resides on. - - Returns - ------- - tuple : Tuple[int, int] - Tuple with index specifying the type of the device (where - CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the - device which is 0 by default for CPU. - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the array resides. 
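(Sketch of the zero-copy interchange protocols typed above, DLPack and the Arrow PyCapsule interface. This assumes a recent pyarrow and NumPy, and a primitive, null-free, CPU-resident array.)

import numpy as np
import pyarrow as pa

arr = pa.array([1.0, 2.0, 3.0])
np.from_dlpack(arr)        # zero-copy NumPy view via __dlpack__
arr.__dlpack_device__()    # (1, 0): CPU device, device id 0
schema_capsule, array_capsule = arr.__arrow_c_array__()  # PyCapsule pair for C consumers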
- - Returns - ------- - DeviceAllocationType - """ - - @property - def is_cpu(self) -> bool: - """ - Whether the array is CPU-accessible. - """ - @property - def statistics(self) -> ArrayStatistics | None: - """ - Statistics of the array. - """ - -class NullArray(Array[scalar.NullScalar]): ... - -class BooleanArray(Array[scalar.BooleanScalar]): - @property - def false_count(self) -> int: ... - @property - def true_count(self) -> int: ... - -class NumericArray(Array[_ScalarT]): ... -class IntegerArray(NumericArray[_ScalarT]): ... -class FloatingPointArray(NumericArray[_ScalarT]): ... -class Int8Array(IntegerArray[scalar.Int8Scalar]): ... -class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... -class Int16Array(IntegerArray[scalar.Int16Scalar]): ... -class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... -class Int32Array(IntegerArray[scalar.Int32Scalar]): ... -class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... -class Int64Array(IntegerArray[scalar.Int64Scalar]): ... -class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... -class Date32Array(NumericArray[scalar.Date32Scalar]): ... -class Date64Array(NumericArray[scalar.Date64Scalar]): ... -class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... -class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... -class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... -class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... -class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... -class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... -class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... -class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... -class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... -class Decimal32Array(FixedSizeBinaryArray): ... -class Decimal64Array(FixedSizeBinaryArray): ... -class Decimal128Array(FixedSizeBinaryArray): ... -class Decimal256Array(FixedSizeBinaryArray): ... - -class BaseListArray(Array[_ScalarT]): - def flatten(self, recursive: bool = False) -> Array: ... - def value_parent_indices(self) -> Int64Array: ... - def value_lengths(self) -> Int32Array: ... - -class ListArray(BaseListArray[_ScalarT]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[int], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[float], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[str], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.StringType]]: ... 
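(Sketch of the ``BaseListArray`` helpers, flatten / value_lengths / value_parent_indices, and the ``BooleanArray`` counters declared above.)

import pyarrow as pa

lists = pa.array([[1, 2], None, [3]])
lists.flatten()                # [1, 2, 3]
lists.value_lengths()          # [2, null, 1]
lists.value_parent_indices()   # [0, 0, 2]

bools = pa.array([True, False, None])
bools.true_count, bools.false_count  # (1, 1); nulls are not counted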
- @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list[bytes], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: list, - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array | list[int], - values: Array | list, - *, - type: _DataTypeT, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): - """ - Construct ListArray from arrays of int32 offsets and values. - - Parameters - ---------- - offsets : Array (int32 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_array : ListArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 2, 4]) - >>> pa.ListArray.from_arrays(offsets, values) - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - >>> # nulls in the offsets array become null lists - >>> offsets = pa.array([0, None, 2, 4]) - >>> pa.ListArray.from_arrays(offsets, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ - @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the ListArray - ignoring the array's offset. - - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. - - Returns - ------- - values : Array - - See Also - -------- - ListArray.flatten : ... - - Examples - -------- - - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) - >>> array.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - - If an array is sliced, the slice still uses the same - underlying data as the original array, just with an - offset. Since values ignores the offset, the values are the - same: - - >>> sliced = array.slice(1, 2) - >>> sliced - - [ - null, - [ - 3, - 4, - null, - 6 - ] - ] - >>> sliced.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - - """ - @property - def offsets(self) -> Int32Array: - """ - Return the list offsets as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - offsets : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, 4, 5]]) - >>> array.offsets - - [ - 0, - 2, - 2, - 5 - ] - """ - -class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListArray[_DataTypeT]: ... 
- @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array, - *, - type: _DataTypeT, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListArray[_DataTypeT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): - """ - Construct LargeListArray from arrays of int64 offsets and values. - - Parameters - ---------- - offsets : Array (int64 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_array : LargeListArray - """ - @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the LargeListArray - ignoring the array's offset. - - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. - - Returns - ------- - values : Array - - See Also - -------- - LargeListArray.flatten : ... - - Examples - -------- - - The values include null elements from the sub-lists: - - >>> import pyarrow as pa - >>> array = pa.array( - ... [[1, 2], None, [3, 4, None, 6]], - ... type=pa.large_list(pa.int32()), - ... ) - >>> array.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - - If an array is sliced, the slice still uses the same - underlying data as the original array, just with an - offset. Since values ignores the offset, the values are the - same: - - >>> sliced = array.slice(1, 2) - >>> sliced - - [ - null, - [ - 3, - 4, - null, - 6 - ] - ] - >>> sliced.values - - [ - 1, - 2, - 3, - 4, - null, - 6 - ] - """ - @property - def offsets(self) -> Int64Array: - """ - Return the list offsets as an int64 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListArray.from_arrays` and get back the - same list array if the original one has nulls. - - Returns - ------- - offsets : Int64Array - """ - -class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListViewArray[_DataTypeT]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int32Array, - values: Array, - *, - type: _DataTypeT, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> ListViewArray[_DataTypeT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): - """ - Construct ListViewArray from arrays of int32 offsets, sizes, and values. - - Parameters - ---------- - offsets : Array (int32 type) - sizes : Array (int32 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). 
- - Returns - ------- - list_view_array : ListViewArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 1, 2]) - >>> sizes = pa.array([2, 2, 2]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - [ - 2, - 3 - ], - [ - 3, - 4 - ] - ] - >>> # use a null mask to represent null values - >>> mask = pa.array([False, True, False]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - >>> # null values can be defined in either offsets or sizes arrays - >>> # WARNING: this will result in a copy of the offsets or sizes arrays - >>> offsets = pa.array([0, None, 2]) - >>> pa.ListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ - @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the ListViewArray - ignoring the array's offset and sizes. - - The values array may be out of order and/or contain additional values - that are not found in the logical representation of the array. The only - guarantee is that each non-null value in the ListView Array is contiguous. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's order and offset. - - Returns - ------- - values : Array - - Examples - -------- - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 1, - 2 - ], - [], - [ - 2, - null, - 3, - 4 - ] - ] - >>> array.values - - [ - 1, - 2, - null, - 3, - 4 - ] - """ - @property - def offsets(self) -> Int32Array: - """ - Return the list offsets as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListViewArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - offsets : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets - - [ - 0, - 0, - 1 - ] - """ - @property - def sizes(self) -> Int32Array: - """ - Return the list sizes as an int32 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `ListViewArray.from_arrays` and get back the same - list array if the original one has nulls. - - Returns - ------- - sizes : Int32Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array.sizes - - [ - 2, - 0, - 4 - ] - """ - -class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListViewArray[_DataTypeT]: ... - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - values: Array, - *, - type: _DataTypeT, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> LargeListViewArray[_DataTypeT]: ... 
- @classmethod - def from_arrays(cls, *args, **kwargs): - """ - Construct LargeListViewArray from arrays of int64 offsets and values. - - Parameters - ---------- - offsets : Array (int64 type) - sizes : Array (int64 type) - values : Array (any type) - type : DataType, optional - If not specified, a default ListType with the values' type is - used. - pool : MemoryPool, optional - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - list_view_array : LargeListViewArray - - Examples - -------- - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> offsets = pa.array([0, 1, 2]) - >>> sizes = pa.array([2, 2, 2]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - [ - 2, - 3 - ], - [ - 3, - 4 - ] - ] - >>> # use a null mask to represent null values - >>> mask = pa.array([False, True, False]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - >>> # null values can be defined in either offsets or sizes arrays - >>> # WARNING: this will result in a copy of the offsets or sizes arrays - >>> offsets = pa.array([0, None, 2]) - >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) - - [ - [ - 1, - 2 - ], - null, - [ - 3, - 4 - ] - ] - """ - @property - def values(self) -> Array: - """ - Return the underlying array of values which backs the LargeListArray - ignoring the array's offset. - - The values array may be out of order and/or contain additional values - that are not found in the logical representation of the array. The only - guarantee is that each non-null value in the ListView Array is contiguous. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's order and offset. - - Returns - ------- - values : Array - - See Also - -------- - LargeListArray.flatten : ... - - Examples - -------- - - The values include null elements from sub-lists: - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 1, - 2 - ], - [], - [ - 2, - null, - 3, - 4 - ] - ] - >>> array.values - - [ - 1, - 2, - null, - 3, - 4 - ] - """ - @property - def offsets(self) -> Int64Array: - """ - Return the list view offsets as an int64 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListViewArray.from_arrays` and get back the - same list array if the original one has nulls. - - Returns - ------- - offsets : Int64Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets - - [ - 0, - 0, - 1 - ] - """ - @property - def sizes(self) -> Int64Array: - """ - Return the list view sizes as an int64 array. - - The returned array will not have a validity bitmap, so you cannot - expect to pass it to `LargeListViewArray.from_arrays` and get back the - same list array if the original one has nulls. 
- - Returns - ------- - sizes : Int64Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, None, 3, 4] - >>> offsets = [0, 0, 1] - >>> sizes = [2, 0, 4] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array.sizes - - [ - 2, - 0, - 4 - ] - """ - -class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): - @overload - @classmethod - def from_arrays( - cls, - values: Array[Scalar[_DataTypeT]], - *, - type: None = None, - mask: Mask | None = None, - ) -> FixedSizeListArray[_DataTypeT, None]: ... - @overload - @classmethod - def from_arrays( - cls, - values: Array[Scalar[_DataTypeT]], - limit_size: _Size, - *, - type: None = None, - mask: Mask | None = None, - ) -> FixedSizeListArray[_DataTypeT, _Size]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): - """ - Construct FixedSizeListArray from array of values and a list length. - - Parameters - ---------- - values : Array (any type) - list_size : int - The fixed length of the lists. - type : DataType, optional - If not specified, a default ListType with the values' type and - `list_size` length is used. - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - - Returns - ------- - FixedSizeListArray - - Examples - -------- - - Create from a values array and a list size: - - >>> import pyarrow as pa - >>> values = pa.array([1, 2, 3, 4]) - >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) - >>> arr - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - - Or create from a values array, list size and matching type: - - >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) - >>> arr - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - """ - @property - def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: - """ - Return the underlying array of values which backs the - FixedSizeListArray. - - Note even null elements are included. - - Compare with :meth:`flatten`, which returns only the non-null - sub-list values. - - Returns - ------- - values : Array - - See Also - -------- - FixedSizeListArray.flatten : ... - - Examples - -------- - >>> import pyarrow as pa - >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) - >>> array.values - - [ - 1, - 2, - null, - null, - 3, - null - ] - - """ - -_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) -_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) - -class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): - @overload - @classmethod - def from_arrays( - cls, - offsets: Int64Array, - keys: Array[Scalar[_MapKeyT]], - items: Array[Scalar[_MapItemT]], - *, - type: None = None, - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> MapArray[_MapKeyT, _MapItemT]: ... - @overload - @classmethod - def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] - cls, - offsets: Int64Array, - values: Array, - *, - type: MapType[_MapKeyT, _MapItemT], - pool: MemoryPool | None = None, - mask: Mask | None = None, - ) -> MapArray[_MapKeyT, _MapItemT]: ... - @classmethod - def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] - """ - Construct MapArray from arrays of int32 offsets and key, item arrays. 
- - Parameters - ---------- - offsets : array-like or sequence (int32 type) - keys : array-like or sequence (any type) - items : array-like or sequence (any type) - type : DataType, optional - If not specified, a default MapArray with the keys' and items' type is used. - pool : MemoryPool - mask : Array (boolean type), optional - Indicate which values are null (True) or not null (False). - - Returns - ------- - map_array : MapArray - - Examples - -------- - First, let's understand the structure of our dataset when viewed in a rectangular data model. - The total of 5 respondents answered the question "How much did you like the movie x?". - The value -1 in the integer array means that the value is missing. The boolean array - represents the null bitmask corresponding to the missing values in the integer array. - - >>> import pyarrow as pa - >>> movies_rectangular = np.ma.masked_array( - ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], - ... [ - ... [False, True, True], - ... [False, False, False], - ... [True, False, False], - ... [True, True, True], - ... [True, True, True], - ... ], - ... ) - - To represent the same data with the MapArray and from_arrays, the data is - formed like this: - - >>> offsets = [ - ... 0, # -- row 1 start - ... 1, # -- row 2 start - ... 4, # -- row 3 start - ... 6, # -- row 4 start - ... 6, # -- row 5 start - ... 6, # -- row 5 end - ... ] - >>> movies = [ - ... "Dark Knight", # ---------------------------------- row 1 - ... "Dark Knight", - ... "Meet the Parents", - ... "Superman", # -- row 2 - ... "Meet the Parents", - ... "Superman", # ----------------- row 3 - ... ] - >>> likings = [ - ... 10, # -------- row 1 - ... 8, - ... 4, - ... 5, # --- row 2 - ... 10, - ... 3, # ------ row 3 - ... ] - >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() - 0 [(Dark Knight, 10)] - 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... - 2 [(Meet the Parents, 10), (Superman, 3)] - 3 [] - 4 [] - dtype: object - - If the data in the empty rows needs to be marked as missing, it's possible - to do so by modifying the offsets argument, so that we specify `None` as - the starting positions of the rows we want marked as missing. The end row - offset still has to refer to the existing value from keys (and values): - - >>> offsets = [ - ... 0, # ----- row 1 start - ... 1, # ----- row 2 start - ... 4, # ----- row 3 start - ... None, # -- row 4 start - ... None, # -- row 5 start - ... 6, # ----- row 5 end - ... ] - >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() - 0 [(Dark Knight, 10)] - 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... - 2 [(Meet the Parents, 10), (Superman, 3)] - 3 None - 4 None - dtype: object - """ - @property - def keys(self) -> Array: - """Flattened array of keys across all maps in array""" - @property - def items(self) -> Array: - """Flattened array of items across all maps in array""" - -class UnionArray(Array[scalar.UnionScalar]): - @deprecated("Use fields() instead") - def child(self, pos: int) -> Field: - """ - DEPRECATED, use field() instead. - - Parameters - ---------- - pos : int - The physical index of the union child field (not its type code). - - Returns - ------- - field : pyarrow.Field - The given child field. - """ - def field(self, pos: int) -> Array: - """ - Return the given child field as an individual array. - - For sparse unions, the returned array has its offset, length, - and null count adjusted. - - For dense unions, the returned array is unchanged. 
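(Sketch of the flattened ``keys`` and ``items`` accessors on ``MapArray``.)

import pyarrow as pa

m = pa.array(
    [[("a", 1), ("b", 2)], [("c", 3)]],
    type=pa.map_(pa.string(), pa.int64()),
)
m.keys   # flattened keys across all maps:  ["a", "b", "c"]
m.items  # flattened items across all maps: [1, 2, 3]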
- - Parameters - ---------- - pos : int - The physical index of the union child field (not its type code). - - Returns - ------- - field : Array - The given child field. - """ - @property - def type_codes(self) -> Int8Array: - """Get the type codes array.""" - @property - def offsets(self) -> Int32Array: - """ - Get the value offsets array (dense arrays only). - - Does not account for any slice offset. - """ - @staticmethod - def from_dense( - type: Int8Array, - value_offsets: Int32Array, - children: NullableCollection[Array], - field_names: list[str] | None = None, - type_codes: Int8Array | None = None, - ) -> UnionArray: - """ - Construct dense UnionArray from arrays of int8 types, int32 offsets and - children arrays - - Parameters - ---------- - types : Array (int8 type) - value_offsets : Array (int32 type) - children : list - field_names : list - type_codes : list - - Returns - ------- - union_array : UnionArray - """ - @staticmethod - def from_sparse( - types: Int8Array, - children: NullableCollection[Array], - field_names: list[str] | None = None, - type_codes: Int8Array | None = None, - ) -> UnionArray: - """ - Construct sparse UnionArray from arrays of int8 types and children - arrays - - Parameters - ---------- - types : Array (int8 type) - children : list - field_names : list - type_codes : list - - Returns - ------- - union_array : UnionArray - """ - -class StringArray(Array[scalar.StringScalar]): - @staticmethod - def from_buffers( # type: ignore[override] - length: int, - value_offsets: Buffer, - data: Buffer, - null_bitmap: Buffer | None = None, - null_count: int | None = -1, - offset: int | None = 0, - ) -> StringArray: - """ - Construct a StringArray from value_offsets and data buffers. - If there are nulls in the data, also a null_bitmap and the matching - null_count must be passed. - - Parameters - ---------- - length : int - value_offsets : Buffer - data : Buffer - null_bitmap : Buffer, optional - null_count : int, default 0 - offset : int, default 0 - - Returns - ------- - string_array : StringArray - """ - -class LargeStringArray(Array[scalar.LargeStringScalar]): - @staticmethod - def from_buffers( # type: ignore[override] - length: int, - value_offsets: Buffer, - data: Buffer, - null_bitmap: Buffer | None = None, - null_count: int | None = -1, - offset: int | None = 0, - ) -> StringArray: - """ - Construct a LargeStringArray from value_offsets and data buffers. - If there are nulls in the data, also a null_bitmap and the matching - null_count must be passed. - - Parameters - ---------- - length : int - value_offsets : Buffer - data : Buffer - null_bitmap : Buffer, optional - null_count : int, default 0 - offset : int, default 0 - - Returns - ------- - string_array : StringArray - """ - -class StringViewArray(Array[scalar.StringViewScalar]): ... - -class BinaryArray(Array[scalar.BinaryScalar]): - @property - def total_values_length(self) -> int: - """ - The number of bytes from beginning to end of the data buffer addressed - by the offsets of this BinaryArray. - """ - -class LargeBinaryArray(Array[scalar.LargeBinaryScalar]): - @property - def total_values_length(self) -> int: - """ - The number of bytes from beginning to end of the data buffer addressed - by the offsets of this LargeBinaryArray. - """ - -class BinaryViewArray(Array[scalar.BinaryViewScalar]): ... - -class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]): - def dictionary_encode(self) -> Self: ... 
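(Sketch of constructing union arrays with ``from_sparse`` and ``from_dense``; the field names are arbitrary illustrative choices.)

import pyarrow as pa

types = pa.array([0, 1, 0], type=pa.int8())
sparse = pa.UnionArray.from_sparse(
    types,
    [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
    field_names=["num", "txt"],
)
sparse.field(0)  # child array, adjusted to the union's offset and length

offsets = pa.array([0, 0, 1], type=pa.int32())
dense = pa.UnionArray.from_dense(
    types, offsets, [pa.array([1, 2]), pa.array(["a"])]
)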
# type: ignore[override] - def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: - """ - Decodes the DictionaryArray to an Array. - """ - @property - def indices(self) -> Array[Scalar[_IndexT]]: ... - @property - def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... - @staticmethod - def from_buffers( # type: ignore[override] - type: _BasicValueT, - length: int, - buffers: list[Buffer], - dictionary: Array | np.ndarray | pd.Series, - null_count: int = -1, - offset: int = 0, - ) -> DictionaryArray[Any, _BasicValueT]: - """ - Construct a DictionaryArray from buffers. - - Parameters - ---------- - type : pyarrow.DataType - length : int - The number of values in the array. - buffers : List[Buffer] - The buffers backing the indices array. - dictionary : pyarrow.Array, ndarray or pandas.Series - The array of values referenced by the indices. - null_count : int, default -1 - The number of null entries in the indices array. Negative value means that - the null count is not known. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - - Returns - ------- - dict_array : DictionaryArray - """ - @staticmethod - def from_arrays( - indices: Indices, - dictionary: Array | np.ndarray | pd.Series, - mask: np.ndarray | pd.Series | BooleanArray | None = None, - ordered: bool = False, - from_pandas: bool = False, - safe: bool = True, - memory_pool: MemoryPool | None = None, - ) -> DictionaryArray: - """ - Construct a DictionaryArray from indices and values. - - Parameters - ---------- - indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type - Non-negative integers referencing the dictionary values by zero - based index. - dictionary : pyarrow.Array, ndarray or pandas.Series - The array of values referenced by the indices. - mask : ndarray or pandas.Series, bool type - True values indicate that indices are actually null. - ordered : bool, default False - Set to True if the category values are ordered. - from_pandas : bool, default False - If True, the indices should be treated as though they originated in - a pandas.Categorical (null encoded as -1). - safe : bool, default True - If True, check that the dictionary indices are in range. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise uses default pool. - - Returns - ------- - dict_array : DictionaryArray - """ - -class StructArray(Array[scalar.StructScalar]): - def field(self, index: int | str) -> Array: - """ - Retrieves the child array belonging to field. - - Parameters - ---------- - index : Union[int, str] - Index / position or name of the field. - - Returns - ------- - result : Array - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: - """ - Return one individual array for each field in the struct. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Returns - ------- - result : List[Array] - """ - @staticmethod - def from_arrays( - arrays: Iterable[Array], - names: list[str] | None = None, - fields: list[Field] | None = None, - mask=None, - memory_pool: MemoryPool | None = None, - type: types.StructType | None = None, - ) -> StructArray: - """ - Construct StructArray from collection of arrays representing - each field in the struct. - - Either field names, field instances or a struct type must be passed. 
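(Sketch of ``DictionaryArray.from_arrays`` and ``dictionary_decode``.)

import pyarrow as pa

indices = pa.array([0, 1, 0, None, 2])
dictionary = pa.array(["foo", "bar", "baz"])
darr = pa.DictionaryArray.from_arrays(indices, dictionary)
darr.indices              # [0, 1, 0, null, 2]
darr.dictionary           # ["foo", "bar", "baz"]
darr.dictionary_decode()  # materialize back to a plain StringArray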
- - Parameters - ---------- - arrays : sequence of Array - names : List[str] (optional) - Field names for each struct child. - fields : List[Field] (optional) - Field instances for each struct child. - mask : pyarrow.Array[bool] (optional) - Indicate which values are null (True) or not null (False). - memory_pool : MemoryPool (optional) - For memory allocations, if required, otherwise uses default pool. - type : pyarrow.StructType (optional) - Struct type for name and type of each child. - - Returns - ------- - result : StructArray - """ - def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: - """ - Sort the StructArray - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - by : str or None, default None - If to sort the array by one of its fields - or by the whole array. - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : StructArray - """ - -class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): - @overload - @staticmethod - def from_arrays( - run_ends: Int16Array, - values: Array, - type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... - @overload - @staticmethod - def from_arrays( - run_ends: Int32Array, - values: Array, - type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... - @overload - @staticmethod - def from_arrays( - run_ends: Int64Array, - values: Array, - type: DataType | None = None, - ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... - @staticmethod - def from_arrays(*args, **kwargs): - """ - Construct RunEndEncodedArray from run_ends and values arrays. - - Parameters - ---------- - run_ends : Array (int16, int32, or int64 type) - The run_ends array. - values : Array (any type) - The values array. - type : pyarrow.DataType, optional - The run_end_encoded(run_end_type, value_type) array type. - - Returns - ------- - RunEndEncodedArray - """ - @staticmethod - def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] - type: DataType, - length: int, - buffers: list[Buffer], - null_count: int = -1, - offset=0, - children: tuple[Array, Array] | None = None, - ) -> RunEndEncodedArray[Any, _BasicValueT]: - """ - Construct a RunEndEncodedArray from all the parameters that make up an - Array. - - RunEndEncodedArrays do not have buffers, only children arrays, but this - implementation is needed to satisfy the Array interface. - - Parameters - ---------- - type : DataType - The run_end_encoded(run_end_type, value_type) type. - length : int - The logical length of the run-end encoded array. Expected to match - the last value of the run_ends array (children[0]) minus the offset. - buffers : List[Buffer] - Empty List or [None]. - null_count : int, default -1 - The number of null entries in the array. Run-end encoded arrays - are specified to not have valid bits and null_count always equals 0. - offset : int, default 0 - The array's logical offset (in values, not in bytes) from the - start of each buffer. - children : List[Array] - Nested type children containing the run_ends and values arrays. - - Returns - ------- - RunEndEncodedArray - """ - @property - def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: - """ - An array holding the logical indexes of each run-end. - - The physical offset to the array is applied. 
- """ - @property - def values(self) -> Array[scalar.Scalar[_BasicValueT]]: - """ - An array holding the values of each run. - - The physical offset to the array is applied. - """ - def find_physical_offset(self) -> int: - """ - Find the physical offset of this REE array. - - This is the offset of the run that contains the value of the first - logical element of this array considering its offset. - - This function uses binary-search, so it has a O(log N) cost. - """ - def find_physical_length(self) -> int: - """ - Find the physical length of this REE array. - - The physical length of an REE is the number of physical values (and - run-ends) necessary to represent the logical range of values from offset - to length. - - This function uses binary-search, so it has a O(log N) cost. - """ - -_ArrayT = TypeVar("_ArrayT", bound=Array) - -class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): - @property - def storage(self) -> Any: ... - @staticmethod - def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: - """ - Construct ExtensionArray from type and storage array. - - Parameters - ---------- - typ : DataType - The extension type for the result array. - storage : Array - The underlying storage for the result array. - - Returns - ------- - ext_array : ExtensionArray - """ - -class JsonArray(ExtensionArray[_ArrayT]): - """ - Concrete class for Arrow arrays of JSON data type. - - This does not guarantee that the JSON data actually - is valid JSON. - - Examples - -------- - Define the extension type for JSON array - - >>> import pyarrow as pa - >>> json_type = pa.json_(pa.large_utf8()) - - Create an extension array - - >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] - >>> storage = pa.array(arr, pa.large_utf8()) - >>> pa.ExtensionArray.from_storage(json_type, storage) - - [ - null, - "{ "id":30, "values":["a", "b"] }" - ] - """ - -class UuidArray(ExtensionArray[_ArrayT]): ... - -class FixedShapeTensorArray(ExtensionArray[_ArrayT]): - """ - Concrete class for fixed shape tensor extension arrays. - - Examples - -------- - Define the extension type for tensor array - - >>> import pyarrow as pa - >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) - - Create an extension array - - >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] - >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) - >>> pa.ExtensionArray.from_storage(tensor_type, storage) - - [ - [ - 1, - 2, - 3, - 4 - ], - [ - 10, - 20, - 30, - 40 - ], - [ - 100, - 200, - 300, - 400 - ] - ] - """ - - def to_numpy_ndarray(self) -> np.ndarray: - """ - Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. - - The resulting ndarray will have (ndim + 1) dimensions. - The size of the first dimension will be the length of the fixed shape tensor array - and the rest of the dimensions will match the permuted shape of the fixed - shape tensor. - - The conversion is zero-copy. - - Returns - ------- - numpy.ndarray - Ndarray representing tensors in the fixed shape tensor array concatenated - along the first dimension. - """ - def to_tensor(self) -> Tensor: - """ - Convert fixed shape tensor extension array to a pyarrow.Tensor. - - The resulting Tensor will have (ndim + 1) dimensions. - The size of the first dimension will be the length of the fixed shape tensor array - and the rest of the dimensions will match the permuted shape of the fixed - shape tensor. - - The conversion is zero-copy. 
- - Returns - ------- - pyarrow.Tensor - Tensor representing tensors in the fixed shape tensor array concatenated - along the first dimension. - """ - - @classmethod - def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: - """ - Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. - The first dimension of ndarray will become the length of the fixed - shape tensor array. - If input array data is not contiguous a copy will be made. - - Parameters - ---------- - obj : numpy.ndarray - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - - [ - [ - 1, - 2, - 3, - 4, - 5, - 6 - ], - [ - 1, - 2, - 3, - 4, - 5, - 6 - ] - ] - """ - -class OpaqueArray(ExtensionArray[_ArrayT]): - """ - Concrete class for opaque extension arrays. - - Examples - -------- - Define the extension type for an opaque array - - >>> import pyarrow as pa - >>> opaque_type = pa.opaque( - ... pa.binary(), - ... type_name="geometry", - ... vendor_name="postgis", - ... ) - - Create an extension array - - >>> arr = [None, b"data"] - >>> storage = pa.array(arr, pa.binary()) - >>> pa.ExtensionArray.from_storage(opaque_type, storage) - - [ - null, - 64617461 - ] - """ - -class Bool8Array(ExtensionArray): - """ - Concrete class for bool8 extension arrays. - - Examples - -------- - Define the extension type for an bool8 array - - >>> import pyarrow as pa - >>> bool8_type = pa.bool8() - - Create an extension array - - >>> arr = [-1, 0, 1, 2, None] - >>> storage = pa.array(arr, pa.int8()) - >>> pa.ExtensionArray.from_storage(bool8_type, storage) - - [ - -1, - 0, - 1, - 2, - null - ] - """ - - def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: - """ - Return a NumPy bool view or copy of this array. - - By default, tries to return a view of this array. This is only - supported for arrays without any nulls. - - Parameters - ---------- - zero_copy_only : bool, default True - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls). - writable : bool, default False - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - - Returns - ------- - array : numpy.ndarray - """ - @classmethod - def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] - """ - Construct Bool8Array from Int8Array storage. - - Parameters - ---------- - storage : Int8Array - The underlying storage for the result array. - - Returns - ------- - bool8_array : Bool8Array - """ - @classmethod - def from_numpy(cls, obj: np.ndarray) -> Self: - """ - Convert numpy array to a bool8 extension array without making a copy. - The input array must be 1-dimensional, with either bool_ or int8 dtype. - - Parameters - ---------- - obj : numpy.ndarray - - Returns - ------- - bool8_array : Bool8Array - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = np.array([True, False, True], dtype=np.bool_) - >>> pa.Bool8Array.from_numpy(arr) - - [ - 1, - 0, - 1 - ] - """ - -def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: - """ - Concatenate the given arrays. - - The contents of the input arrays are copied into the returned array. 
- - Raises - ------ - ArrowInvalid - If not all of the arrays have the same type. - - Parameters - ---------- - arrays : iterable of pyarrow.Array - Arrays to concatenate, must be identically typed. - memory_pool : MemoryPool, default None - For memory allocations. If None, the default pool is used. - - Examples - -------- - >>> import pyarrow as pa - >>> arr1 = pa.array([2, 4, 5, 100]) - >>> arr2 = pa.array([2, 4]) - >>> pa.concat_arrays([arr1, arr2]) - - [ - 2, - 4, - 5, - 100, - 2, - 4 - ] - - """ - -def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: - """ - Create empty array of the given type. - """ - -__all__ = [ - "array", - "asarray", - "nulls", - "repeat", - "infer_type", - "_PandasConvertible", - "Array", - "NullArray", - "BooleanArray", - "NumericArray", - "IntegerArray", - "FloatingPointArray", - "Int8Array", - "UInt8Array", - "Int16Array", - "UInt16Array", - "Int32Array", - "UInt32Array", - "Int64Array", - "UInt64Array", - "Date32Array", - "Date64Array", - "TimestampArray", - "Time32Array", - "Time64Array", - "DurationArray", - "MonthDayNanoIntervalArray", - "HalfFloatArray", - "FloatArray", - "DoubleArray", - "FixedSizeBinaryArray", - "Decimal32Array", - "Decimal64Array", - "Decimal128Array", - "Decimal256Array", - "BaseListArray", - "ListArray", - "LargeListArray", - "ListViewArray", - "LargeListViewArray", - "FixedSizeListArray", - "MapArray", - "UnionArray", - "StringArray", - "LargeStringArray", - "StringViewArray", - "BinaryArray", - "LargeBinaryArray", - "BinaryViewArray", - "DictionaryArray", - "StructArray", - "RunEndEncodedArray", - "ExtensionArray", - "Bool8Array", - "UuidArray", - "JsonArray", - "OpaqueArray", - "FixedShapeTensorArray", - "concat_arrays", - "_empty_array", -] diff --git a/pyarrow-stubs/__lib_pxi/benchmark.pyi b/pyarrow-stubs/__lib_pxi/benchmark.pyi deleted file mode 100644 index 66981bf0f51..00000000000 --- a/pyarrow-stubs/__lib_pxi/benchmark.pyi +++ /dev/null @@ -1 +0,0 @@ -def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/pyarrow-stubs/__lib_pxi/builder.pyi b/pyarrow-stubs/__lib_pxi/builder.pyi deleted file mode 100644 index 4a0e9ca4708..00000000000 --- a/pyarrow-stubs/__lib_pxi/builder.pyi +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Iterable - -from pyarrow.lib import MemoryPool, _Weakrefable - -from .array import StringArray, StringViewArray - -class StringBuilder(_Weakrefable): - """ - Builder class for UTF8 strings. - - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string'). - """ - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ - @property - def null_count(self) -> int: ... - def __len__(self) -> int: ... 
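The StringBuilder interface above (append, append_values, finish, null_count, __len__) is easiest to see end to end. A minimal usage sketch, assuming the builder is reachable as pyarrow.lib.StringBuilder the way these stubs declare it:

    from pyarrow.lib import StringBuilder

    builder = StringBuilder()
    builder.append("foo")                    # single value
    builder.append(None)                     # null entry
    builder.append_values(["bar", b"baz"])   # bulk append from an iterable
    print(len(builder), builder.null_count)  # 4 values so far, 1 of them null
    arr = builder.finish()                   # returns a StringArray and resets the builder
    print(arr)

finish() hands back an ordinary StringArray, so the builder is mainly useful when values arrive incrementally and materializing an intermediate Python list would be wasteful. StringViewBuilder below exposes the same interface but finishes to a StringViewArray.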
- -class StringViewBuilder(_Weakrefable): - """ - Builder class for UTF8 string views. - - This class exposes facilities for incrementally adding string values and - building the null bitmap for a pyarrow.Array (type='string_view'). - """ - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def append(self, value: str | bytes | None): - """ - Append a single value to the builder. - - The value can either be a string/bytes object or a null value - (np.nan or None). - - Parameters - ---------- - value : string/bytes or np.nan/None - The value to append to the string array builder. - """ - def append_values(self, values: Iterable[str | bytes | None]): - """ - Append all the values from an iterable. - - Parameters - ---------- - values : iterable of string/bytes or np.nan/None values - The values to append to the string array builder. - """ - def finish(self) -> StringViewArray: - """ - Return result of builder as an Array object; also resets the builder. - - Returns - ------- - array : pyarrow.Array - """ - @property - def null_count(self) -> int: ... - def __len__(self) -> int: ... - -__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/pyarrow-stubs/__lib_pxi/compat.pyi b/pyarrow-stubs/__lib_pxi/compat.pyi deleted file mode 100644 index ae667be453e..00000000000 --- a/pyarrow-stubs/__lib_pxi/compat.pyi +++ /dev/null @@ -1,5 +0,0 @@ -def encode_file_path(path: str | bytes) -> bytes: ... -def tobytes(o: str | bytes) -> bytes: ... -def frombytes(o: bytes, *, safe: bool = False): ... - -__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/pyarrow-stubs/__lib_pxi/config.pyi b/pyarrow-stubs/__lib_pxi/config.pyi deleted file mode 100644 index 166e10c9734..00000000000 --- a/pyarrow-stubs/__lib_pxi/config.pyi +++ /dev/null @@ -1,41 +0,0 @@ -from typing import NamedTuple - -class VersionInfo(NamedTuple): - major: int - minor: int - patch: int - -class BuildInfo(NamedTuple): - version: str - version_info: VersionInfo - so_version: str - full_so_version: str - compiler_id: str - compiler_version: str - compiler_flags: str - git_id: str - git_description: str - package_kind: str - build_type: str - -class RuntimeInfo(NamedTuple): - simd_level: str - detected_simd_level: str - -cpp_build_info: BuildInfo -cpp_version: str -cpp_version_info: VersionInfo - -def runtime_info() -> RuntimeInfo: ... -def set_timezone_db_path(path: str) -> None: ... - -__all__ = [ - "VersionInfo", - "BuildInfo", - "RuntimeInfo", - "cpp_build_info", - "cpp_version", - "cpp_version_info", - "runtime_info", - "set_timezone_db_path", -] diff --git a/pyarrow-stubs/__lib_pxi/device.pyi b/pyarrow-stubs/__lib_pxi/device.pyi deleted file mode 100644 index d1b9f39eedd..00000000000 --- a/pyarrow-stubs/__lib_pxi/device.pyi +++ /dev/null @@ -1,88 +0,0 @@ -import enum - -from pyarrow.lib import _Weakrefable - -class DeviceAllocationType(enum.Flag): - CPU = enum.auto() - CUDA = enum.auto() - CUDA_HOST = enum.auto() - OPENCL = enum.auto() - VULKAN = enum.auto() - METAL = enum.auto() - VPI = enum.auto() - ROCM = enum.auto() - ROCM_HOST = enum.auto() - EXT_DEV = enum.auto() - CUDA_MANAGED = enum.auto() - ONEAPI = enum.auto() - WEBGPU = enum.auto() - HEXAGON = enum.auto() - -class Device(_Weakrefable): - """ - Abstract interface for hardware devices - - This object represents a device with access to some memory spaces. - When handling a Buffer or raw memory address, it allows deciding in which - context the raw memory address should be interpreted - (e.g. 
CPU-accessible memory, or embedded memory on some particular GPU). - """ - - @property - def type_name(self) -> str: - """ - A shorthand for this device's type. - """ - @property - def device_id(self) -> int: - """ - A device ID to identify this device if there are multiple of this type. - - If there is no "device_id" equivalent (such as for the main CPU device on - non-numa systems) returns -1. - """ - @property - def is_cpu(self) -> bool: - """ - Whether this device is the main CPU device. - - This shorthand method is very useful when deciding whether a memory address - is CPU-accessible. - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - Return the DeviceAllocationType of this device. - """ - -class MemoryManager(_Weakrefable): - """ - An object that provides memory management primitives. - - A MemoryManager is always tied to a particular Device instance. - It can also have additional parameters (such as a MemoryPool to - allocate CPU memory). - - """ - @property - def device(self) -> Device: - """ - The device this MemoryManager is tied to. - """ - @property - def is_cpu(self) -> bool: - """ - Whether this MemoryManager is tied to the main CPU device. - - This shorthand method is very useful when deciding whether a memory - address is CPU-accessible. - """ - -def default_cpu_memory_manager() -> MemoryManager: - """ - Return the default CPU MemoryManager instance. - - The returned singleton instance uses the default MemoryPool. - """ - -__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/pyarrow-stubs/__lib_pxi/error.pyi b/pyarrow-stubs/__lib_pxi/error.pyi deleted file mode 100644 index 981ed51e680..00000000000 --- a/pyarrow-stubs/__lib_pxi/error.pyi +++ /dev/null @@ -1,53 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -class ArrowException(Exception): ... -class ArrowInvalid(ValueError, ArrowException): ... -class ArrowMemoryError(MemoryError, ArrowException): ... -class ArrowKeyError(KeyError, ArrowException): ... -class ArrowTypeError(TypeError, ArrowException): ... -class ArrowNotImplementedError(NotImplementedError, ArrowException): ... -class ArrowCapacityError(ArrowException): ... -class ArrowIndexError(IndexError, ArrowException): ... -class ArrowSerializationError(ArrowException): ... - -class ArrowCancelled(ArrowException): - signum: int | None - def __init__(self, message: str, signum: int | None = None) -> None: ... - -ArrowIOError = IOError - -class StopToken: ... - -def enable_signal_handlers(enable: bool) -> None: ... - -have_signal_refcycle: bool - -class SignalStopHandler: - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... - def __dealloc__(self) -> None: ... - @property - def stop_token(self) -> StopToken: ... 
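Because each concrete Arrow error above also derives from a builtin exception (ArrowInvalid from ValueError, ArrowKeyError from KeyError, ArrowTypeError from TypeError, and so on), callers may catch either side of the hierarchy. A small sketch using concat_arrays, whose docstring earlier in this patch documents that it raises ArrowInvalid when the inputs are not identically typed:

    import pyarrow as pa

    try:
        pa.concat_arrays([pa.array([1, 2]), pa.array(["a", "b"])])
    except pa.ArrowInvalid as exc:
        # An `except ValueError` clause would have matched as well,
        # since ArrowInvalid subclasses ValueError.
        print("caught:", exc)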
- -__all__ = [ - "ArrowException", - "ArrowInvalid", - "ArrowMemoryError", - "ArrowKeyError", - "ArrowTypeError", - "ArrowNotImplementedError", - "ArrowCapacityError", - "ArrowIndexError", - "ArrowSerializationError", - "ArrowCancelled", - "ArrowIOError", - "StopToken", - "enable_signal_handlers", - "have_signal_refcycle", - "SignalStopHandler", -] diff --git a/pyarrow-stubs/__lib_pxi/io.pyi b/pyarrow-stubs/__lib_pxi/io.pyi deleted file mode 100644 index d882fd79d57..00000000000 --- a/pyarrow-stubs/__lib_pxi/io.pyi +++ /dev/null @@ -1,1474 +0,0 @@ -import sys - -from collections.abc import Callable -from io import IOBase - -from _typeshed import StrPath - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from typing import Any, Literal, SupportsIndex, overload - -from pyarrow._stubs_typing import Compression, SupportPyBuffer -from pyarrow.lib import MemoryPool, _Weakrefable - -from .device import Device, DeviceAllocationType, MemoryManager -from .types import KeyValueMetadata - -def have_libhdfs() -> bool: - """ - Return true if HDFS (HadoopFileSystem) library is set up correctly. - """ - -def io_thread_count() -> int: - """ - Return the number of threads to use for I/O operations. - - Many operations, such as scanning a dataset, will implicitly make - use of this pool. The number of threads is set to a fixed value at - startup. It can be modified at runtime by calling - :func:`set_io_thread_count()`. - - See Also - -------- - set_io_thread_count : Modify the size of this pool. - cpu_count : The analogous function for the CPU thread pool. - """ - -def set_io_thread_count(count: int) -> None: - """ - Set the number of threads to use for I/O operations. - - Many operations, such as scanning a dataset, will implicitly make - use of this pool. - - Parameters - ---------- - count : int - The max number of threads that may be used for I/O. - Must be positive. - - See Also - -------- - io_thread_count : Get the size of this pool. - set_cpu_count : The analogous function for the CPU thread pool. - """ - -Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] - -class NativeFile(_Weakrefable): - """ - The base class for all Arrow streams. - - Streams are either readable, writable, or both. - They optionally support seeking. - - While this class exposes methods to read or write data from Python, the - primary intent of using a Arrow stream is to pass it to other Arrow - facilities that will make use of it, such as Arrow IPC routines. - - Be aware that there are subtle differences with regular Python files, - e.g. destroying a writable Arrow stream without closing it explicitly - will not flush any pending data. - """ - - _default_chunk_size: int - - def __enter__(self) -> Self: ... - def __exit__(self, *args) -> None: ... - @property - def mode(self) -> Mode: - """ - The file mode. Currently instances of NativeFile may support: - - * rb: binary read - * wb: binary write - * rb+: binary read and write - * ab: binary append - """ - def readable(self) -> bool: ... - def seekable(self) -> bool: ... - def isatty(self) -> bool: ... - def fileno(self) -> int: ... - @property - def closed(self) -> bool: ... - def close(self) -> None: ... 
- def size(self) -> int: - """ - Return file size - """ - def metadata(self) -> KeyValueMetadata: - """ - Return file metadata - """ - def tell(self) -> int: - """ - Return current stream position - """ - def seek(self, position: int, whence: int = 0) -> int: - """ - Change current file stream position - - Parameters - ---------- - position : int - Byte offset, interpreted relative to value of whence argument - whence : int, default 0 - Point of reference for seek offset - - Notes - ----- - Values of whence: - * 0 -- start of stream (the default); offset should be zero or positive - * 1 -- current stream position; offset may be negative - * 2 -- end of stream; offset is usually negative - - Returns - ------- - int - The new absolute stream position. - """ - def flush(self) -> None: - """ - Flush the stream, if applicable. - - An error is raised if stream is not writable. - """ - def write(self, data: bytes | SupportPyBuffer) -> int: - """ - Write data to the file. - - Parameters - ---------- - data : bytes-like object or exporter of buffer protocol - - Returns - ------- - int - nbytes: number of bytes written - """ - def read(self, nbytes: int | None = None) -> bytes: - """ - Read and return up to n bytes. - - If *nbytes* is None, then the entire remaining file contents are read. - - Parameters - ---------- - nbytes : int, default None - - Returns - ------- - data : bytes - """ - def get_stream(self, file_offset: int, nbytes: int) -> Self: - """ - Return an input stream that reads a file segment independent of the - state of the file. - - Allows reading portions of a random access file as an input stream - without interfering with each other. - - Parameters - ---------- - file_offset : int - nbytes : int - - Returns - ------- - stream : NativeFile - """ - def read_at(self) -> bytes: - """ - Read indicated number of bytes at offset from the file - - Parameters - ---------- - nbytes : int - offset : int - - Returns - ------- - data : bytes - """ - def read1(self) -> bytes: - """Read and return up to n bytes. - - Unlike read(), if *nbytes* is None then a chunk is read, not the - entire file. - - Parameters - ---------- - nbytes : int, default None - The maximum number of bytes to read. - - Returns - ------- - data : bytes - """ - def readall(self) -> bytes: ... - def readinto(self, b: SupportPyBuffer) -> int: - """ - Read into the supplied buffer - - Parameters - ---------- - b : buffer-like object - A writable buffer object (such as a bytearray). - - Returns - ------- - written : int - number of bytes written - """ - - def readline(self, size: int | None = None) -> bytes: - """Read and return a line of bytes from the file. - - If size is specified, read at most size bytes. - - Line terminator is always b"\\n". - - Parameters - ---------- - size : int - maximum number of bytes read - """ - def readlines(self, hint: int | None = None) -> list[bytes]: - """Read lines of the file - - Parameters - ---------- - hint : int - maximum number of bytes read until we stop - """ - def __iter__(self) -> Self: ... - def __next__(self) -> bytes: ... - def read_buffer(self, nbytes: int | None = None) -> Buffer: - """ - Read from buffer. - - Parameters - ---------- - nbytes : int, optional - maximum number of bytes read - """ - def truncate(self) -> None: ... - def writelines(self, lines: list[bytes]): - """ - Write lines to the file. 
- - Parameters - ---------- - lines : iterable - Iterable of bytes-like objects or exporters of buffer protocol - """ - def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: - """ - Read this file completely to a local path or destination stream. - - This method first seeks to the beginning of the file. - - Parameters - ---------- - stream_or_path : str or file-like object - If a string, a local file path to write to; otherwise, - should be a writable stream. - buffer_size : int, optional - The buffer size to use for data transfers. - """ - def upload(self, stream: IOBase, buffer_size: int | None) -> None: - """ - Write from a source stream to this file. - - Parameters - ---------- - stream : file-like object - Source stream to pipe to this file. - buffer_size : int, optional - The buffer size to use for data transfers. - """ - -# ---------------------------------------------------------------------- -# Python file-like objects - -class PythonFile(NativeFile): - """ - A stream backed by a Python file object. - - This class allows using Python file objects with arbitrary Arrow - functions, including functions written in another language than Python. - - As a downside, there is a non-zero redirection cost in translating - Arrow stream calls to Python method calls. Furthermore, Python's - Global Interpreter Lock may limit parallelism in some situations. - - Examples - -------- - >>> import io - >>> import pyarrow as pa - >>> pa.PythonFile(io.BytesIO()) - - - Create a stream for writing: - - >>> buf = io.BytesIO() - >>> f = pa.PythonFile(buf, mode="w") - >>> f.writable() - True - >>> f.write(b"PythonFile") - 10 - >>> buf.getvalue() - b'PythonFile' - >>> f.close() - >>> f - - - Create a stream for reading: - - >>> buf = io.BytesIO(b"PythonFile") - >>> f = pa.PythonFile(buf, mode="r") - >>> f.mode - 'rb' - >>> f.read() - b'PythonFile' - >>> f - - >>> f.close() - >>> f - - """ - def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... - def truncate(self, pos: int | None = None) -> None: - """ - Parameters - ---------- - pos : int, optional - """ - -class MemoryMappedFile(NativeFile): - """ - A stream that represents a memory-mapped file. - - Supports 'r', 'r+', 'w' modes. - - Examples - -------- - Create a new file with memory map: - - >>> import pyarrow as pa - >>> mmap = pa.create_memory_map("example_mmap.dat", 10) - >>> mmap - - >>> mmap.close() - - Open an existing file with memory map: - - >>> with pa.memory_map("example_mmap.dat") as mmap: - ... mmap - - """ - @classmethod - def create(cls, path: str, size: int) -> Self: - """ - Create a MemoryMappedFile - - Parameters - ---------- - path : str - Where to create the file. - size : int - Size of the memory mapped file. - """ - def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... - def resize(self, new_size: int) -> None: - """ - Resize the map and underlying file. - - Parameters - ---------- - new_size : new size in bytes - """ - -def memory_map( - path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" -) -> MemoryMappedFile: - """ - Open memory map at file path. Size of the memory map cannot change. - - Parameters - ---------- - path : str - mode : {'r', 'r+', 'w'}, default 'r' - Whether the file is opened for reading ('r'), writing ('w') - or both ('r+'). 
- - Returns - ------- - mmap : MemoryMappedFile - - Examples - -------- - Reading from a memory map without any memory allocation or copying: - - >>> import pyarrow as pa - >>> with pa.output_stream("example_mmap.txt") as stream: - ... stream.write(b"Constructing a buffer referencing the mapped memory") - 51 - >>> with pa.memory_map("example_mmap.txt") as mmap: - ... mmap.read_at(6, 45) - b'memory' - """ - -create_memory_map = MemoryMappedFile.create - -class OSFile(NativeFile): - """ - A stream backed by a regular file descriptor. - - Examples - -------- - Create a new file to write to: - - >>> import pyarrow as pa - >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: - ... f.writable() - ... f.write(b"OSFile") - ... f.seekable() - True - 6 - False - - Open the file to read: - - >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: - ... f.mode - ... f.read() - 'rb' - b'OSFile' - - Open the file to append: - - >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: - ... f.mode - ... f.write(b" is super!") - 'ab' - 10 - >>> with pa.OSFile("example_osfile.arrow") as f: - ... f.read() - b'OSFile is super!' - - Inspect created OSFile: - - >>> pa.OSFile("example_osfile.arrow") - - """ - def __init__( - self, - path: str, - mode: Literal["r", "rb", "w", "wb", "a", "ab"], - memory_pool: MemoryPool | None = None, - ) -> None: ... - -class FixedSizeBufferWriter(NativeFile): - """ - A stream writing to a Arrow buffer. - - Examples - -------- - Create a stream to write to ``pyarrow.Buffer``: - - >>> import pyarrow as pa - >>> buf = pa.allocate_buffer(5) - >>> with pa.output_stream(buf) as stream: - ... stream.write(b"abcde") - ... stream - 5 - - - Inspect the buffer: - - >>> buf.to_pybytes() - b'abcde' - >>> buf - - """ - def __init__(self, buffer: Buffer) -> None: ... - def set_memcopy_threads(self, num_threads: int) -> None: ... - def set_memcopy_blocksize(self, blocksize: int) -> None: ... - def set_memcopy_threshold(self, threshold: int) -> None: ... - -# ---------------------------------------------------------------------- -# Arrow buffers - -class Buffer(_Weakrefable): - """ - The base class for all Arrow buffers. - - A buffer represents a contiguous memory area. Many buffers will own - their memory, though not all of them do. - """ - def __len__(self) -> int: ... - def _assert_cpu(self) -> None: ... - @property - def size(self) -> int: - """ - The buffer size in bytes. - """ - @property - def address(self) -> int: - """ - The buffer's address, as an integer. - - The returned address may point to CPU or device memory. - Use `is_cpu()` to disambiguate. - """ - def hex(self) -> bytes: - """ - Compute hexadecimal representation of the buffer. - - Returns - ------- - : bytes - """ - @property - def is_mutable(self) -> bool: - """ - Whether the buffer is mutable. - """ - @property - def is_cpu(self) -> bool: - """ - Whether the buffer is CPU-accessible. - """ - @property - def device(self) -> Device: - """ - The device where the buffer resides. - - Returns - ------- - Device - """ - @property - def memory_manager(self) -> MemoryManager: - """ - The memory manager associated with the buffer. - - Returns - ------- - MemoryManager - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the buffer resides. - - Returns - ------- - DeviceAllocationType - """ - @property - def parent(self) -> Buffer | None: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - @overload - def __getitem__(self, key: int) -> int: ... 
- def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Slice this buffer. Memory is not copied. - - You can also use the Python slice notation ``buffer[start:stop]``. - - Parameters - ---------- - offset : int, default 0 - Offset from start of buffer to slice. - length : int, default None - Length of slice (default is until end of Buffer starting from - offset). - - Returns - ------- - sliced : Buffer - A logical view over this buffer. - """ - def equals(self, other: Self) -> bool: - """ - Determine if two buffers contain exactly the same data. - - Parameters - ---------- - other : Buffer - - Returns - ------- - are_equal : bool - True if buffer contents and size are equal - """ - def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... - def to_pybytes(self) -> bytes: - """ - Return this buffer as a Python bytes object. Memory is copied. - """ - def __buffer__(self, flags: int, /) -> memoryview: ... - -class ResizableBuffer(Buffer): - """ - A base class for buffers that can be resized. - """ - - def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: - """ - Resize buffer to indicated size. - - Parameters - ---------- - new_size : int - New size of buffer (padding may be added internally). - shrink_to_fit : bool, default False - If this is true, the buffer is shrunk when new_size is less - than the current size. - If this is false, the buffer is never shrunk. - """ - -@overload -def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... -@overload -def allocate_buffer( - size: int, memory_pool: MemoryPool | None, resizable: Literal[False] -) -> Buffer: ... -@overload -def allocate_buffer( - size: int, memory_pool: MemoryPool | None, resizable: Literal[True] -) -> ResizableBuffer: ... -def allocate_buffer(*args, **kwargs): - """ - Allocate a mutable buffer. - - Parameters - ---------- - size : int - Number of bytes to allocate (plus internal padding) - memory_pool : MemoryPool, optional - The pool to allocate memory from. - If not given, the default memory pool is used. - resizable : bool, default False - If true, the returned buffer is resizable. - - Returns - ------- - buffer : Buffer or ResizableBuffer - """ - -# ---------------------------------------------------------------------- -# Arrow Stream -class BufferOutputStream(NativeFile): - """ - An output stream that writes to a resizable buffer. - - The buffer is produced as a result when ``getvalue()`` is called. - - Examples - -------- - Create an output stream, write data to it and finalize it with - ``getvalue()``: - - >>> import pyarrow as pa - >>> f = pa.BufferOutputStream() - >>> f.write(b"pyarrow.Buffer") - 14 - >>> f.closed - False - >>> f.getvalue() - - >>> f.closed - True - """ - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def getvalue(self) -> Buffer: - """ - Finalize output stream and return result as pyarrow.Buffer. - - Returns - ------- - value : Buffer - """ - -class MockOutputStream(NativeFile): ... - -class BufferReader(NativeFile): - """ - Zero-copy reader from objects convertible to Arrow buffer. - - Parameters - ---------- - obj : Python bytes or pyarrow.Buffer - - Examples - -------- - Create an Arrow input stream and inspect it: - - >>> import pyarrow as pa - >>> data = b"reader data" - >>> buf = memoryview(data) - >>> with pa.input_stream(buf) as stream: - ... stream.size() - ... stream.read(6) - ... stream.seek(7) - ... 
stream.read(15) - 11 - b'reader' - 7 - b'data' - """ - def __init__(self, obj) -> None: ... - -class CompressedInputStream(NativeFile): - """ - An input stream wrapper which decompresses data on the fly. - - Parameters - ---------- - stream : string, path, pyarrow.NativeFile, or file-like object - Input stream object to wrap with the compression. - compression : str - The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). - - Examples - -------- - Create an output stream which compresses the data: - - >>> import pyarrow as pa - >>> data = b"Compressed stream" - >>> raw = pa.BufferOutputStream() - >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: - ... compressed.write(data) - 17 - - Create an input stream with decompression referencing the - buffer with compressed data: - - >>> cdata = raw.getvalue() - >>> with pa.input_stream(cdata, compression="gzip") as compressed: - ... compressed.read() - b'Compressed stream' - - which actually translates to the use of ``BufferReader``and - ``CompressedInputStream``: - - >>> raw = pa.BufferReader(cdata) - >>> with pa.CompressedInputStream(raw, "gzip") as compressed: - ... compressed.read() - b'Compressed stream' - """ - - def __init__( - self, - stream: StrPath | NativeFile | IOBase, - compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... - -class CompressedOutputStream(NativeFile): - """ - An output stream wrapper which compresses data on the fly. - - Parameters - ---------- - stream : string, path, pyarrow.NativeFile, or file-like object - Input stream object to wrap with the compression. - compression : str - The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). - - Examples - -------- - Create an output stream which compresses the data: - - >>> import pyarrow as pa - >>> data = b"Compressed stream" - >>> raw = pa.BufferOutputStream() - >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: - ... compressed.write(data) - 17 - """ - def __init__( - self, - stream: StrPath | NativeFile | IOBase, - compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], - ) -> None: ... - -class BufferedInputStream(NativeFile): - """ - An input stream that performs buffered reads from - an unbuffered input stream, which can mitigate the overhead - of many small reads in some cases. - - Parameters - ---------- - stream : NativeFile - The input stream to wrap with the buffer - buffer_size : int - Size of the temporary read buffer. - memory_pool : MemoryPool - The memory pool used to allocate the buffer. - """ - def __init__( - self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: ... - def detach(self) -> NativeFile: - """ - Release the raw InputStream. - Further operations on this stream are invalid. - - Returns - ------- - raw : NativeFile - The underlying raw input stream - """ - -class BufferedOutputStream(NativeFile): - """ - An output stream that performs buffered reads from - an unbuffered output stream, which can mitigate the overhead - of many small writes in some cases. - - Parameters - ---------- - stream : NativeFile - The writable output stream to wrap with the buffer - buffer_size : int - Size of the buffer that should be added. - memory_pool : MemoryPool - The memory pool used to allocate the buffer. - """ - def __init__( - self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None - ) -> None: ... - def detach(self) -> NativeFile: - """ - Flush any buffered writes and release the raw OutputStream. 
- Further operations on this stream are invalid. - - Returns - ------- - raw : NativeFile - The underlying raw output stream. - """ - -class TransformInputStream(NativeFile): - """ - Transform an input stream. - - Parameters - ---------- - stream : NativeFile - The stream to transform. - transform_func : callable - The transformation to apply. - """ - def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... - -class Transcoder: - def __init__(self, decoder, encoder) -> None: ... - def __call__(self, buf: Buffer): ... - -def transcoding_input_stream( - stream: NativeFile, src_encoding: str, dest_encoding: str -) -> TransformInputStream: - """ - Add a transcoding transformation to the stream. - Incoming data will be decoded according to ``src_encoding`` and - then re-encoded according to ``dest_encoding``. - - Parameters - ---------- - stream : NativeFile - The stream to which the transformation should be applied. - src_encoding : str - The codec to use when reading data. - dest_encoding : str - The codec to use for emitted data. - """ - -def py_buffer(obj: SupportPyBuffer) -> Buffer: - """ - Construct an Arrow buffer from a Python bytes-like or buffer-like object - - Parameters - ---------- - obj : object - the object from which the buffer should be constructed. - """ - -def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: - """ - Construct an Arrow buffer with the given *address* and *size*. - - The buffer will be optionally backed by the Python *base* object, if given. - The *base* object will be kept alive as long as this buffer is alive, - including across language boundaries (for example if the buffer is - referenced by C++ code). - - Parameters - ---------- - address : int - The starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - The size of device buffer in bytes. - base : {None, object} - Object that owns the referenced memory. - """ - -def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... - -# --------------------------------------------------------------------- - -class CacheOptions(_Weakrefable): - """ - Cache options for a pre-buffered fragment scan. - - Parameters - ---------- - hole_size_limit : int, default 8KiB - The maximum distance in bytes between two consecutive ranges; beyond - this value, ranges are not combined. - range_size_limit : int, default 32MiB - The maximum size in bytes of a combined range; if combining two - consecutive ranges would produce a range of a size greater than this, - they are not combined - lazy : bool, default True - lazy = false: request all byte ranges when PreBuffer or WillNeed is called. - lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader - needs them. - lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the - range that is currently being read. - prefetch_limit : int, default 0 - The maximum number of ranges to be prefetched. This is only used for - lazy cache to asynchronously read some ranges after reading the target - range. - """ - - hole_size_limit: int - range_size_limit: int - lazy: bool - prefetch_limit: int - def __init__( - self, - *, - hole_size_limit: int | None = None, - range_size_limit: int | None = None, - lazy: bool = True, - prefetch_limit: int = 0, - ) -> None: ... 
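All four constructor parameters above are keyword-only. A brief sketch of setting them explicitly; referring to the class as pa.CacheOptions is an assumption here, the declaration itself lives in pyarrow.lib per these stubs:

    import pyarrow as pa

    # Coalesce reads that are at most 4 KiB apart, cap merged ranges at 16 MiB,
    # and prefetch up to two merged ranges ahead of the one currently being read.
    opts = pa.CacheOptions(
        hole_size_limit=4 * 1024,
        range_size_limit=16 * 1024 * 1024,
        lazy=True,
        prefetch_limit=2,
    )
    print(opts.lazy, opts.prefetch_limit)

from_network_metrics below derives suitable values from measured latency and bandwidth instead of hand tuning.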
- @classmethod - def from_network_metrics( - cls, - time_to_first_byte_millis: int, - transfer_bandwidth_mib_per_sec: int, - ideal_bandwidth_utilization_frac: float = 0.9, - max_ideal_request_size_mib: int = 64, - ) -> Self: - """ - Create suitable CacheOptions based on provided network metrics. - - Typically this will be used with object storage solutions like Amazon S3, - Google Cloud Storage and Azure Blob Storage. - - Parameters - ---------- - time_to_first_byte_millis : int - Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call - setup latency of a new read request. The value is a positive integer. - transfer_bandwidth_mib_per_sec : int - Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive - integer. - ideal_bandwidth_utilization_frac : int, default 0.9 - Transfer bandwidth utilization fraction (per connection) to maximize the net - data load. The value is a positive float less than 1. - max_ideal_request_size_mib : int, default 64 - The maximum single data request size (in MiB) to maximize the net data load. - - Returns - ------- - CacheOptions - """ - -class Codec(_Weakrefable): - """ - Compression codec. - - Parameters - ---------- - compression : str - Type of compression codec to initialize, valid values are: 'gzip', - 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and - 'snappy'. - compression_level : int, None - Optional parameter specifying how aggressively to compress. The - possible ranges and effect of this parameter depend on the specific - codec chosen. Higher values compress more but typically use more - resources (CPU/RAM). Some codecs support negative values. - - gzip - The compression_level maps to the memlevel parameter of - deflateInit2. Higher levels use more RAM but are faster - and should have higher compression ratios. - - bz2 - The compression level maps to the blockSize100k parameter of - the BZ2_bzCompressInit function. Higher levels use more RAM - but are faster and should have higher compression ratios. - - brotli - The compression level maps to the BROTLI_PARAM_QUALITY - parameter. Higher values are slower and should have higher - compression ratios. - - lz4/lz4_frame/lz4_raw - The compression level parameter is not supported and must - be None - - zstd - The compression level maps to the compressionLevel parameter - of ZSTD_initCStream. Negative values are supported. Higher - values are slower and should have higher compression ratios. - - snappy - The compression level parameter is not supported and must - be None - - - Raises - ------ - ValueError - If invalid compression value is passed. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.Codec.is_available("gzip") - True - >>> codec = pa.Codec("gzip") - >>> codec.name - 'gzip' - >>> codec.compression_level - 9 - """ - def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... - @classmethod - def detect(cls, path: StrPath) -> Self: - """ - Detect and instantiate compression codec based on file extension. - - Parameters - ---------- - path : str, path-like - File-path to detect compression from. - - Raises - ------ - TypeError - If the passed value is not path-like. - ValueError - If the compression can't be detected from the path. - - Returns - ------- - Codec - """ - @staticmethod - def is_available(compression: Compression) -> bool: - """ - Returns whether the compression support has been built and enabled. 
- - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - - Returns - ------- - bool - """ - @staticmethod - def supports_compression_level(compression: Compression) -> int: - """ - Returns true if the compression level parameter is supported - for the given codec. - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ - @staticmethod - def default_compression_level(compression: Compression) -> int: - """ - Returns the compression level that Arrow will use for the codec if - None is specified. - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ - @staticmethod - def minimum_compression_level(compression: Compression) -> int: - """ - Returns the smallest valid value for the compression level - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ - @staticmethod - def maximum_compression_level(compression: Compression) -> int: - """ - Returns the largest valid value for the compression level - - Parameters - ---------- - compression : str - Type of compression codec, - refer to Codec docstring for a list of supported ones. - """ - @property - def name(self) -> Compression: - """Returns the name of the codec""" - @property - def compression_level(self) -> int: - """Returns the compression level parameter of the codec""" - @overload - def compress( - self, - buf: Buffer | bytes | SupportPyBuffer, - *, - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def compress( - self, - buf: Buffer | bytes | SupportPyBuffer, - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def compress( - self, - buf: Buffer | bytes | SupportPyBuffer, - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, - ) -> bytes: ... - def compress(self, *args, **kwargs): - """ - Compress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any - - Returns - ------- - compressed : pyarrow.Buffer or bytes (if asbytes=True) - """ - @overload - def decompress( - self, - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def decompress( - self, - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, - ) -> Buffer: ... - @overload - def decompress( - self, - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, - ) -> bytes: ... - def decompress(self, *args, **kwargs): - """ - Decompress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or memoryview-compatible object - decompressed_size : int, default None - Size of the decompressed result - asbytes : boolean, default False - Return result as Python bytes object, otherwise Buffer - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. 
- - Returns - ------- - uncompressed : pyarrow.Buffer or bytes (if asbytes=True) - """ - -@overload -def compress( - buf: Buffer | bytes | SupportPyBuffer, - codec: Compression = "lz4", - *, - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def compress( - buf: Buffer | bytes | SupportPyBuffer, - codec: Compression = "lz4", - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def compress( - buf: Buffer | bytes | SupportPyBuffer, - codec: Compression = "lz4", - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, -) -> bytes: ... -def compress(*args, **kwargs): - """ - Compress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol - codec : str, default 'lz4' - Compression codec. - Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer. - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. - - Returns - ------- - compressed : pyarrow.Buffer or bytes (if asbytes=True) - """ - -@overload -def decompress( - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - codec: Compression = "lz4", - *, - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def decompress( - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - codec: Compression = "lz4", - *, - asbytes: Literal[False], - memory_pool: MemoryPool | None = None, -) -> Buffer: ... -@overload -def decompress( - buf: Buffer | bytes | SupportPyBuffer, - decompressed_size: int | None = None, - codec: Compression = "lz4", - *, - asbytes: Literal[True], - memory_pool: MemoryPool | None = None, -) -> bytes: ... -def decompress(*args, **kwargs): - """ - Decompress data from buffer-like object. - - Parameters - ---------- - buf : pyarrow.Buffer, bytes, or memoryview-compatible object - Input object to decompress data from. - decompressed_size : int, default None - Size of the decompressed result - codec : str, default 'lz4' - Compression codec. - Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} - asbytes : bool, default False - Return result as Python bytes object, otherwise Buffer. - memory_pool : MemoryPool, default None - Memory pool to use for buffer allocations, if any. - - Returns - ------- - uncompressed : pyarrow.Buffer or bytes (if asbytes=True) - """ - -def input_stream( - source: StrPath | Buffer | IOBase, - compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", - buffer_size: int | None = None, -) -> BufferReader: - """ - Create an Arrow input stream. - - Parameters - ---------- - source : str, Path, buffer, or file-like object - The source to open for reading. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly decompression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. - Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size : int, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary read buffer. 
- - Examples - -------- - Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: - - >>> import pyarrow as pa - >>> buf = memoryview(b"some data") - >>> with pa.input_stream(buf) as stream: - ... stream.read(4) - b'some' - - Create a readable OSFile (NativeFile) from a string or file path: - - >>> import gzip - >>> with gzip.open("example.gz", "wb") as f: - ... f.write(b"some data") - 9 - >>> with pa.input_stream("example.gz") as stream: - ... stream.read() - b'some data' - - Create a readable PythonFile (NativeFile) from a a Python file object: - - >>> with open("example.txt", mode="w") as f: - ... f.write("some text") - 9 - >>> with pa.input_stream("example.txt") as stream: - ... stream.read(6) - b'some t' - """ - -def output_stream( - source: StrPath | Buffer | IOBase, - compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", - buffer_size: int | None = None, -) -> NativeFile: - """ - Create an Arrow output stream. - - Parameters - ---------- - source : str, Path, buffer, file-like object - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. - Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size : int, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - - Examples - -------- - Create a writable NativeFile from a pyarrow Buffer: - - >>> import pyarrow as pa - >>> data = b"buffer data" - >>> empty_obj = bytearray(11) - >>> buf = pa.py_buffer(empty_obj) - >>> with pa.output_stream(buf) as stream: - ... stream.write(data) - 11 - >>> with pa.input_stream(buf) as stream: - ... stream.read(6) - b'buffer' - - or from a memoryview object: - - >>> buf = memoryview(empty_obj) - >>> with pa.output_stream(buf) as stream: - ... stream.write(data) - 11 - >>> with pa.input_stream(buf) as stream: - ... stream.read() - b'buffer data' - - Create a writable NativeFile from a string or file path: - - >>> with pa.output_stream("example_second.txt") as stream: - ... stream.write(b"Write some data") - 15 - >>> with pa.input_stream("example_second.txt") as stream: - ... 
stream.read() - b'Write some data' - """ - -__all__ = [ - "have_libhdfs", - "io_thread_count", - "set_io_thread_count", - "NativeFile", - "PythonFile", - "MemoryMappedFile", - "memory_map", - "create_memory_map", - "OSFile", - "FixedSizeBufferWriter", - "Buffer", - "ResizableBuffer", - "allocate_buffer", - "BufferOutputStream", - "MockOutputStream", - "BufferReader", - "CompressedInputStream", - "CompressedOutputStream", - "BufferedInputStream", - "BufferedOutputStream", - "TransformInputStream", - "Transcoder", - "transcoding_input_stream", - "py_buffer", - "foreign_buffer", - "as_buffer", - "CacheOptions", - "Codec", - "compress", - "decompress", - "input_stream", - "output_stream", -] diff --git a/pyarrow-stubs/__lib_pxi/ipc.pyi b/pyarrow-stubs/__lib_pxi/ipc.pyi deleted file mode 100644 index 3d72892061e..00000000000 --- a/pyarrow-stubs/__lib_pxi/ipc.pyi +++ /dev/null @@ -1,705 +0,0 @@ -import enum -import sys - -from io import IOBase - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Iterable, Iterator, Literal, Mapping, NamedTuple - -import pandas as pd - -from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer -from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable - -from .io import Buffer, Codec, NativeFile -from .types import DictionaryMemo, KeyValueMetadata - -class MetadataVersion(enum.IntEnum): - V1 = enum.auto() - V2 = enum.auto() - V3 = enum.auto() - V4 = enum.auto() - V5 = enum.auto() - -class WriteStats(NamedTuple): - """IPC write statistics - - Parameters - ---------- - num_messages : int - Number of messages. - num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ - - num_messages: int - num_record_batches: int - num_dictionary_batches: int - num_dictionary_deltas: int - num_replaced_dictionaries: int - -class ReadStats(NamedTuple): - """IPC read statistics - - Parameters - ---------- - num_messages : int - Number of messages. - num_record_batches : int - Number of record batches. - num_dictionary_batches : int - Number of dictionary batches. - num_dictionary_deltas : int - Delta of dictionaries. - num_replaced_dictionaries : int - Number of replaced dictionaries. - """ - - num_messages: int - num_record_batches: int - num_dictionary_batches: int - num_dictionary_deltas: int - num_replaced_dictionaries: int - -class IpcReadOptions(_Weakrefable): - """ - Serialization options for reading IPC format. - - Parameters - ---------- - ensure_native_endian : bool, default True - Whether to convert incoming data to platform-native endianness. - use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like decompression - included_fields : list - If empty (the default), return all deserialized fields. - If non-empty, the values are the indices of fields to read on - the top-level schema - """ - - ensure_native_endian: bool - use_threads: bool - included_fields: list[int] - def __init__( - self, - *, - ensure_native_endian: bool = True, - use_threads: bool = True, - included_fields: list[int] | None = None, - ) -> None: ... - -class IpcWriteOptions(_Weakrefable): - """ - Serialization options for the IPC format. 
- - Parameters - ---------- - metadata_version : MetadataVersion, default MetadataVersion.V5 - The metadata version to write. V5 is the current and latest, - V4 is the pre-1.0 metadata version (with incompatible Union layout). - allow_64bit : bool, default False - If true, allow field lengths that don't fit in a signed 32-bit int. - use_legacy_format : bool, default False - Whether to use the pre-Arrow 0.15 IPC format. - compression : str, Codec, or None - compression codec to use for record batch buffers. - If None then batch buffers will be uncompressed. - Must be "lz4", "zstd" or None. - To specify a compression_level use `pyarrow.Codec` - use_threads : bool - Whether to use the global CPU thread pool to parallelize any - computational tasks like compression. - emit_dictionary_deltas : bool - Whether to emit dictionary deltas. Default is false for maximum - stream compatibility. - unify_dictionaries : bool - If true then calls to write_table will attempt to unify dictionaries - across all batches in the table. This can help avoid the need for - replacement dictionaries (which the file format does not support) - but requires computing the unified dictionary and then remapping - the indices arrays. - - This parameter is ignored when writing to the IPC stream format as - the IPC stream format can support replacement dictionaries. - """ - - metadata_version: MetadataVersion - allow_64bit: bool - use_legacy_format: bool - compression: Codec | Literal["lz4", "zstd"] | None - use_threads: bool - emit_dictionary_deltas: bool - unify_dictionaries: bool - def __init__( - self, - *, - metadata_version: MetadataVersion = MetadataVersion.V5, - allow_64bit: bool = False, - use_legacy_format: bool = False, - compression: Codec | Literal["lz4", "zstd"] | None = None, - use_threads: bool = True, - emit_dictionary_deltas: bool = False, - unify_dictionaries: bool = False, - ) -> None: ... - -class Message(_Weakrefable): - """ - Container for an Arrow IPC message with metadata and optional body - """ - - @property - def type(self) -> str: ... - @property - def metadata(self) -> Buffer: ... - @property - def metadata_version(self) -> MetadataVersion: ... - @property - def body(self) -> Buffer | None: ... - def equals(self, other: Message) -> bool: ... - def serialize_to( - self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None - ): - """ - Write message to generic OutputStream - - Parameters - ---------- - sink : NativeFile - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - """ - def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write message as encapsulated IPC message - - Parameters - ---------- - alignment : int, default 8 - Byte alignment for metadata and body - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - """ - -class MessageReader(_Weakrefable): - """ - Interface for reading Message objects from some source (like an - InputStream) - """ - @classmethod - def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: - """ - Open stream from source, if you want to use memory map use - MemoryMappedFile as source. - - Parameters - ---------- - source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object - A readable source, like an InputStream - """ - def __iter__(self) -> Self: ... 
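IpcWriteOptions, documented above, is the knob for per-batch buffer compression when a writer is created; a small sketch, assuming the zstd codec is enabled in the local Arrow build and using made-up column data:

import pyarrow as pa
import pyarrow.ipc as ipc

schema = pa.schema([("x", pa.int64())])
opts = ipc.IpcWriteOptions(compression="zstd")   # record batch buffers compressed
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, schema, options=opts) as writer:
    writer.write_batch(pa.record_batch([pa.array([1, 2, 3])], schema=schema))
ipc_buffer = sink.getvalue()                     # pyarrow.Buffer holding the stream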
- def read_next_message(self) -> Message: - """ - Read next Message from the stream. - - Raises - ------ - StopIteration - At end of stream - """ - __next__ = read_next_message - -# ---------------------------------------------------------------------- -# File and stream readers and writers - -class _CRecordBatchWriter(_Weakrefable): - """The base RecordBatchWriter wrapper. - - Provides common implementations of convenience methods. Should not - be instantiated directly by user code. - """ - def write(self, table_or_batch: Table | RecordBatch): - """ - Write RecordBatch or Table to stream. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - """ - def write_batch( - self, - batch: RecordBatch, - custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, - ): - """ - Write RecordBatch to stream. - - Parameters - ---------- - batch : RecordBatch - custom_metadata : mapping or KeyValueMetadata - Keys and values must be string-like / coercible to bytes - """ - def write_table(self, table: Table, max_chunksize: int | None = None) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - @property - def stats(self) -> WriteStats: - """ - Current IPC write statistics. - """ - -class _RecordBatchStreamWriter(_CRecordBatchWriter): - def __dealloc__(self) -> None: ... - def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... - -class _ReadPandasMixin: - def read_pandas(self, **options) -> pd.DataFrame: - """ - Read contents of stream to a pandas.DataFrame. - - Read all record batches as a pyarrow.Table then convert it to a - pandas.DataFrame using Table.to_pandas. - - Parameters - ---------- - **options - Arguments to forward to :meth:`Table.to_pandas`. - - Returns - ------- - df : pandas.DataFrame - """ - -class RecordBatchReader(_Weakrefable): - """Base class for reading stream of record batches. - - Record batch readers function as iterators of record batches that also - provide the schema (without the need to get any batches). - - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatchReader.from_*`` functions instead. - - Notes - ----- - To import and export using the Arrow C stream interface, use the - ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this - interface is intended for expert users. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([("x", pa.int64())]) - >>> def iter_record_batches(): - ... for i in range(2): - ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) - >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) - >>> print(reader.schema) - x: int64 - >>> for batch in reader: - ... print(batch) - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - pyarrow.RecordBatch - x: int64 - ---- - x: [1,2,3] - """ - - def __iter__(self) -> Self: ... - def read_next_batch(self) -> RecordBatch: - """ - Read next RecordBatch from the stream. - - Raises - ------ - StopIteration: - At end of stream. 
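Putting the writer and reader halves documented here together, an in-memory stream round trip through the public pyarrow.ipc entry points might look like this sketch (column names and values are made up):

import pyarrow as pa
import pyarrow.ipc as ipc

schema = pa.schema([("n_legs", pa.int64())])
batch = pa.record_batch([pa.array([2, 4, 100])], schema=schema)

sink = pa.BufferOutputStream()
with ipc.new_stream(sink, schema) as writer:
    writer.write_batch(batch)

reader = ipc.open_stream(sink.getvalue())
print(reader.schema)          # shared schema of all batches in the stream
for got in reader:            # the reader is iterable
    assert got.equals(batch)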
- - Returns - ------- - RecordBatch - """ - __next__ = read_next_batch - @property - def schema(self) -> Schema: - """ - Shared schema of the record batches in the stream. - - Returns - ------- - Schema - """ - def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: - """ - Read next RecordBatch from the stream along with its custom metadata. - - Raises - ------ - StopIteration: - At end of stream. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - def iter_batches_with_custom_metadata( - self, - ) -> Iterator[RecordBatchWithMetadata]: - """ - Iterate over record batches from the stream along with their custom - metadata. - - Yields - ------ - RecordBatchWithMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table. - - Returns - ------- - Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def close(self) -> None: - """ - Release any resources associated with the reader. - """ - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - def cast(self, target_schema: Schema) -> Self: - """ - Wrap this reader with one that casts each batch lazily as it is pulled. - Currently only a safe cast to target_schema is implemented. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - - Returns - ------- - RecordBatchReader - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowArrayStream struct, given its pointer. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - Be careful: if you don't pass the ArrowArrayStream struct to a - consumer, array memory will leak. This is a low-level function - intended for expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream struct, - given its pointer. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArrayStream struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export to a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. - """ - @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import RecordBatchReader from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - RecordBatchReader - """ - @classmethod - def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: - """ - Create RecordBatchReader from a Arrow-compatible stream object. - - This accepts objects implementing the Arrow PyCapsule Protocol for - streams, i.e. objects that have a ``__arrow_c_stream__`` method. - - Parameters - ---------- - data : Arrow-compatible stream object - Any object that implements the Arrow PyCapsule Protocol for - streams. - schema : Schema, default None - The schema to which the stream should be casted, if supported - by the stream object. 
- - Returns - ------- - RecordBatchReader - """ - @classmethod - def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: - """ - Create RecordBatchReader from an iterable of batches. - - Parameters - ---------- - schema : Schema - The shared schema of the record batches - batches : Iterable[RecordBatch] - The batches that this reader will return. - - Returns - ------- - reader : RecordBatchReader - """ - -class _RecordBatchStreamReader(RecordBatchReader): - @property - def stats(self) -> ReadStats: - """ - Current IPC read statistics. - """ - -class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... - -class RecordBatchWithMetadata(NamedTuple): - """RecordBatch with its custom metadata - - Parameters - ---------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - - batch: RecordBatch - custom_metadata: KeyValueMetadata - -class _RecordBatchFileReader(_Weakrefable): - @property - def num_record_batches(self) -> int: - """ - The number of record batches in the IPC file. - """ - def get_batch(self, i: int) -> RecordBatch: - """ - Read the record batch with the given index. - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - """ - get_record_batch = get_batch - def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: - """ - Read the record batch with the given index along with - its custom metadata - - Parameters - ---------- - i : int - The index of the record batch in the IPC file. - - Returns - ------- - batch : RecordBatch - custom_metadata : KeyValueMetadata - """ - def read_all(self) -> Table: - """ - Read all record batches as a pyarrow.Table - """ - read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_val, exc_tb): ... - @property - def schema(self) -> Schema: ... - @property - def stats(self) -> ReadStats: ... - -def get_tensor_size(tensor: Tensor) -> int: - """ - Return total size of serialized Tensor including metadata and padding. - - Parameters - ---------- - tensor : Tensor - The tensor for which we want to known the size. - """ - -def get_record_batch_size(batch: RecordBatch) -> int: - """ - Return total size of serialized RecordBatch including metadata and padding. - - Parameters - ---------- - batch : RecordBatch - The recordbatch for which we want to know the size. - """ - -def write_tensor(tensor: Tensor, dest: NativeFile) -> int: - """ - Write pyarrow.Tensor to pyarrow.NativeFile object its current position. - - Parameters - ---------- - tensor : pyarrow.Tensor - dest : pyarrow.NativeFile - - Returns - ------- - bytes_written : int - Total number of bytes written to the file - """ - -def read_tensor(source: NativeFile) -> Tensor: - """Read pyarrow.Tensor from pyarrow.NativeFile object from current - position. If the file source supports zero copy (e.g. a memory map), then - this operation does not allocate any memory. 
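The file reader surface above corresponds to the random-access IPC file format; a sketch using the public new_file / open_file helpers with an in-memory sink and made-up data:

import pyarrow as pa
import pyarrow.ipc as ipc

schema = pa.schema([("x", pa.int64())])
sink = pa.BufferOutputStream()
with ipc.new_file(sink, schema) as writer:
    writer.write_batch(pa.record_batch([pa.array([1, 2])], schema=schema))
    writer.write_batch(pa.record_batch([pa.array([3, 4])], schema=schema))

reader = ipc.open_file(sink.getvalue())
reader.num_record_batches     # 2, and batches can be fetched by index
first = reader.get_batch(0)
everything = reader.read_all()  # concatenate all batches into a Table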
This function not assume that - the stream is aligned - - Parameters - ---------- - source : pyarrow.NativeFile - - Returns - ------- - tensor : Tensor - - """ - -def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: - """ - Read length-prefixed message from file or buffer-like object - - Parameters - ---------- - source : pyarrow.NativeFile, file-like object, or buffer-like object - - Returns - ------- - message : Message - """ - -def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: - """ - Read Schema from message or buffer - - Parameters - ---------- - obj : buffer or Message - dictionary_memo : DictionaryMemo, optional - Needed to be able to reconstruct dictionary-encoded fields - with read_record_batch - - Returns - ------- - schema : Schema - """ - -def read_record_batch( - obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None -) -> RecordBatch: - """ - Read RecordBatch from message, given a known schema. If reading data from a - complete IPC stream, use ipc.open_stream instead - - Parameters - ---------- - obj : Message or Buffer-like - schema : Schema - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - - Returns - ------- - batch : RecordBatch - """ - -__all__ = [ - "MetadataVersion", - "WriteStats", - "ReadStats", - "IpcReadOptions", - "IpcWriteOptions", - "Message", - "MessageReader", - "_CRecordBatchWriter", - "_RecordBatchStreamWriter", - "_ReadPandasMixin", - "RecordBatchReader", - "_RecordBatchStreamReader", - "_RecordBatchFileWriter", - "RecordBatchWithMetadata", - "_RecordBatchFileReader", - "get_tensor_size", - "get_record_batch_size", - "write_tensor", - "read_tensor", - "read_message", - "read_schema", - "read_record_batch", -] diff --git a/pyarrow-stubs/__lib_pxi/memory.pyi b/pyarrow-stubs/__lib_pxi/memory.pyi deleted file mode 100644 index 57a3bb4f1b3..00000000000 --- a/pyarrow-stubs/__lib_pxi/memory.pyi +++ /dev/null @@ -1,174 +0,0 @@ -from pyarrow.lib import _Weakrefable - -class MemoryPool(_Weakrefable): - """ - Base class for memory allocation. - - Besides tracking its number of allocated bytes, a memory pool also - takes care of the required 64-byte alignment for Arrow data. - """ - - def release_unused(self) -> None: - """ - Attempt to return to the OS any memory being held onto by the pool. - - This function should not be called except potentially for - benchmarking or debugging as it could be expensive and detrimental to - performance. - - This is best effort and may not have any effect on some memory pools - or in some situations (e.g. fragmentation). - """ - def bytes_allocated(self) -> int: - """ - Return the number of bytes that are currently allocated from this - memory pool. - """ - def total_bytes_allocated(self) -> int: - """ - Return the total number of bytes that have been allocated from this - memory pool. - """ - def max_memory(self) -> int | None: - """ - Return the peak memory allocation in this memory pool. - This can be an approximate number in multi-threaded applications. - - None is returned if the pool implementation doesn't know how to - compute this number. - """ - def num_allocations(self) -> int: - """ - Return the number of allocations or reallocations that were made - using this memory pool. - """ - def print_stats(self) -> None: - """ - Print statistics about this memory pool. - - The output format is implementation-specific. 
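The MemoryPool accessors described here need no special setup to try out; a small sketch (the allocation size is arbitrary, and max_memory may be None for pools that do not track a peak):

import pyarrow as pa

pool = pa.default_memory_pool()
print(pool.backend_name)               # e.g. "jemalloc" or "mimalloc", build dependent
buf = pa.allocate_buffer(1 << 20, memory_pool=pool)  # 1 MiB scratch buffer
print(pool.bytes_allocated())          # includes the buffer while it is alive
print(pool.max_memory())               # peak allocation, or None if unknown
del buf
pool.release_unused()                  # best effort, may be a no-op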
Not all memory pools - implement this method. - """ - @property - def backend_name(self) -> str: - """ - The name of the backend used by this MemoryPool (e.g. "jemalloc"). - """ - -class LoggingMemoryPool(MemoryPool): ... -class ProxyMemoryPool(MemoryPool): ... - -def default_memory_pool() -> MemoryPool: - """ - Return the process-global memory pool. - - Examples - -------- - >>> default_memory_pool() - - """ - -def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: - """ - Create and return a MemoryPool instance that redirects to the - *parent*, but with separate allocation statistics. - - Parameters - ---------- - parent : MemoryPool - The real memory pool that should be used for allocations. - """ - -def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: - """ - Create and return a MemoryPool instance that redirects to the - *parent*, but also dumps allocation logs on stderr. - - Parameters - ---------- - parent : MemoryPool - The real memory pool that should be used for allocations. - """ - -def system_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the C malloc heap. - """ - -def jemalloc_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the jemalloc heap. - - NotImplementedError is raised if jemalloc support is not enabled. - """ - -def mimalloc_memory_pool() -> MemoryPool: - """ - Return a memory pool based on the mimalloc heap. - - NotImplementedError is raised if mimalloc support is not enabled. - """ - -def set_memory_pool(pool: MemoryPool) -> None: - """ - Set the default memory pool. - - Parameters - ---------- - pool : MemoryPool - The memory pool that should be used by default. - """ - -def log_memory_allocations(enable: bool = True) -> None: - """ - Enable or disable memory allocator logging for debugging purposes - - Parameters - ---------- - enable : bool, default True - Pass False to disable logging - """ - -def total_allocated_bytes() -> int: - """ - Return the currently allocated bytes from the default memory pool. - Other memory pools may not be accounted for. - """ - -def jemalloc_set_decay_ms(decay_ms: int) -> None: - """ - Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of - milliseconds. A value of 0 (the default) results in dirty / muzzy memory - pages being released right away to the OS, while a higher value will result - in a time-based decay. See the jemalloc docs for more information - - It's best to set this at the start of your application. - - Parameters - ---------- - decay_ms : int - Number of milliseconds to set for jemalloc decay conf parameters. 
Note - that this change will only affect future memory arenas - """ - -def supported_memory_backends() -> list[str]: - """ - Return a list of available memory pool backends - """ - -__all__ = [ - "MemoryPool", - "LoggingMemoryPool", - "ProxyMemoryPool", - "default_memory_pool", - "proxy_memory_pool", - "logging_memory_pool", - "system_memory_pool", - "jemalloc_memory_pool", - "mimalloc_memory_pool", - "set_memory_pool", - "log_memory_allocations", - "total_allocated_bytes", - "jemalloc_set_decay_ms", - "supported_memory_backends", -] diff --git a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi b/pyarrow-stubs/__lib_pxi/pandas_shim.pyi deleted file mode 100644 index 0e80fae4ebf..00000000000 --- a/pyarrow-stubs/__lib_pxi/pandas_shim.pyi +++ /dev/null @@ -1,51 +0,0 @@ -from types import ModuleType -from typing import Any, Iterable, TypeGuard - -import pandas as pd - -from numpy import dtype -from pandas.core.dtypes.base import ExtensionDtype - -class _PandasAPIShim: - has_sparse: bool - - def series(self, *args, **kwargs) -> pd.Series: ... - def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... - @property - def have_pandas(self) -> bool: ... - @property - def compat(self) -> ModuleType: ... - @property - def pd(self) -> ModuleType: ... - def infer_dtype(self, obj: Iterable) -> str: ... - def pandas_dtype(self, dtype: str) -> dtype: ... - @property - def loose_version(self) -> Any: ... - @property - def version(self) -> str: ... - def is_v1(self) -> bool: ... - def is_ge_v21(self) -> bool: ... - def is_ge_v23(self) -> bool: ... - def is_ge_v3(self) -> bool: ... - @property - def categorical_type(self) -> type[pd.Categorical]: ... - @property - def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... - @property - def extension_dtype(self) -> type[ExtensionDtype]: ... - def is_array_like( - self, obj: Any - ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... - def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... - def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... - def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... - def is_sparse(self, obj: Any) -> bool: ... - def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... - def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... - def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... - def get_values(self, obj: Any) -> bool: ... - def get_rangeindex_attribute(self, level, name): ... - -_pandas_api: _PandasAPIShim - -__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/pyarrow-stubs/__lib_pxi/scalar.pyi b/pyarrow-stubs/__lib_pxi/scalar.pyi deleted file mode 100644 index 81ab5012067..00000000000 --- a/pyarrow-stubs/__lib_pxi/scalar.pyi +++ /dev/null @@ -1,1017 +0,0 @@ -import collections.abc -import datetime as dt -import sys - -from decimal import Decimal - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias -from typing import Any, Generic, Iterator, Literal, Mapping, overload - -import numpy as np - -from pyarrow._compute import CastOptions -from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable -from typing_extensions import Protocol, TypeVar - -from . 
import types -from .types import ( - _AsPyType, - _DataTypeT, - _Time32Unit, - _Time64Unit, - _Tz, - _Unit, -) - -_AsPyTypeK = TypeVar("_AsPyTypeK") -_AsPyTypeV = TypeVar("_AsPyTypeV") -_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) - -class Scalar(_Weakrefable, Generic[_DataType_co]): - """ - The base class for scalars. - """ - @property - def type(self) -> _DataType_co: - """ - Data type of the Scalar object. - """ - @property - def is_valid(self) -> bool: - """ - Holds a valid (non-null) value. - """ - @overload - def cast( - self, - target_type: None, - safe: bool = True, - options: CastOptions | None = None, - memory_pool: MemoryPool | None = None, - ) -> Self: ... - @overload - def cast( - self, - target_type: _DataTypeT, - safe: bool = True, - options: CastOptions | None = None, - memory_pool: MemoryPool | None = None, - ) -> Scalar[_DataTypeT]: ... - def cast(self, *args, **kwargs): - """ - Cast scalar value to another data type. - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, default None - Type to cast scalar to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Returns - ------- - scalar : A Scalar of the given target data type. - """ - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def equals(self, other: Scalar) -> bool: ... - def __hash__(self) -> int: ... - @overload - def as_py( - self: Scalar[types._BasicDataType[_AsPyType]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> _AsPyType: ... - @overload - def as_py( - self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType]: ... - @overload - def as_py( - self: Scalar[ - types.ListType[ - types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] - ] - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[int, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[ - types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[Any, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[int, Any]]: ... - @overload - def as_py( - self: Scalar[types.StructType], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[dict[str, Any]]: ... - @overload - def as_py( - self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] - ], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... 
- @overload - def as_py( - self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[Any, _AsPyTypeV]]: ... - @overload - def as_py( - self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[tuple[_AsPyTypeK, Any]]: ... - @overload - def as_py( - self: Scalar[Any], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> Any: ... - def as_py(self, *args, **kwargs): - """ - Return this value as a Python representation. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - """ - -_NULL: TypeAlias = None -NA = _NULL - -class NullScalar(Scalar[types.NullType]): ... -class BooleanScalar(Scalar[types.BoolType]): ... -class UInt8Scalar(Scalar[types.UInt8Type]): ... -class Int8Scalar(Scalar[types.Int8Type]): ... -class UInt16Scalar(Scalar[types.UInt16Type]): ... -class Int16Scalar(Scalar[types.Int16Type]): ... -class UInt32Scalar(Scalar[types.Uint32Type]): ... -class Int32Scalar(Scalar[types.Int32Type]): ... -class UInt64Scalar(Scalar[types.UInt64Type]): ... -class Int64Scalar(Scalar[types.Int64Type]): ... -class HalfFloatScalar(Scalar[types.Float16Type]): ... -class FloatScalar(Scalar[types.Float32Type]): ... -class DoubleScalar(Scalar[types.Float64Type]): ... -class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... -class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... -class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... -class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... -class Date32Scalar(Scalar[types.Date32Type]): ... - -class Date64Scalar(Scalar[types.Date64Type]): - @property - def value(self) -> dt.date | None: ... - -class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): - @property - def value(self) -> dt.time | None: ... - -class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): - @property - def value(self) -> dt.time | None: ... - -class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): - @property - def value(self) -> int | None: ... - -class DurationScalar(Scalar[types.DurationType[_Unit]]): - @property - def value(self) -> dt.timedelta | None: ... - -class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): - @property - def value(self) -> MonthDayNano | None: ... - -class BinaryScalar(Scalar[types.BinaryType]): - def as_buffer(self) -> Buffer: ... - -class LargeBinaryScalar(Scalar[types.LargeBinaryType]): - def as_buffer(self) -> Buffer: ... - -class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): - def as_buffer(self) -> Buffer: ... - -class StringScalar(Scalar[types.StringType]): - def as_buffer(self) -> Buffer: ... - -class LargeStringScalar(Scalar[types.LargeStringType]): - def as_buffer(self) -> Buffer: ... 
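The maps_as_pydicts option documented on Scalar.as_py above changes how map values come back to Python; a tiny sketch, assuming a pyarrow version recent enough to have the keyword:

import pyarrow as pa

m = pa.scalar([("a", 1), ("b", 2)], type=pa.map_(pa.string(), pa.int64()))
m.as_py()                              # [('a', 1), ('b', 2)], an association list
m.as_py(maps_as_pydicts="strict")      # {'a': 1, 'b': 2}; raises on duplicate keys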
- -class BinaryViewScalar(Scalar[types.BinaryViewType]): - def as_buffer(self) -> Buffer: ... - -class StringViewScalar(Scalar[types.StringViewType]): - def as_buffer(self) -> Buffer: ... - -class ListScalar(Scalar[types.ListType[_DataTypeT]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... - -class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... - -class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... - -class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... - -class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... - def __iter__(self) -> Iterator[Array]: ... - -class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[str]: ... - def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] - def _as_py_tuple(self) -> list[tuple[str, Any]]: ... - -class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): - @property - def values(self) -> Array | None: ... - def __len__(self) -> int: ... - def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... - @overload - def __iter__( - self: Scalar[ - types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] - ], - ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... - @overload - def __iter__( - self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], - ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... - @overload - def __iter__( - self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], - ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... - -class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): - @property - def index(self) -> Scalar[types._IndexT]: ... - @property - def value(self) -> Scalar[types._BasicValueT]: ... - @property - def dictionary(self) -> Array: ... - -class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): - @property - def value(self) -> tuple[int, types._BasicValueT] | None: ... - -class UnionScalar(Scalar[types.UnionType]): - @property - def value(self) -> Any | None: ... - @property - def type_code(self) -> str: ... - -class ExtensionScalar(Scalar[types.ExtensionType]): - @property - def value(self) -> Any | None: ... - @staticmethod - def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: - """ - Construct ExtensionScalar from type and storage value. - - Parameters - ---------- - typ : DataType - The extension type for the result scalar. - value : object - The storage value for the result scalar. 
- - Returns - ------- - ext_scalar : ExtensionScalar - """ - -class Bool8Scalar(Scalar[types.Bool8Type]): ... -class UuidScalar(Scalar[types.UuidType]): ... -class JsonScalar(Scalar[types.JsonType]): ... -class OpaqueScalar(Scalar[types.OpaqueType]): ... - -class FixedShapeTensorScalar(ExtensionScalar): - def to_numpy(self) -> np.ndarray: - """ - Convert fixed shape tensor scalar to a numpy.ndarray. - - The resulting ndarray's shape matches the permuted shape of the - fixed shape tensor scalar. - The conversion is zero-copy. - - Returns - ------- - numpy.ndarray - """ - def to_tensor(self) -> Tensor: - """ - Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape - and strides derived from corresponding FixedShapeTensorType. - - The conversion is zero-copy. - - Returns - ------- - pyarrow.Tensor - Tensor represented stored in FixedShapeTensorScalar. - """ - -_V = TypeVar("_V") - -class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] - def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... - def __len__(self) -> int: ... - def __contains__(self, item: Any, /) -> bool: ... - -@overload -def scalar( - value: str, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringScalar: ... -@overload -def scalar( - value: bytes, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryScalar: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: bool, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BooleanScalar: ... -@overload -def scalar( - value: int, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int64Scalar: ... -@overload -def scalar( - value: float, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DoubleScalar: ... -@overload -def scalar( - value: Decimal, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal128Scalar: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: dt.datetime, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> TimestampScalar[Literal["us"]]: ... -@overload -def scalar( - value: dt.date, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date32Scalar: ... -@overload -def scalar( - value: dt.time, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time64Scalar[Literal["us"]]: ... -@overload -def scalar( - value: dt.timedelta, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DurationScalar[Literal["us"]]: ... -@overload -def scalar( # pyright: ignore[reportOverlappingOverload] - value: MonthDayNano, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalScalar: ... -@overload -def scalar( - value: Mapping[str, Any], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StructScalar: ... -@overload -def scalar( - value: NullableCollection[str], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.StringType]]: ... -@overload -def scalar( - value: NullableCollection[bytes], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.BinaryType]]: ... 
-@overload -def scalar( - value: NullableCollection[bool], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.BoolType]]: ... -@overload -def scalar( - value: NullableCollection[int], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Int64Type]]: ... -@overload -def scalar( - value: NullableCollection[float], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Float64Type]]: ... -@overload -def scalar( - value: NullableCollection[Decimal], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Decimal32Type]]: ... -@overload -def scalar( - value: NullableCollection[dt.datetime], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[dt.date], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Date32Type]]: ... -@overload -def scalar( - value: NullableCollection[dt.time], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[dt.timedelta], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... -@overload -def scalar( - value: NullableCollection[MonthDayNano], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... -@overload -def scalar( - value: NullableCollection[Any], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[Any]: ... -@overload -def scalar( - value: Any, - type: types.NullType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> NullScalar: ... -@overload -def scalar( - value: Any, - type: types.BoolType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BooleanScalar: ... -@overload -def scalar( - value: Any, - type: types.UInt8Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt8Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int8Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int8Scalar: ... -@overload -def scalar( - value: Any, - type: types.UInt16Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt16Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int16Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int16Scalar: ... -@overload -def scalar( - value: Any, - type: types.Uint32Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt32Scalar: ... -@overload -def scalar( - value: Any, - type: types.Int32Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int32Scalar: ... -@overload -def scalar( - value: Any, - type: types.UInt64Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UInt64Scalar: ... 
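The long chain of scalar() overloads above exists so that an explicit type= argument narrows the returned Scalar subclass; at runtime the same calls look like this small sketch:

import pyarrow as pa

pa.scalar(42)                          # Int64Scalar inferred from a Python int
pa.scalar(42, type=pa.uint8())         # UInt8Scalar via an explicit type
pa.scalar("text", type=pa.large_string())
s = pa.scalar([1, 2], type=pa.list_(pa.int16()))
s.as_py()                              # [1, 2]
s.type                                 # list<item: int16>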
-@overload -def scalar( - value: Any, - type: types.Int64Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Int64Scalar: ... -@overload -def scalar( - value: Any, - type: types.Float16Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> HalfFloatScalar: ... -@overload -def scalar( - value: Any, - type: types.Float32Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> FloatScalar: ... -@overload -def scalar( - value: Any, - type: types.Float64Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DoubleScalar: ... -@overload -def scalar( - value: Any, - type: types.Date32Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date32Scalar: ... -@overload -def scalar( - value: Any, - type: types.Date64Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Date64Scalar: ... -@overload -def scalar( - value: Any, - type: types.MonthDayNanoIntervalType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MonthDayNanoIntervalScalar: ... -@overload -def scalar( - value: Any, - type: types.StringType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringScalar: ... -@overload -def scalar( - value: Any, - type: types.LargeStringType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeStringScalar: ... -@overload -def scalar( - value: Any, - type: types.StringViewType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StringViewScalar: ... -@overload -def scalar( - value: Any, - type: types.BinaryType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryScalar: ... -@overload -def scalar( - value: Any, - type: types.LargeBinaryType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeBinaryScalar: ... -@overload -def scalar( - value: Any, - type: types.BinaryViewType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> BinaryViewScalar: ... -@overload -def scalar( - value: Any, - type: types.TimestampType[types._Unit, types._Tz], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> TimestampScalar[types._Unit, types._Tz]: ... -@overload -def scalar( - value: Any, - type: types.Time32Type[types._Time32Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time32Scalar[types._Time32Unit]: ... -@overload -def scalar( - value: Any, - type: types.Time64Type[types._Time64Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Time64Scalar[types._Time64Unit]: ... -@overload -def scalar( - value: Any, - type: types.DurationType[types._Unit], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DurationScalar[types._Unit]: ... -@overload -def scalar( - value: Any, - type: types.Decimal32Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal32Scalar[types._Precision, types._Scale]: ... 
-@overload -def scalar( - value: Any, - type: types.Decimal64Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal64Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.Decimal128Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal128Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.Decimal256Type[types._Precision, types._Scale], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Decimal256Scalar[types._Precision, types._Scale]: ... -@overload -def scalar( - value: Any, - type: types.ListType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.LargeListType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeListScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.ListViewType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> ListViewScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.LargeListViewType[_DataTypeT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> LargeListViewScalar[_DataTypeT]: ... -@overload -def scalar( - value: Any, - type: types.FixedSizeListType[_DataTypeT, types._Size], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... -@overload -def scalar( - value: Any, - type: types.DictionaryType[types._IndexT, types._BasicValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... -@overload -def scalar( - value: Any, - type: types.MapType[types._K, types._ValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> MapScalar[types._K, types._ValueT]: ... -@overload -def scalar( - value: Any, - type: types.StructType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> StructScalar: ... -@overload -def scalar( - value: Any, - type: types.UnionType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UnionScalar: ... -@overload -def scalar( - value: Any, - type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... -@overload -def scalar( - value: Any, - type: types.Bool8Type, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Bool8Scalar: ... -@overload -def scalar( - value: Any, - type: types.UuidType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> UuidScalar: ... -@overload -def scalar( - value: Any, - type: types.JsonType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> JsonScalar: ... -@overload -def scalar( - value: Any, - type: types.OpaqueType, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> OpaqueScalar: ... 
-@overload -def scalar( - value: Any, - type: _DataTypeT, - *, - from_pandas: bool | None = None, - memory_pool: MemoryPool | None = None, -) -> Scalar[_DataTypeT]: ... -def scalar(*args, **kwargs): - """ - Create a pyarrow.Scalar instance from a Python object. - - Parameters - ---------- - value : Any - Python object coercible to arrow's type system. - type : pyarrow.DataType - Explicit type to attempt to coerce to, otherwise will be inferred from - the value. - from_pandas : bool, default None - Use pandas's semantics for inferring nulls from values in - ndarray-like data. Defaults to False if not passed explicitly by user, - or True if a pandas object is passed in. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the currently-set default - memory pool. - - Returns - ------- - scalar : pyarrow.Scalar - - Examples - -------- - >>> import pyarrow as pa - - >>> pa.scalar(42) - - - >>> pa.scalar("string") - - - >>> pa.scalar([1, 2]) - - - >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) - - """ - -__all__ = [ - "Scalar", - "_NULL", - "NA", - "NullScalar", - "BooleanScalar", - "UInt8Scalar", - "Int8Scalar", - "UInt16Scalar", - "Int16Scalar", - "UInt32Scalar", - "Int32Scalar", - "UInt64Scalar", - "Int64Scalar", - "HalfFloatScalar", - "FloatScalar", - "DoubleScalar", - "Decimal32Scalar", - "Decimal64Scalar", - "Decimal128Scalar", - "Decimal256Scalar", - "Date32Scalar", - "Date64Scalar", - "Time32Scalar", - "Time64Scalar", - "TimestampScalar", - "DurationScalar", - "MonthDayNanoIntervalScalar", - "BinaryScalar", - "LargeBinaryScalar", - "FixedSizeBinaryScalar", - "StringScalar", - "LargeStringScalar", - "BinaryViewScalar", - "StringViewScalar", - "ListScalar", - "FixedSizeListScalar", - "LargeListScalar", - "ListViewScalar", - "LargeListViewScalar", - "StructScalar", - "MapScalar", - "DictionaryScalar", - "RunEndEncodedScalar", - "UnionScalar", - "ExtensionScalar", - "FixedShapeTensorScalar", - "Bool8Scalar", - "UuidScalar", - "JsonScalar", - "OpaqueScalar", - "scalar", -] diff --git a/pyarrow-stubs/__lib_pxi/table.pyi b/pyarrow-stubs/__lib_pxi/table.pyi deleted file mode 100644 index ad9d0392137..00000000000 --- a/pyarrow-stubs/__lib_pxi/table.pyi +++ /dev/null @@ -1,5609 +0,0 @@ -import datetime as dt -import sys - -from decimal import Decimal - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias -from typing import ( - Any, - Collection, - Generator, - Generic, - Iterable, - Iterator, - Literal, - Mapping, - Sequence, - TypeVar, - overload, -) - -import numpy as np -import pandas as pd - -from numpy.typing import NDArray -from pyarrow._compute import ( - CastOptions, - CountOptions, - FunctionOptions, - ScalarAggregateOptions, - TDigestOptions, - VarianceOptions, -) -from pyarrow._stubs_typing import ( - Indices, - Mask, - NullEncoding, - NullSelectionBehavior, - Order, - SupportArrowArray, - SupportArrowDeviceArray, - SupportArrowStream, -) -from pyarrow.compute import ArrayOrChunkedArray, Expression -from pyarrow.interchange.dataframe import _PyArrowDataFrame -from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema - -from . 
import array, scalar, types -from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible -from .device import DeviceAllocationType -from .io import Buffer -from .ipc import RecordBatchReader -from .scalar import Int64Scalar, Scalar -from .tensor import Tensor -from .types import _AsPyType, _BasicDataType, _DataTypeT - -_ScalarT = TypeVar("_ScalarT", bound=Scalar) -_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) - -_Aggregation: TypeAlias = Literal[ - "all", - "any", - "approximate_median", - "count", - "count_all", - "count_distinct", - "distinct", - "first", - "first_last", - "last", - "list", - "max", - "mean", - "min", - "min_max", - "one", - "product", - "stddev", - "sum", - "tdigest", - "variance", -] -_AggregationPrefixed: TypeAlias = Literal[ - "hash_all", - "hash_any", - "hash_approximate_median", - "hash_count", - "hash_count_all", - "hash_count_distinct", - "hash_distinct", - "hash_first", - "hash_first_last", - "hash_last", - "hash_list", - "hash_max", - "hash_mean", - "hash_min", - "hash_min_max", - "hash_one", - "hash_product", - "hash_stddev", - "hash_sum", - "hash_tdigest", - "hash_variance", -] -Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed -AggregateOptions: TypeAlias = ( - ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions -) - -UnarySelector: TypeAlias = str -NullarySelector: TypeAlias = tuple[()] -NarySelector: TypeAlias = list[str] | tuple[str, ...] -ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector - -class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): - """ - An array-like composed from a (possibly empty) collection of pyarrow.Arrays - - Warnings - -------- - Do not call this class's constructor directly. - - Examples - -------- - To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: - - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) - True - """ - - @property - def data(self) -> Self: ... - @property - def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: - """ - Return data type of a ChunkedArray. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - """ - def length(self) -> int: - """ - Return length of a ChunkedArray. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.length() - 6 - """ - __len__ = length - def to_string( - self, - *, - indent: int = 0, - window: int = 5, - container_window: int = 2, - skip_new_lines: bool = False, - ) -> str: - """ - Render a "pretty-printed" string representation of the ChunkedArray - - Parameters - ---------- - indent : int - How much to indent right the content of the array, - by default ``0``. - window : int - How many items to preview within each chunk at the begin and end - of the chunk when the chunk is bigger than the window. - The other elements will be ellipsed. - container_window : int - How many chunks to preview at the begin and end - of the array when the array is bigger than the window. - The other elements will be ellipsed. - This setting also applies to list columns. 
- skip_new_lines : bool - If the array should be rendered as a single line of text - or if each element should be on its own line. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_string(skip_new_lines=True) - '[[2,2,4],[4,5,100]]' - """ - format = to_string - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - @property - def null_count(self) -> int: - """ - Number of null entries - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.null_count - 1 - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the chunked array. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.nbytes - 49 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the chunked array. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.get_total_buffer_size() - 49 - """ - def __sizeof__(self) -> int: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - @overload - def __getitem__(self, key: int) -> _Scalar_co: ... - def __getitem__(self, key): - """ - Slice or return value at given index - - Parameters - ---------- - key : integer or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - value : Scalar (index) or ChunkedArray (slice) - """ - def getitem(self, i: int) -> Scalar: ... - def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the null values. - - Parameters - ---------- - nan_is_null : bool (optional, default False) - Whether floating-point NaN values should also be considered null. - - Returns - ------- - array : boolean Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_null() - - [ - [ - false, - false, - false, - false, - true, - false - ] - ] - """ - def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the NaN values. 
- - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) - >>> arr.is_nan() - - [ - [ - false, - true, - false, - false, - null, - false - ] - ] - """ - def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: - """ - Return boolean array indicating the non-null values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.is_valid() - - [ - [ - true, - true, - true - ], - [ - true, - false, - true - ] - ] - """ - def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: - """ - Replace each null element in values with fill_value. - - See :func:`pyarrow.compute.fill_null` for full usage. - - Parameters - ---------- - fill_value : any - The replacement value for null entries. - - Returns - ------- - result : Array or ChunkedArray - A new array with nulls replaced by the given value. - - Examples - -------- - >>> import pyarrow as pa - >>> fill_value = pa.scalar(5, type=pa.int8()) - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.fill_null(fill_value) - - [ - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - ] - """ - def equals(self, other: Self) -> bool: - """ - Return whether the contents of two chunked arrays are equal. - - Parameters - ---------- - other : pyarrow.ChunkedArray - Chunked array to compare against. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> n_legs.equals(n_legs) - True - >>> n_legs.equals(animals) - False - """ - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: - """ - Return a NumPy copy of this array (experimental). - - Parameters - ---------- - zero_copy_only : bool, default False - Introduced for signature consistence with pyarrow.Array.to_numpy. - This must be False here since NumPy arrays' buffer must be contiguous. - - Returns - ------- - array : numpy.ndarray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.to_numpy() - array([ 2, 2, 4, 4, 5, 100]) - """ - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - @overload - def cast( - self, - target_type: None = None, - safe: bool | None = None, - options: CastOptions | None = None, - ) -> Self: ... - @overload - def cast( - self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None - ) -> ChunkedArray[Scalar[_CastAs]]: ... - def cast(self, *args, **kwargs): - """ - Cast array values to another data type - - See :func:`pyarrow.compute.cast` for usage. - - Parameters - ---------- - target_type : DataType, None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. 
- options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Array or ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs.type - DataType(int64) - - Change the data type of an array: - - >>> n_legs_seconds = n_legs.cast(pa.duration("s")) - >>> n_legs_seconds.type - DurationType(duration[s]) - """ - def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: - """ - Compute dictionary-encoded representation of array. - - See :func:`pyarrow.compute.dictionary_encode` for full usage. - - Parameters - ---------- - null_encoding : str, default "mask" - How to handle null entries. - - Returns - ------- - encoded : ChunkedArray - A dictionary-encoded version of this array. - - Examples - -------- - >>> import pyarrow as pa - >>> animals = pa.chunked_array( - ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) - ... ) - >>> animals.dictionary_encode() - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: - """ - Flatten this ChunkedArray. If it has a struct type, the column is - flattened into one array per struct field. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : list of ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> c_arr = pa.chunked_array(n_legs.value_counts()) - >>> c_arr - - [ - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - ] - >>> c_arr.flatten() - [ - [ - [ - 2, - 4, - 5, - 100 - ] - ], - [ - [ - 2, - 2, - 1, - 1 - ] - ]] - >>> c_arr.type - StructType(struct) - >>> n_legs.type - DataType(int64) - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: - """ - Flatten this ChunkedArray into a single non-chunked array. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.combine_chunks() - - [ - 2, - 2, - 4, - 4, - 5, - 100 - ] - """ - def unique(self) -> ChunkedArray[_Scalar_co]: - """ - Compute distinct elements in array - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.unique() - - [ - 2, - 4, - 5, - 100 - ] - """ - def value_counts(self) -> StructArray: - """ - Compute counts of unique elements in array. 
- - Returns - ------- - An array of structs - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.value_counts() - - -- is_valid: all not null - -- child 0 type: int64 - [ - 2, - 4, - 5, - 100 - ] - -- child 1 type: int64 - [ - 2, - 2, - 1, - 1 - ] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this ChunkedArray - - Parameters - ---------- - offset : int, default 0 - Offset from start of array to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.slice(2, 2) - - [ - [ - 4 - ], - [ - 4 - ] - ] - """ - def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.filter` for full usage. - - Parameters - ---------- - mask : Array or array-like - The boolean mask to filter the chunked array with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled. - - Returns - ------- - filtered : Array or ChunkedArray - An array of the same type, with only the elements selected by - the boolean mask. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> mask = pa.array([True, False, None, True, False, True]) - >>> n_legs.filter(mask) - - [ - [ - 2 - ], - [ - 4, - 100 - ] - ] - >>> n_legs.filter(mask, null_selection_behavior="emit_null") - - [ - [ - 2, - null - ], - [ - 4, - 100 - ] - ] - """ - @overload - def index( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], - value: Scalar[_DataTypeT] | _AsPyType, - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: ... - @overload - def index( - self, - value: Scalar[_DataTypeT], - start: int | None = None, - end: int | None = None, - *, - memory_pool: MemoryPool | None = None, - ) -> Int64Scalar: ... - def index(self, *args, **kwargs): - """ - Find the first index of a value. - - See :func:`pyarrow.compute.index` for full usage. - - Parameters - ---------- - value : Scalar or object - The value to look for in the array. - start : int, optional - The start index where to look for `value`. - end : int, optional - The end index where to look for `value`. - memory_pool : MemoryPool, optional - A memory pool for potential memory allocations. - - Returns - ------- - index : Int64Scalar - The index of the value in the array (-1 if not found). - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.index(4) - - >>> n_legs.index(4, start=3) - - """ - def take(self, indices: Indices) -> Self: - """ - Select values from the chunked array. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the array whose values will be returned. 
- - Returns - ------- - taken : Array or ChunkedArray - An array with the same datatype, containing the taken values. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.take([1, 4, 5]) - - [ - [ - 2, - 5, - 100 - ] - ] - """ - def drop_null(self) -> Self: - """ - Remove missing values from a chunked array. - See :func:`pyarrow.compute.drop_null` for full description. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.drop_null() - - [ - [ - 2, - 2 - ], - [ - 4, - 5, - 100 - ] - ] - """ - def sort(self, order: Order = "ascending", **kwargs) -> Self: - """ - Sort the ChunkedArray - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - result : ChunkedArray - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent chunked array, but where all - chunks share the same dictionary values. Dictionary indices are - transposed accordingly. - - If there are no dictionaries in the chunked array, it is returned - unchanged. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> c_arr - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ] - ] - >>> c_arr.unify_dictionaries() - - [ - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 0, - 1, - 2 - ], - ... - -- dictionary: - [ - "Flamingo", - "Parrot", - "Dog", - "Horse", - "Brittle stars", - "Centipede" - ] - -- indices: - [ - 3, - 4, - 5 - ] - ] - """ - @property - def num_chunks(self) -> int: - """ - Number of underlying chunks. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.num_chunks - 2 - """ - def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: - """ - Select a chunk by its index. - - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Array - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs.chunk(1) - - [ - 4, - 5, - 100 - ] - """ - @property - def chunks(self) -> list[Array[_Scalar_co]]: - """ - Convert to a list of single-chunked arrays. 
- - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) - >>> n_legs - - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ] - ] - >>> n_legs.chunks - [ - [ - 2, - 2, - null - ], - [ - 4, - 5, - 100 - ]] - """ - @overload - def iterchunks( - self: ChunkedArray[scalar.NullScalar], - ) -> Generator[array.NullArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.BooleanScalar], - ) -> Generator[array.BooleanArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt8Scalar], - ) -> Generator[array.UInt8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int8Scalar], - ) -> Generator[array.Int8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt16Scalar], - ) -> Generator[array.UInt16Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int16Scalar], - ) -> Generator[array.Int16Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt32Scalar], - ) -> Generator[array.UInt32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int32Scalar], - ) -> Generator[array.Int32Array, None, None]: - """ - Convert to an iterator of ChunkArrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> for i in n_legs.iterchunks(): - ... print(i.null_count) - 0 - 1 - - """ - @overload - def iterchunks( - self: ChunkedArray[scalar.UInt64Scalar], - ) -> Generator[array.UInt64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Int64Scalar], - ) -> Generator[array.Int64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.HalfFloatScalar], - ) -> Generator[array.HalfFloatArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FloatScalar], - ) -> Generator[array.FloatArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DoubleScalar], - ) -> Generator[array.DoubleArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal32Scalar], - ) -> Generator[array.Decimal32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal64Scalar], - ) -> Generator[array.Decimal64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal128Scalar], - ) -> Generator[array.Decimal128Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Decimal256Scalar], - ) -> Generator[array.Decimal256Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Date32Scalar], - ) -> Generator[array.Date32Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Date64Scalar], - ) -> Generator[array.Date64Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], - ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], - ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DurationScalar[types._Unit]], - ) -> Generator[array.DurationArray[types._Unit], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], - ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... 
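# --- Illustrative annotation (not part of the original patch) ----------------
# A minimal sketch, assuming these stubs are installed, of how the
# iterchunks() overloads above are intended to narrow each chunk's type for a
# static checker; the reveal_type comment describes the expected checker output.
import pyarrow as pa
from typing_extensions import reveal_type  # runtime-safe reveal_type helper

n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])  # ChunkedArray[Int64Scalar] per these stubs
for chunk in n_legs.iterchunks():
    reveal_type(chunk)  # expected: Int64Array, via the Int64Scalar overload above
# ------------------------------------------------------------------------------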
- @overload - def iterchunks( - self: ChunkedArray[scalar.BinaryScalar], - ) -> Generator[array.BinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeBinaryScalar], - ) -> Generator[array.LargeBinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FixedSizeBinaryScalar], - ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StringScalar], - ) -> Generator[array.StringArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeStringScalar], - ) -> Generator[array.LargeStringArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.BinaryViewScalar], - ) -> Generator[array.BinaryViewArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StringViewScalar], - ) -> Generator[array.StringViewArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.ListScalar[_DataTypeT]], - ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], - ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], - ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], - ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.StructScalar], - ) -> Generator[array.StructArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], - ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], - ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.RunEndEncodedScalar], - ) -> Generator[array.RunEndEncodedArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UnionScalar], - ) -> Generator[array.UnionArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.Bool8Scalar], - ) -> Generator[array.Bool8Array, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.UuidScalar], - ) -> Generator[array.UuidArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.JsonScalar], - ) -> Generator[array.JsonArray, None, None]: ... - @overload - def iterchunks( - self: ChunkedArray[scalar.OpaqueScalar], - ) -> Generator[array.OpaqueArray, None, None]: ... - def iterchunks(self): - """ - Convert to an iterator of ChunkArrays. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> for i in n_legs.iterchunks(): - ... print(i.null_count) - 0 - 1 - - """ - def __iter__(self) -> Iterator[_Scalar_co]: ... - def to_pylist( - self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], - *, - maps_as_pydicts: Literal["lossy", "strict"] | None = None, - ) -> list[_AsPyType | None]: - """ - Convert to a list of native Python objects. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. 
- The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) - >>> n_legs.to_pylist() - [2, 2, 4, 4, None, 100] - """ - def __arrow_c_stream__(self, requested_schema=None) -> Any: - """ - Export to a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - - Returns - ------- - PyCapsule - A capsule containing a C ArrowArrayStream struct. - """ - @classmethod - def _import_from_c_capsule(cls, stream) -> Self: - """ - Import ChunkedArray from a C ArrowArrayStream PyCapsule. - - Parameters - ---------- - stream: PyCapsule - A capsule containing a C ArrowArrayStream PyCapsule. - - Returns - ------- - ChunkedArray - """ - @property - def is_cpu(self) -> bool: - """ - Whether all chunks in the ChunkedArray are CPU-accessible. - """ - -@overload -def chunked_array( - values: Iterable[NullableCollection[bool]], - type: None = None, -) -> ChunkedArray[scalar.BooleanScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[int]], - type: None = None, -) -> ChunkedArray[scalar.Int64Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[float]], - type: None = None, -) -> ChunkedArray[scalar.DoubleScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[Decimal]], - type: None = None, -) -> ChunkedArray[scalar.Decimal128Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dict[str, Any]]], - type: None = None, -) -> ChunkedArray[scalar.StructScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.datetime]], - type: None = None, -) -> ChunkedArray[scalar.TimestampScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.date]], - type: None = None, -) -> ChunkedArray[scalar.Date32Scalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.time]], - type: None = None, -) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[dt.timedelta]], - type: None = None, -) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[MonthDayNano]], - type: None = None, -) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[str]], - type: None = None, -) -> ChunkedArray[scalar.StringScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[bytes]], - type: None = None, -) -> ChunkedArray[scalar.BinaryScalar]: ... -@overload -def chunked_array( - values: Iterable[NullableCollection[list[Any]]], - type: None = None, -) -> ChunkedArray[scalar.ListScalar[Any]]: ... 
-@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["null"] | types.NullType, -) -> ChunkedArray[scalar.NullScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["bool", "boolean"] | types.BoolType, -) -> ChunkedArray[scalar.BooleanScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i1", "int8"] | types.Int8Type, -) -> ChunkedArray[scalar.Int8Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i2", "int16"] | types.Int16Type, -) -> ChunkedArray[scalar.Int16Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i4", "int32"] | types.Int32Type, -) -> ChunkedArray[scalar.Int32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["i8", "int64"] | types.Int64Type, -) -> ChunkedArray[scalar.Int64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u1", "uint8"] | types.UInt8Type, -) -> ChunkedArray[scalar.UInt8Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u2", "uint16"] | types.UInt16Type, -) -> ChunkedArray[scalar.UInt16Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u4", "uint32"] | types.Uint32Type, -) -> ChunkedArray[scalar.UInt32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["u8", "uint64"] | types.UInt64Type, -) -> ChunkedArray[scalar.UInt64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f2", "halffloat", "float16"] | types.Float16Type, -) -> ChunkedArray[scalar.HalfFloatScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f4", "float", "float32"] | types.Float32Type, -) -> ChunkedArray[scalar.FloatScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["f8", "double", "float64"] | types.Float64Type, -) -> ChunkedArray[scalar.DoubleScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string", "str", "utf8"] | types.StringType, -) -> ChunkedArray[scalar.StringScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary"] | types.BinaryType, -) -> ChunkedArray[scalar.BinaryScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, -) -> ChunkedArray[scalar.LargeStringScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["large_binary"] | types.LargeBinaryType, -) -> ChunkedArray[scalar.LargeBinaryScalar]: ... 
-@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["binary_view"] | types.BinaryViewType, -) -> ChunkedArray[scalar.BinaryViewScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["string_view"] | types.StringViewType, -) -> ChunkedArray[scalar.StringViewScalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date32", "date32[day]"] | types.Date32Type, -) -> ChunkedArray[scalar.Date32Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["date64", "date64[ms]"] | types.Date64Type, -) -> ChunkedArray[scalar.Date64Scalar]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], -) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], -) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], -) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], -) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], -) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[s]"] | types.DurationType[Literal["s"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[us]"] | types.DurationType[Literal["us"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... 
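# --- Illustrative annotation (not part of the original patch) ----------------
# A minimal sketch, assuming these stubs are installed, of how the
# string-alias / DataType overloads of chunked_array() above are intended to
# select the scalar parameter; the comments describe expected checker output.
import datetime as dt
import pyarrow as pa

durations = pa.chunked_array([[dt.timedelta(seconds=1)]], type="duration[us]")
# expected: ChunkedArray[DurationScalar[Literal["us"]]]
seconds = pa.chunked_array([[1, 2, 3]], type="time32[s]")
# expected: ChunkedArray[Time32Scalar[Literal["s"]]]
# ------------------------------------------------------------------------------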
-@overload -def chunked_array( - values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], - type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], -) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... -@overload -def chunked_array( - values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, - type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, -) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... -@overload -def chunked_array( - values: Iterable[Array[_ScalarT]], - type: None = None, -) -> ChunkedArray[_ScalarT]: ... -def chunked_array(value, type=None): - """ - Construct chunked array from list of array-like objects - - Parameters - ---------- - arrays : Array, list of Array, or array-like - Must all be the same data type. Can be empty only if type also passed. - Any Arrow-compatible array that implements the Arrow PyCapsule Protocol - (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be - passed as well. - type : DataType or string coercible to DataType - - Returns - ------- - ChunkedArray - - Examples - -------- - >>> import pyarrow as pa - >>> pa.chunked_array([], type=pa.int8()) - - [ - ... - ] - - >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - - [ - [ - 2, - 2, - 4 - ], - [ - 4, - 5, - 100 - ] - ] - """ - -_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) - -class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): - def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Return the dataframe interchange object implementing the interchange protocol. - - Parameters - ---------- - nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). - allow_copy : bool, default True - Whether to allow memory copying when exporting. If set to False - it would cause non-zero-copy exports to fail. - - Returns - ------- - DataFrame interchange object - The object which consuming library can use to ingress the dataframe. - - Notes - ----- - Details on the interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - """ - @overload - def __getitem__(self, key: int | str) -> _ColumnT: ... - @overload - def __getitem__(self, key: slice) -> Self: ... - def __getitem__(self, key): - """ - Slice or return column at given index or column name - - Parameters - ---------- - key : integer, str, or slice - Slices with step not equal to 1 (or None) will produce a copy - rather than a zero-copy view - - Returns - ------- - Array (from RecordBatch) or ChunkedArray (from Table) for column input. - RecordBatch or Table for slice input. - """ - def __len__(self) -> int: ... - def column(self, i: int | str) -> _ColumnT: - """ - Select single column from Table or RecordBatch. - - Parameters - ---------- - i : int or string - The index or name of the column to retrieve. - - Returns - ------- - column : Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... 
} - ... ) - >>> table = pa.Table.from_pandas(df) - - Select a column by numeric index: - - >>> table.column(0) - - [ - [ - 2, - 4, - 5, - 100 - ] - ] - - Select a column by its name: - - >>> table.column("animals") - - [ - [ - "Flamingo", - "Horse", - "Brittle stars", - "Centipede" - ] - ] - """ - @property - def column_names(self) -> list[str]: - """ - Names of the Table or RecordBatch columns. - - Returns - ------- - list of str - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> table = pa.Table.from_arrays( - ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], - ... names=["n_legs", "animals"], - ... ) - >>> table.column_names - ['n_legs', 'animals'] - """ - @property - def columns(self) -> list[_ColumnT]: - """ - List of all columns in numerical order. - - Returns - ------- - columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.columns - [ - [ - [ - null, - 4, - 5, - null - ] - ], - [ - [ - "Flamingo", - "Horse", - null, - "Centipede" - ] - ]] - """ - def drop_null(self) -> Self: - """ - Remove rows that contain missing values from a Table or RecordBatch. - - See :func:`pyarrow.compute.drop_null` for full usage. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, with rows containing - no missing values. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [None, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", None, "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.drop_null() - pyarrow.Table - year: double - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ - def field(self, i: int | str) -> Field: - """ - Select a schema field by its column name or numeric index. - - Parameters - ---------- - i : int or string - The index or name of the field to retrieve. - - Returns - ------- - Field - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.field(0) - pyarrow.Field - >>> table.field(1) - pyarrow.Field - """ - @classmethod - def from_pydict( - cls, - mapping: Mapping[str, ArrayOrChunkedArray[Any] | list | np.ndarray], - schema: Schema | None = None, - metadata: Mapping | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from Arrow arrays or columns. - - Parameters - ---------- - mapping : dict or Mapping - A mapping of strings to Arrays or Python lists. - schema : Schema, default None - If not passed, will be inferred from the Mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). 
- - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> pydict = {"n_legs": n_legs, "animals": animals} - - Construct a Table from a dictionary of arrays: - - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a dictionary of arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.Table.from_pydict(pydict, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - @classmethod - def from_pylist( - cls, - mapping: Sequence[Mapping[str, Any]], - schema: Schema | None = None, - metadata: Mapping | None = None, - ) -> Self: - """ - Construct a Table or RecordBatch from list of rows / dictionaries. - - Parameters - ---------- - mapping : list of dicts of rows - A mapping of strings to row values. - schema : Schema, default None - If not passed, will be inferred from the first row of the - mapping values. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - Table or RecordBatch - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - - Construct a Table from a list of rows: - - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4]] - animals: [["Flamingo","Dog"]] - - Construct a Table from a list of rows with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - def itercolumns(self) -> Generator[_ColumnT, None, None]: - """ - Iterator over all columns in their numerical order. - - Yields - ------ - Array (for RecordBatch) or ChunkedArray (for Table) - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> for i in table.itercolumns(): - ... print(i.null_count) - 2 - 1 - """ - @property - def num_columns(self) -> int: ... 
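# --- Illustrative annotation (not part of the original patch) ----------------
# A minimal sketch, assuming these stubs are installed, showing why _Tabular is
# generic over its column type: per these stubs, Table columns resolve to
# ChunkedArray while RecordBatch columns resolve to Array.
import pyarrow as pa

table = pa.table({"n_legs": [2, 4, 5, 100]})
batch = pa.record_batch({"n_legs": [2, 4, 5, 100]})
table_col = table.column("n_legs")  # expected checker type: ChunkedArray
batch_col = batch.column("n_legs")  # expected checker type: Array
# ------------------------------------------------------------------------------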
- @property - def num_rows(self) -> int: ... - @property - def shape(self) -> tuple[int, int]: - """ - Dimensions of the table or record batch: (#rows, #columns). - - Returns - ------- - (int, int) - Number of rows and number of columns. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table.shape - (4, 2) - """ - @property - def schema(self) -> Schema: ... - @property - def nbytes(self) -> int: ... - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: - """ - Sort the Table or RecordBatch by one or multiple columns. - - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - Table or RecordBatch - A new tabular object sorted according to the sort keys. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.sort_by("animal") - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,2021,2021,2020,2022,2022]] - n_legs: [[5,100,4,2,4,2]] - animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] - """ - def take(self, indices: Indices) -> Self: - """ - Select rows from a Table or RecordBatch. - - See :func:`pyarrow.compute.take` for full usage. - - Parameters - ---------- - indices : Array or array-like - The indices in the tabular object whose rows will be returned. - - Returns - ------- - Table or RecordBatch - A tabular object with the same schema, containing the taken rows. - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.take([1, 3]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2022,2021]] - n_legs: [[4,100]] - animals: [["Horse","Centipede"]] - """ - def filter( - self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" - ) -> Self: - """ - Select rows from the table or record batch based on a boolean mask. - - The Table can be filtered based on a mask, which will be passed to - :func:`pyarrow.compute.filter` to perform the filtering, or it can - be filtered through a boolean :class:`.Expression` - - Parameters - ---------- - mask : Array or array-like or .Expression - The boolean mask or the :class:`.Expression` to filter the table with. - null_selection_behavior : str, default "drop" - How nulls in the mask should be handled, does nothing if - an :class:`.Expression` is used. 
- - Returns - ------- - filtered : Table or RecordBatch - A tabular object of the same schema, with only the rows selected - by applied filtering - - Examples - -------- - Using a Table (works similarly for RecordBatch): - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Define an expression and select rows: - - >>> import pyarrow.compute as pc - >>> expr = pc.field("year") <= 2020 - >>> table.filter(expr) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2019]] - n_legs: [[2,5]] - animals: [["Flamingo","Brittle stars"]] - - Define a mask and select rows: - - >>> mask = [True, True, False, None] - >>> table.filter(mask) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022]] - n_legs: [[2,4]] - animals: [["Flamingo","Horse"]] - >>> table.filter(mask, null_selection_behavior="emit_null") - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,null]] - n_legs: [[2,4,null]] - animals: [["Flamingo","Horse",null]] - """ - def to_pydict( - self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> dict[str, list]: - """ - Convert the Table or RecordBatch to a dict or OrderedDict. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. - - Returns - ------- - dict - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> table.to_pydict() - {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} - """ - def to_pylist( - self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None - ) -> list[dict[str, Any]]: - """ - Convert the Table or RecordBatch to a list of rows / dictionaries. - - Parameters - ---------- - maps_as_pydicts : str, optional, default `None` - Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - - If 'lossy', whenever duplicate keys are detected, a warning will be printed. - The last seen value of a duplicate key will be in the Python dictionary. - If 'strict', this instead results in an exception being raised when detected. 
- - Returns - ------- - list - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] - >>> table = pa.table(data, names=["n_legs", "animals"]) - >>> table.to_pylist() - [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... - """ - def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: - """ - Return human-readable string representation of Table or RecordBatch. - - Parameters - ---------- - show_metadata : bool, default False - Display Field-level and Schema-level KeyValueMetadata. - preview_cols : int, default 0 - Display values of the columns for the first N columns. - - Returns - ------- - str - """ - def remove_column(self, i: int) -> Self: ... - def drop_columns(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new Table or RecordBatch. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Raises - ------ - KeyError - If any of the passed column names do not exist. - - Returns - ------- - Table or RecordBatch - A tabular object without the column(s). - - Examples - -------- - Table (works similarly for RecordBatch) - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Drop one column: - - >>> table.drop_columns("animals") - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - - Drop one or more columns: - - >>> table.drop_columns(["n_legs", "animals"]) - pyarrow.Table - ... - ---- - """ - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: ... - def append_column(self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list) -> Self: - """ - Append column at end of columns. - - Parameters - ---------- - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - Table or RecordBatch - New table or record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Append column at the end: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.append_column("year", [year]) - pyarrow.Table - n_legs: int64 - animals: string - year: int64 - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - year: [[2021,2022,2019,2021]] - """ - -class RecordBatch(_Tabular[Array]): - """ - Batch of rows of columns of equal length - - Warnings - -------- - Do not call this class's constructor directly, use one of the - ``RecordBatch.from_*`` functions instead. 
- - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Constructing a RecordBatch from arrays: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Constructing a RecordBatch from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_pandas(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Constructing a RecordBatch from pylist: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] - >>> pa.RecordBatch.from_pylist(pylist).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Dog - - You can also construct a RecordBatch using :func:`pyarrow.record_batch`: - - >>> pa.record_batch([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - - def validate(self, *, full: bool = False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def replace_schema_metadata(self, metadata: dict | None = None) -> Self: - """ - Create shallow copy of record batch by replacing schema - key-value metadata with the indicated new metadata (which may be None, - which deletes any existing metadata - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - shallow_copy : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - - Constructing a RecordBatch with schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} - ... 
) - >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) - >>> batch.schema - n_legs: int64 - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Shallow copy of a RecordBatch with deleted schema metadata: - - >>> batch.replace_schema_metadata().schema - n_legs: int64 - """ - @property - def num_columns(self) -> int: - """ - Number of columns - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_columns - 2 - """ - - @property - def num_rows(self) -> int: - """ - Number of rows - - Due to the definition of a RecordBatch, all columns have the same - number of rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.num_rows - 6 - """ - @property - def schema(self) -> Schema: - """ - Schema of the RecordBatch and its columns - - Returns - ------- - pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.schema - n_legs: int64 - animals: string - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the record batch. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.nbytes - 116 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the record batch - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.get_total_buffer_size() - 120 - """ - - def __sizeof__(self) -> int: ... - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: - """ - Add column to RecordBatch at position i. 
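The contrast drawn above between ``nbytes`` and ``get_total_buffer_size`` is easiest to see on a zero-copy slice; a small sketch (exact byte counts depend on the data, so none are hard-coded):

import pyarrow as pa

batch = pa.record_batch({"n_legs": [2, 2, 4, 4, 5, 100]})
sliced = batch.slice(offset=3)
# nbytes accounts for the slice offset, while get_total_buffer_size reports
# the full size of every referenced buffer, so it can be larger for a slice.
print(sliced.nbytes, sliced.get_total_buffer_size())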
- - A new record batch is returned with the column added, the original record batch - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.add_column(0, "year", year) - pyarrow.RecordBatch - year: int64 - n_legs: int64 - animals: string - ---- - year: [2021,2022,2019,2021] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Original record batch is left unchanged: - - >>> batch - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def remove_column(self, i: int) -> Self: - """ - Create new RecordBatch with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New record batch without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.remove_column(1) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ - def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: - """ - Replace column in RecordBatch at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array or value coercible to array - Column data. - - Returns - ------- - RecordBatch - New record batch with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> batch.set_column(1, "year", year) - pyarrow.RecordBatch - n_legs: int64 - year: int64 - ---- - n_legs: [2,4,5,100] - year: [2021,2022,2019,2021] - """ - @overload - def rename_columns(self, names: list[str]) -> Self: ... - @overload - def rename_columns(self, names: dict[str, str]) -> Self: ... - def rename_columns(self, names): - """ - Create new record batch with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. - - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. 
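A hedged sketch of the ``KeyError`` described in the Raises section above, assuming the mapping form behaves as documented (the column names here are made up):

import pyarrow as pa

batch = pa.record_batch({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})
try:
    # "tail" does not name an existing column, so per the stub's documentation
    # the mapping form of rename_columns raises KeyError.
    batch.rename_columns({"tail": "has_tail"})
except KeyError as exc:
    print("rename failed:", exc)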
- - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> new_names = ["n", "name"] - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> batch.rename_columns(new_names) - pyarrow.RecordBatch - n: int64 - name: string - ---- - n: [2,4,5,100] - name: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write RecordBatch to Buffer as encapsulated IPC message, which does not - include a Schema. - - To reconstruct a RecordBatch from the encapsulated IPC message Buffer - returned by this function, a Schema must be passed separately. See - Examples. - - Parameters - ---------- - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> buf = batch.serialize() - >>> buf - - - Reconstruct RecordBatch from IPC message Buffer and original Schema - - >>> pa.ipc.read_record_batch(buf, batch.schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - """ - def slice(self, offset: int = 0, length: int | None = None) -> Self: - """ - Compute zero-copy slice of this RecordBatch - - Parameters - ---------- - offset : int, default 0 - Offset from start of record batch to slice - length : int, default None - Length of slice (default is until end of batch starting from - offset) - - Returns - ------- - sliced : RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> batch.slice(offset=3).to_pandas() - n_legs animals - 0 4 Horse - 1 5 Brittle stars - 2 100 Centipede - >>> batch.slice(length=2).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - >>> batch.slice(offset=3, length=1).to_pandas() - n_legs animals - 0 4 Horse - """ - def equals(self, other: Self, check_metadata: bool = False) -> bool: - """ - Check if contents of two record batches are equal. - - Parameters - ---------- - other : pyarrow.RecordBatch - RecordBatch to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - are_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... 
) - >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) - >>> batch_0 = pa.record_batch([]) - >>> batch_1 = pa.RecordBatch.from_arrays( - ... [n_legs, animals], - ... names=["n_legs", "animals"], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> batch.equals(batch) - True - >>> batch.equals(batch_0) - False - >>> batch.equals(batch_1) - True - >>> batch.equals(batch_1, check_metadata=True) - False - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the RecordBatch. - - Returns a new RecordBatch with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) - - Select columns my indices: - - >>> batch.select([1]) - pyarrow.RecordBatch - animals: string - ---- - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Select columns by names: - - >>> batch.select(["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,2,4,4,5,100] - """ - def cast( - self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast record batch values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.RecordBatch.from_pandas(df) - >>> batch.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast batch values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... ) - >>> batch.cast(target_schema=my_schema) - pyarrow.RecordBatch - n_legs: duration[s] - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - """ - @classmethod - def from_arrays( - cls, - arrays: Collection[Array], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping | None = None, - ) -> Self: - """ - Construct a RecordBatch from multiple pyarrow.Arrays - - Parameters - ---------- - arrays : list of pyarrow.Array - One for each field in RecordBatch - names : list of str, optional - Names for the batch fields. If not passed, schema must be passed - schema : Schema, default None - Schema for the created batch. If not passed, names must be passed - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... 
["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from pyarrow Arrays using names: - - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Construct a RecordBatch from pyarrow Arrays using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = None, - preserve_index: bool | None = None, - nthreads: int | None = None, - columns: list[str] | None = None, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow RecordBatch - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the RecordBatch. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``RecordBatch``. The default of None will store the index as a - column, except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). - columns : list, optional - List of column to be converted. If None, use all columns. - - Returns - ------- - pyarrow.RecordBatch - - - Examples - -------- - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Convert pandas DataFrame to RecordBatch: - - >>> import pyarrow as pa - >>> pa.RecordBatch.from_pandas(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch using schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... 
) - >>> pa.RecordBatch.from_pandas(df, schema=my_schema) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - Convert pandas DataFrame to RecordBatch specifying columns: - - >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) - pyarrow.RecordBatch - n_legs: int64 - ---- - n_legs: [2,4,5,100] - """ - @classmethod - def from_struct_array( - cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: - """ - Construct a RecordBatch from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``RecordBatch``. - - Parameters - ---------- - struct_array : StructArray - Array to construct the record batch from. - - Returns - ------- - pyarrow.RecordBatch - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.RecordBatch.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ - def to_struct_array(self) -> StructArray: - """ - Convert to a struct array. - """ - def to_tensor( - self, - null_to_nan: bool = False, - row_major: bool = True, - memory_pool: MemoryPool | None = None, - ) -> Tensor: - """ - Convert to a :class:`~pyarrow.Tensor`. - - RecordBatches that can be converted have fields of type signed or unsigned - integer or float, including all bit-widths. - - ``null_to_nan`` is ``False`` by default and this method will raise an error in case - any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` - set to ``True``. In this case null values are converted to ``NaN`` and integer type - arrays are promoted to the appropriate float type. - - Parameters - ---------- - null_to_nan : bool, default False - Whether to write null values in the result as ``NaN``. - row_major : bool, default True - Whether resulting Tensor is row-major or column-major - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Examples - -------- - >>> import pyarrow as pa - >>> batch = pa.record_batch( - ... [ - ... pa.array([1, 2, 3, 4, None], type=pa.int32()), - ... pa.array([10, 20, 30, 40, None], type=pa.float32()), - ... ], - ... names=["a", "b"], - ... ) - - >>> batch - pyarrow.RecordBatch - a: int32 - b: float - ---- - a: [1,2,3,4,null] - b: [10,20,30,40,null] - - Convert a RecordBatch to row-major Tensor with null values - written as ``NaN``s - - >>> batch.to_tensor(null_to_nan=True) - - type: double - shape: (5, 2) - strides: (16, 8) - >>> batch.to_tensor(null_to_nan=True).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - - Convert a RecordBatch to column-major Tensor - - >>> batch.to_tensor(null_to_nan=True, row_major=False) - - type: double - shape: (5, 2) - strides: (8, 40) - >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() - array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) - """ - def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): - """ - Export to a C ArrowArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. 
- - Be careful: if you don't pass the ArrowArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_array__(self, requested_schema=None): - """ - Get a pair of PyCapsules containing a C ArrowArray representation of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. PyArrow will attempt to cast the batch to this schema. - If None, the batch will be returned as-is, with a schema matching the - one returned by :meth:`__arrow_c_schema__()`. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowArray, - respectively. - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the batch as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. - - Returns - ------- - PyCapsule - """ - @classmethod - def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema - and ArrowArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ - def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: - """ - Export to a C ArrowDeviceArray struct, given its pointer. - - If a C ArrowSchema struct pointer is also given, the record batch - schema is exported to it at the same time. - - Parameters - ---------- - out_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - out_schema_ptr: int (optional) - The raw pointer to a C ArrowSchema struct. - - Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, - array memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: - """ - Import RecordBatch from a C ArrowDeviceArray struct, given its pointer - and the imported schema. - - Parameters - ---------- - in_ptr: int - The raw pointer to a C ArrowDeviceArray struct. - type: Schema or int - Either a Schema object, or the raw pointer to a C ArrowSchema - struct. - - This is a low-level function intended for expert users. - """ - def __arrow_c_device_array__(self, requested_schema=None, **kwargs): - """ - Get a pair of PyCapsules containing a C ArrowDeviceArray representation - of the object. - - Parameters - ---------- - requested_schema : PyCapsule | None - A PyCapsule containing a C ArrowSchema representation of a requested - schema. 
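Because ``RecordBatch`` itself implements ``__arrow_c_array__``, the capsule protocol described above can be exercised without any third-party library; a minimal round-trip sketch (assuming a pyarrow version whose ``pa.record_batch`` accepts protocol objects):

import pyarrow as pa

batch = pa.record_batch({"a": [1, 2, 3]})
# pa.record_batch consumes any object exposing __arrow_c_array__, so the batch
# can be re-imported through the same C data interface it exports.
rebuilt = pa.record_batch(batch)
assert rebuilt.equals(batch)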
PyArrow will attempt to cast the batch to this data type. - If None, the batch will be returned as-is, with a type matching the - one returned by :meth:`__arrow_c_schema__()`. - kwargs - Currently no additional keyword arguments are supported, but - this method will accept any keyword with a value of ``None`` - for compatibility with future keywords. - - Returns - ------- - Tuple[PyCapsule, PyCapsule] - A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, - respectively. - """ - @classmethod - def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: - """ - Import RecordBatch from a pair of PyCapsules containing a - C ArrowSchema and ArrowDeviceArray, respectively. - - Parameters - ---------- - schema_capsule : PyCapsule - A PyCapsule containing a C ArrowSchema representation of the schema. - array_capsule : PyCapsule - A PyCapsule containing a C ArrowDeviceArray representation of the array. - - Returns - ------- - pyarrow.RecordBatch - """ - @property - def device_type(self) -> DeviceAllocationType: - """ - The device type where the arrays in the RecordBatch reside. - - Returns - ------- - DeviceAllocationType - """ - @property - def is_cpu(self) -> bool: - """ - Whether the RecordBatch's arrays are CPU-accessible. - """ - def copy_to(self, destination: MemoryManager | Device) -> Self: - """ - Copy the entire RecordBatch to destination device. - - This copies each column of the record batch to create - a new record batch where all underlying buffers for the columns have - been copied to the destination MemoryManager. - - Parameters - ---------- - destination : pyarrow.MemoryManager or pyarrow.Device - The destination device to copy the array to. - - Returns - ------- - RecordBatch - """ - -def table_to_blocks(options, table: Table, categories, extension_columns): ... - -JoinType: TypeAlias = Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", -] - -class Table(_Tabular[ChunkedArray[Any]]): - """ - A collection of top-level named, equal length Arrow arrays. - - Warnings - -------- - Do not call this class's constructor directly, use one of the ``from_*`` - methods instead. - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a RecordBatch: - - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> pa.Table.from_pandas(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a dictionary of arrays: - - >>> pydict = {"n_legs": n_legs, "animals": animals} - >>> pa.Table.from_pydict(pydict) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_pydict(pydict).schema - n_legs: int64 - animals: string - - Construct a Table from a dictionary of arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from a list of rows: - - >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] - >>> pa.Table.from_pylist(pylist) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,null]] - animals: [["Flamingo","Centipede"]] - - Construct a Table from a list of rows with pyarrow schema: - - >>> my_schema = pa.schema( - ... [ - ... pa.field("year", pa.int64()), - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... ], - ... metadata={"year": "Year of entry"}, - ... ) - >>> pa.Table.from_pylist(pylist, schema=my_schema).schema - year: int64 - n_legs: int64 - animals: string - -- schema metadata -- - year: 'Year of entry' - - Construct a Table with :func:`pyarrow.table`: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - - def validate(self, *, full=False) -> None: - """ - Perform validation checks. An exception is raised if validation fails. - - By default only cheap validation checks are run. Pass `full=True` - for thorough validation checks (potentially O(n)). - - Parameters - ---------- - full : bool, default False - If True, run expensive checks, otherwise cheap checks only. - - Raises - ------ - ArrowInvalid - """ - def slice(self, offset=0, length=None) -> Self: - """ - Compute zero-copy slice of this Table. - - Parameters - ---------- - offset : int, default 0 - Offset from start of table to slice. - length : int, default None - Length of slice (default is until end of table starting from - offset). - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> table.slice(length=3) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019]] - n_legs: [[2,4,5]] - animals: [["Flamingo","Horse","Brittle stars"]] - >>> table.slice(offset=2) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019,2021]] - n_legs: [[5,100]] - animals: [["Brittle stars","Centipede"]] - >>> table.slice(offset=2, length=1) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2019]] - n_legs: [[5]] - animals: [["Brittle stars"]] - """ - def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: - """ - Select columns of the Table. - - Returns a new Table with the specified columns, and metadata - preserved. - - Parameters - ---------- - columns : list-like - The column names or integer indices to select. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.select([0, 1]) - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - >>> table.select(["year"]) - pyarrow.Table - year: int64 - ---- - year: [[2020,2022,2019,2021]] - """ - def replace_schema_metadata(self, metadata: dict | None = None) -> Self: - """ - Create shallow copy of table by replacing schema - key-value metadata with the indicated new metadata (which may be None), - which deletes any existing metadata. - - Parameters - ---------- - metadata : dict, default None - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Constructing a Table with pyarrow schema and metadata: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> table = pa.table(df, my_schema) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - - Create a shallow copy of a Table with deleted schema metadata: - - >>> table.replace_schema_metadata().schema - n_legs: int64 - animals: string - - Create a shallow copy of a Table with new schema metadata: - - >>> metadata = {"animals": "Which animal"} - >>> table.replace_schema_metadata(metadata=metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Which animal' - """ - def flatten(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Flatten this Table. - - Each column with a struct type is flattened - into one column per struct field. Other columns are left unchanged. 
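The ``replace_schema_metadata`` examples above only replace or delete the metadata wholesale; a sketch of adding a key while keeping existing entries (the ``b"source"`` key is made up):

import pyarrow as pa

table = pa.table({"n_legs": [2, 4]})
# replace_schema_metadata overwrites all key-value pairs, so merge the current
# metadata in first if existing entries should survive.
existing = table.schema.metadata or {}
table = table.replace_schema_metadata({**existing, b"source": b"example"})
print(table.schema.metadata)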
- - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> month = pa.array([4, 6]) - >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) - >>> table - pyarrow.Table - a: struct - child 0, animals: string - child 1, n_legs: int64 - child 2, year: int64 - month: int64 - ---- - a: [ - -- is_valid: all not null - -- child 0 type: string - ["Parrot",null] - -- child 1 type: int64 - [2,4] - -- child 2 type: int64 - [null,2022]] - month: [[4,6]] - - Flatten the columns with struct field: - - >>> table.flatten() - pyarrow.Table - a.animals: string - a.n_legs: int64 - a.year: int64 - month: int64 - ---- - a.animals: [["Parrot",null]] - a.n_legs: [[2,4]] - a.year: [[null,2022]] - month: [[4,6]] - """ - def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Make a new table by combining the chunks this table has. - - All the underlying chunks in the ChunkedArray of each column are - concatenated into zero or one chunk. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... ) - >>> names = ["n_legs", "animals"] - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - >>> table.combine_chunks() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4,4,5,100]] - animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - """ - def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: - """ - Unify dictionaries across all chunks. - - This method returns an equivalent table, but where all chunks of - each column share the same dictionary values. Dictionary indices - are transposed accordingly. - - Columns without dictionaries are returned unchanged. 
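A short sketch of the effect ``combine_chunks`` has on per-column chunk counts, complementing the repr-based example above:

import pyarrow as pa

table = pa.table({"n_legs": pa.chunked_array([[2, 2, 4], [4, 5, 100]])})
# combine_chunks concatenates each column's chunks into at most one chunk.
print(table["n_legs"].num_chunks)                   # 2
print(table.combine_chunks()["n_legs"].num_chunks)  # 1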
- - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() - >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() - >>> c_arr = pa.chunked_array([arr_1, arr_2]) - >>> table = pa.table([c_arr], names=["animals"]) - >>> table - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog"] -- indices: - [0,1,2], -- dictionary: - ["Horse","Brittle stars","Centipede"] -- indices: - [0,1,2]] - - Unify dictionaries across both chunks: - - >>> table.unify_dictionaries() - pyarrow.Table - animals: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [0,1,2], -- dictionary: - ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: - [3,4,5]] - """ - def equals(self, other: Self, check_metadata: bool = False) -> Self: - """ - Check if contents of two tables are equal. - - Parameters - ---------- - other : pyarrow.Table - Table to compare against. - check_metadata : bool, default False - Whether schema metadata equality should be checked as well. - - Returns - ------- - bool - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array( - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] - ... ) - >>> names = ["n_legs", "animals"] - >>> table = pa.Table.from_arrays([n_legs, animals], names=names) - >>> table_0 = pa.Table.from_arrays([]) - >>> table_1 = pa.Table.from_arrays( - ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} - ... ) - >>> table.equals(table) - True - >>> table.equals(table_0) - False - >>> table.equals(table_1) - True - >>> table.equals(table_1, check_metadata=True) - False - """ - def cast( - self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None - ) -> Self: - """ - Cast table values to another schema. - - Parameters - ---------- - target_schema : Schema - Schema to cast to, the names and order of fields must match. - safe : bool, default True - Check for overflows or other unsafe conversions. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - - Define new schema and cast table values: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] - ... ) - >>> table.cast(target_schema=my_schema) - pyarrow.Table - n_legs: duration[s] - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @classmethod - def from_pandas( - cls, - df: pd.DataFrame, - schema: Schema | None = None, - preserve_index: bool | None = None, - nthreads: int | None = None, - columns: list[str] | None = None, - safe: bool = True, - ) -> Self: - """ - Convert pandas.DataFrame to an Arrow Table. 
- - The column types in the resulting Arrow Table are inferred from the - dtypes of the pandas.Series in the DataFrame. In the case of non-object - Series, the NumPy dtype is translated to its Arrow equivalent. In the - case of `object`, we need to guess the datatype by looking at the - Python objects in this Series. - - Be aware that Series of the `object` dtype don't carry enough - information to always lead to a meaningful Arrow type. In the case that - we cannot infer a type, e.g. because the DataFrame is of length 0 or - the Series only contains None/nan objects, the type is set to - null. This behavior can be avoided by constructing an explicit schema - and passing it to this function. - - Parameters - ---------- - df : pandas.DataFrame - schema : pyarrow.Schema, optional - The expected schema of the Arrow Table. This can be used to - indicate the type of columns if we cannot infer it automatically. - If passed, the output will have exactly this schema. Columns - specified in the schema that are not found in the DataFrame columns - or its index will raise an error. Additional columns or index - levels in the DataFrame which are not specified in the schema will - be ignored. - preserve_index : bool, optional - Whether to store the index as an additional column in the resulting - ``Table``. The default of None will store the index as a column, - except for RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - nthreads : int, default None - If greater than 1, convert columns to Arrow in parallel using - indicated number of threads. By default, this follows - :func:`pyarrow.cpu_count` (may use up to system CPU count threads). - columns : list, optional - List of column to be converted. If None, use all columns. - safe : bool, default True - Check for overflows or other unsafe conversions. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.Table.from_pandas(df) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @classmethod - def from_arrays( - cls, - arrays: Collection[ArrayOrChunkedArray[Any]], - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping | None = None, - ) -> Self: - """ - Construct a Table from Arrow arrays. - - Parameters - ---------- - arrays : list of pyarrow.Array or pyarrow.ChunkedArray - Equal-length arrays that should form the table. - names : list of str, optional - Names for the table columns. If not passed, schema must be passed. - schema : Schema, default None - Schema for the created table. If not passed, names must be passed. - metadata : dict or Mapping, default None - Optional metadata for the schema (if inferred). 
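The ``preserve_index`` behaviour described above is not exercised in the doctest; a minimal sketch (the string index values are illustrative):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"n_legs": [2, 4]}, index=["a", "b"])
# preserve_index=True always stores the index as a column; False drops it.
print(pa.Table.from_pandas(df, preserve_index=True).column_names)
print(pa.Table.from_pandas(df, preserve_index=False).column_names)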
- - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from arrays: - - >>> pa.Table.from_arrays([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from arrays with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"animals": "Name of the animal species"}, - ... ) - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - animals: 'Name of the animal species' - """ - @classmethod - def from_struct_array( - cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] - ) -> Self: - """ - Construct a Table from a StructArray. - - Each field in the StructArray will become a column in the resulting - ``Table``. - - Parameters - ---------- - struct_array : StructArray or ChunkedArray - Array to construct the table from. - - Returns - ------- - pyarrow.Table - - Examples - -------- - >>> import pyarrow as pa - >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) - >>> pa.Table.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 - """ - def to_struct_array( - self, max_chunksize: int | None = None - ) -> ChunkedArray[scalar.StructScalar]: - """ - Convert to a chunked array of struct type. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for ChunkedArray chunks. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - ChunkedArray - """ - @classmethod - def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: - """ - Construct a Table from a sequence or iterator of Arrow RecordBatches. - - Parameters - ---------- - batches : sequence or iterator of RecordBatch - Sequence of RecordBatch to be converted, all schemas must be equal. - schema : Schema, default None - If not passed, will be inferred from the first RecordBatch. 
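``to_struct_array`` is documented above without a doctest; a minimal sketch of the ``max_chunksize`` parameter (table contents are illustrative):

import pyarrow as pa

table = pa.table({"n_legs": [2, 4, 5, 100],
                  "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
# Each row becomes one struct scalar; max_chunksize caps the rows per chunk.
chunked = table.to_struct_array(max_chunksize=2)
print(chunked.num_chunks, chunked.type)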
- - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - >>> batch = pa.record_batch([n_legs, animals], names=names) - >>> batch.to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Construct a Table from a RecordBatch: - - >>> pa.Table.from_batches([batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from a sequence of RecordBatches: - - >>> pa.Table.from_batches([batch, batch]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: - """ - Convert Table to a list of RecordBatch objects. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - list[RecordBatch] - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatch: - - >>> table.to_batches()[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - - Convert a Table to a list of RecordBatches: - - >>> table.to_batches(max_chunksize=2)[0].to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - >>> table.to_batches(max_chunksize=2)[1].to_pandas() - n_legs animals - 0 5 Brittle stars - 1 100 Centipede - """ - def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: - """ - Convert the Table to a RecordBatchReader. - - Note that this method is zero-copy, it merely exposes the same data - under a different API. - - Parameters - ---------- - max_chunksize : int, default None - Maximum number of rows for each RecordBatch chunk. Individual chunks - may be smaller depending on the chunk layout of individual columns. - - Returns - ------- - RecordBatchReader - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Convert a Table to a RecordBatchReader: - - >>> table.to_reader() - - - >>> reader = table.to_reader() - >>> reader.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... - >>> reader.read_all() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - @property - def schema(self) -> Schema: - """ - Schema of the table and its columns. - - Returns - ------- - Schema - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.schema - n_legs: int64 - animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... - """ - @property - def num_columns(self) -> int: - """ - Number of columns in this table. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_columns - 2 - """ - @property - def num_rows(self) -> int: - """ - Number of rows in this table. - - Due to the definition of a table, all columns have the same number of - rows. - - Returns - ------- - int - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.num_rows - 4 - """ - @property - def nbytes(self) -> int: - """ - Total number of bytes consumed by the elements of the table. - - In other words, the sum of bytes from all buffer ranges referenced. - - Unlike `get_total_buffer_size` this method will account for array - offsets. - - If buffers are shared between arrays then the shared - portion will only be counted multiple times. - - The dictionary of dictionary arrays will always be counted in their - entirety even if the array only references a portion of the dictionary. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.nbytes - 72 - """ - def get_total_buffer_size(self) -> int: - """ - The sum of bytes in each buffer referenced by the table. - - An array may only reference a portion of a buffer. - This method will overestimate in this case and return the - byte size of the entire buffer. - - If a buffer is referenced multiple times then it will - only be counted once. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.get_total_buffer_size() - 76 - """ - def __sizeof__(self) -> int: ... - def add_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: - """ - Add column to Table at position. - - A new table is returned with the column added, the original table - object is left unchanged. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column added. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - - Add column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.add_column(0, "year", [year]) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2021,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Original table is left unchanged: - - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def remove_column(self, i: int) -> Self: - """ - Create new Table with the indicated column removed. - - Parameters - ---------- - i : int - Index of column to remove. - - Returns - ------- - Table - New table without the column. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.remove_column(1) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,4,5,100]] - """ - def set_column( - self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list - ) -> Self: - """ - Replace column in Table at position. - - Parameters - ---------- - i : int - Index to place the column at. - field_ : str or Field - If a string is passed then the type is deduced from the column - data. - column : Array, list of Array, or values coercible to arrays - Column data. - - Returns - ------- - Table - New table with the passed column set. - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - - Replace a column: - - >>> year = [2021, 2022, 2019, 2021] - >>> table.set_column(1, "year", [year]) - pyarrow.Table - n_legs: int64 - year: int64 - ---- - n_legs: [[2,4,5,100]] - year: [[2021,2022,2019,2021]] - """ - @overload - def rename_columns(self, names: list[str]) -> Self: ... - @overload - def rename_columns(self, names: dict[str, str]) -> Self: ... - def rename_columns(self, names): - """ - Create new table with columns renamed to provided names. - - Parameters - ---------- - names : list[str] or dict[str, str] - List of new column names or mapping of old column names to new column names. - - If a mapping of old to new column names is passed, then all columns which are - found to match a provided old column name will be renamed to the new column name. - If any column names are not found in the mapping, a KeyError will be raised. - - Raises - ------ - KeyError - If any of the column names passed in the names mapping do not exist. - - Returns - ------- - Table - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> table = pa.Table.from_pandas(df) - >>> new_names = ["n", "name"] - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - >>> new_names = {"n_legs": "n", "animals": "name"} - >>> table.rename_columns(new_names) - pyarrow.Table - n: int64 - name: string - ---- - n: [[2,4,5,100]] - name: [["Flamingo","Horse","Brittle stars","Centipede"]] - """ - def drop(self, columns: str | list[str]) -> Self: - """ - Drop one or more columns and return a new table. - - Alias of Table.drop_columns, but kept for backwards compatibility. - - Parameters - ---------- - columns : str or list[str] - Field name(s) referencing existing column(s). - - Returns - ------- - Table - New table without the column(s). - """ - def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: - """ - Declare a grouping over the columns of the table. - - Resulting grouping can then be used to perform aggregations - with a subsequent ``aggregate()`` method. - - Parameters - ---------- - keys : str or list[str] - Name of the columns that should be used as the grouping key. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the - default), no stable ordering of the output is guaranteed. - - Returns - ------- - TableGroupBy - - See Also - -------- - TableGroupBy.aggregate - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> table.group_by("year").aggregate([("n_legs", "sum")]) - pyarrow.Table - year: int64 - n_legs_sum: int64 - ---- - year: [[2020,2022,2021,2019]] - n_legs_sum: [[2,6,104,5]] - """ - def join( - self, - right_table: Self, - keys: str | list[str], - right_keys: str | list[str] | None = None, - join_type: JoinType = "left outer", - left_suffix: str | None = None, - right_suffix: str | None = None, - coalesce_keys: bool = True, - use_threads: bool = True, - ) -> Self: - """ - Perform a join between this table and another one. - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - keys : str or list[str] - The columns from current table that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. - join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to left column names. This prevents confusion - when the columns in left and right tables have colliding names. - right_suffix : str, default None - Which suffix to add to the right column names. This prevents confusion - when the columns in left and right tables have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. 
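``left_suffix`` and ``right_suffix`` are described above; a hedged sketch with a colliding non-key column, complementing the key-only joins in the examples that follow (table contents are made up):

import pyarrow as pa

t1 = pa.table({"id": [1, 2], "value": [10, 20]})
t2 = pa.table({"id": [2, 3], "value": [200, 300]})
# Both inputs carry a non-key "value" column; the suffixes keep the two
# result columns distinguishable after the join.
joined = t1.join(t2, keys="id", join_type="inner",
                 left_suffix="_left", right_suffix="_right")
print(joined.column_names)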
- use_threads : bool, default True - Whether to use multithreading or not. - - Returns - ------- - Table - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) - >>> df2 = pd.DataFrame( - ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} - ... ) - >>> t1 = pa.Table.from_pandas(df1) - >>> t2 = pa.Table.from_pandas(df2) - - Left outer join: - - >>> t1.join(t2, "id").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2]] - year: [[2019,2020,2022]] - n_legs: [[5,null,null]] - animal: [["Brittle stars",null,null]] - - Full outer join: - - >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[3,1,2,4]] - year: [[2019,2020,2022,null]] - n_legs: [[5,null,null,100]] - animal: [["Brittle stars",null,null,"Centipede"]] - - Right outer join: - - >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") - pyarrow.Table - year: int64 - id: int64 - n_legs: int64 - animal: string - ---- - year: [[2019,null]] - id: [[3,4]] - n_legs: [[5,100]] - animal: [["Brittle stars","Centipede"]] - - Right anti join - - >>> t1.join(t2, "id", join_type="right anti") - pyarrow.Table - id: int64 - n_legs: int64 - animal: string - ---- - id: [[4]] - n_legs: [[100]] - animal: [["Centipede"]] - """ - def join_asof( - self, - right_table: Self, - on: str, - by: str | list[str], - tolerance: int, - right_on: str | list[str] | None = None, - right_by: str | list[str] | None = None, - ) -> Self: - """ - Perform an asof join between this table and another one. - - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both tables must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Table, where further - operations can be applied. - - Parameters - ---------- - right_table : Table - The table to join to the current one, acting as the right table - in the join operation. - on : str - The column from current table that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input dataset must be sorted by the "on" key. Must be a single - field of a common type. - - Currently, the "on" key must be an integer, date, or timestamp type. - by : str or list[str] - The columns from current table that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row ``right.on - left.on <= tolerance``. The - ``tolerance`` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_table that should be used as the on key - on the join operation right side. 
- When ``None`` use the same key name as the left table. - right_by : str or list[str], default None - The columns from the right_table that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left table. - - Returns - ------- - Table - - Example - -------- - >>> import pyarrow as pa - >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) - >>> t2 = pa.table( - ... { - ... "id": [3, 4], - ... "year": [2020, 2021], - ... "n_legs": [5, 100], - ... "animal": ["Brittle stars", "Centipede"], - ... } - ... ) - - >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) - pyarrow.Table - id: int64 - year: int64 - n_legs: int64 - animal: string - ---- - id: [[1,3,2,3,3]] - year: [[2020,2021,2022,2022,2023]] - n_legs: [[null,5,null,5,null]] - animal: [[null,"Brittle stars",null,"Brittle stars",null]] - """ - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the table as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the stream should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. - Currently, this is not supported and will raise a - NotImplementedError if the schema doesn't match the current schema. - - Returns - ------- - PyCapsule - """ - @property - def is_cpu(self) -> bool: - """ - Whether all ChunkedArrays are CPU-accessible. - """ - -def record_batch( - data: dict[str, list[Any] | Array[Any]] - | Collection[Array[Any]] - | pd.DataFrame - | SupportArrowArray - | SupportArrowDeviceArray, - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[Any, Any] | None = None, -) -> RecordBatch: - """ - Create a pyarrow.RecordBatch from another Python data structure or sequence - of arrays. - - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of Arrays, - a pandas DataFame, or any tabular object implementing the - Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or - ``__arrow_c_device_array__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the RecordBatch. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). 
- - Returns - ------- - RecordBatch - - See Also - -------- - RecordBatch.from_arrays, RecordBatch.from_pandas, table - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a RecordBatch from a python dictionary: - - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() - n_legs animals - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - Creating a RecordBatch from a list of arrays with names: - - >>> pa.record_batch([n_legs, animals], names=names) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - - Creating a RecordBatch from a list of arrays with names and metadata: - - >>> my_metadata = {"n_legs": "How many legs does an animal have?"} - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,2,4,4,5,100] - animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] - >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'How many legs does an animal have?' - - Creating a RecordBatch from a pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022], - ... "month": [3, 5, 7, 9], - ... "day": [1, 5, 9, 13], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.record_batch(df) - pyarrow.RecordBatch - year: int64 - month: int64 - day: int64 - n_legs: int64 - animals: string - ---- - year: [2020,2022,2021,2022] - month: [3,5,7,9] - day: [1,5,9,13] - n_legs: [2,4,5,100] - animals: ["Flamingo","Horse","Brittle stars","Centipede"] - - >>> pa.record_batch(df).to_pandas() - year month day n_legs animals - 0 2020 3 1 2 Flamingo - 1 2022 5 5 4 Horse - 2 2021 7 9 5 Brittle stars - 3 2022 9 13 100 Centipede - - Creating a RecordBatch from a pandas DataFrame with schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.record_batch(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: ... - >>> pa.record_batch(df, my_schema).to_pandas() - n_legs animals - 0 2 Flamingo - 1 4 Horse - 2 5 Brittle stars - 3 100 Centipede - """ - -@overload -def table( - data: dict[str, list[Any] | Array[Any]], - schema: Schema | None = None, - metadata: Mapping[Any, Any] | None = None, - nthreads: int | None = None, -) -> Table: ... -@overload -def table( - data: Collection[ArrayOrChunkedArray[Any]] - | pd.DataFrame - | SupportArrowArray - | SupportArrowStream - | SupportArrowDeviceArray, - names: list[str] | None = None, - schema: Schema | None = None, - metadata: Mapping[Any, Any] | None = None, - nthreads: int | None = None, -) -> Table: ... 
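# Editor's sketch (illustrative addition, not part of the original stub): how the
# two ``pa.table`` overloads above are exercised in practice, assuming pyarrow is
# installed. The dict overload takes column names from the mapping keys; the
# sequence overload needs an explicit ``names`` list (or a ``schema``).
import pyarrow as pa

n_legs = pa.array([2, 4, 5, 100])
animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])

# First overload: mapping of column names to arrays.
t_from_dict = pa.table({"n_legs": n_legs, "animals": animals})

# Second overload: sequence of arrays plus explicit names; both calls are
# annotated to return ``Table``.
t_from_list = pa.table([n_legs, animals], names=["n_legs", "animals"])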
-def table(*args, **kwargs): - """ - Create a pyarrow.Table from a Python data structure or sequence of arrays. - - Parameters - ---------- - data : dict, list, pandas.DataFrame, Arrow-compatible table - A mapping of strings to Arrays or Python lists, a list of arrays or - chunked arrays, a pandas DataFame, or any tabular object implementing - the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, - ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). - names : list, default None - Column names if list of arrays passed as data. Mutually exclusive with - 'schema' argument. - schema : Schema, default None - The expected schema of the Arrow Table. If not passed, will be inferred - from the data. Mutually exclusive with 'names' argument. - If passed, the output will have exactly this schema (raising an error - when columns are not found in the data and ignoring additional data not - specified in the schema, when data is a dict or DataFrame). - metadata : dict or Mapping, default None - Optional metadata for the schema (if schema not passed). - nthreads : int, default None - For pandas.DataFrame inputs: if greater than 1, convert columns to - Arrow in parallel using indicated number of threads. By default, - this follows :func:`pyarrow.cpu_count` (may use up to system CPU count - threads). - - Returns - ------- - Table - - See Also - -------- - Table.from_arrays, Table.from_pandas, Table.from_pydict - - Examples - -------- - >>> import pyarrow as pa - >>> n_legs = pa.array([2, 4, 5, 100]) - >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - >>> names = ["n_legs", "animals"] - - Construct a Table from a python dictionary: - - >>> pa.table({"n_legs": n_legs, "animals": animals}) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays: - - >>> pa.table([n_legs, animals], names=names) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from arrays with metadata: - - >>> my_metadata = {"n_legs": "Number of legs per animal"} - >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Construct a Table from pandas DataFrame: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2019, 2021], - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> pa.table(df) - pyarrow.Table - year: int64 - n_legs: int64 - animals: string - ---- - year: [[2020,2022,2019,2021]] - n_legs: [[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - - Construct a Table from pandas DataFrame with pyarrow schema: - - >>> my_schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> pa.table(df, my_schema).schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... - - Construct a Table from chunked arrays: - - >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) - >>> animals = pa.chunked_array( - ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] - ... 
) - >>> table = pa.table([n_legs, animals], names=names) - >>> table - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,2,4],[4,5,100]] - animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] - """ - -def concat_tables( - tables: Iterable[Table], - memory_pool: MemoryPool | None = None, - promote_options: Literal["none", "default", "permissive"] = "none", - **kwargs: Any, -) -> Table: - """ - Concatenate pyarrow.Table objects. - - If promote_options="none", a zero-copy concatenation will be performed. The schemas - of all the Tables must be the same (except the metadata), otherwise an - exception will be raised. The result Table will share the metadata with the - first table. - - If promote_options="default", any null type arrays will be casted to the type of other - arrays in the column of the same name. If a table is missing a particular - field, null values of the appropriate type will be generated to take the - place of the missing field. The new schema will share the metadata with the - first table. Each field in the new schema will share the metadata with the - first table which has the field defined. Note that type promotions may - involve additional allocations on the given ``memory_pool``. - - If promote_options="permissive", the behavior of default plus types will be promoted - to the common denominator that fits all the fields. - - Parameters - ---------- - tables : iterable of pyarrow.Table objects - Pyarrow tables to concatenate into a single Table. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - promote_options : str, default none - Accepts strings "none", "default" and "permissive". - **kwargs : dict, optional - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.table( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) - >>> pa.concat_tables([t1, t2]) - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[2,4,5,100],[2,4]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] - - """ - -class TableGroupBy: - """ - A grouping of columns in a table on which to perform aggregations. - - Parameters - ---------- - table : pyarrow.Table - Input table to execute the aggregation on. - keys : str or list[str] - Name of the grouped columns. - use_threads : bool, default True - Whether to use multithreading or not. When set to True (the default), - no stable ordering of the output is guaranteed. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table( - ... [ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], - ... names=["keys", "values"], - ... ) - - Grouping of columns: - - >>> pa.TableGroupBy(t, "keys") - - - Perform aggregations: - - >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - """ - - keys: str | list[str] - def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... - def aggregate( - self, - aggregations: Iterable[ - tuple[ColumnSelector, Aggregation] - | tuple[ColumnSelector, Aggregation, AggregateOptions | None] - ], - ) -> Table: - """ - Perform an aggregation over the grouped columns of the table. 
- - Parameters - ---------- - aggregations : list[tuple(str, str)] or \ -list[tuple(str, str, FunctionOptions)] - List of tuples, where each tuple is one aggregation specification - and consists of: aggregation column name followed - by function name and optionally aggregation function option. - Pass empty list to get a single row for each group. - The column name can be a string, an empty list or a list of - column names, for unary, nullary and n-ary aggregation functions - respectively. - - For the list of function names and respective aggregation - function options see :ref:`py-grouped-aggrs`. - - Returns - ------- - Table - Results of the aggregation functions. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.table([ - ... pa.array(["a", "a", "b", "b", "c"]), - ... pa.array([1, 2, 3, 4, 5]), - ... ], names=["keys", "values"]) - - Sum the column "values" over the grouped column "keys": - - >>> t.group_by("keys").aggregate([("values", "sum")]) - pyarrow.Table - keys: string - values_sum: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - - Count the rows over the grouped column "keys": - - >>> t.group_by("keys").aggregate([([], "count_all")]) - pyarrow.Table - keys: string - count_all: int64 - ---- - keys: [["a","b","c"]] - count_all: [[2,2,1]] - - Do multiple aggregations: - - >>> t.group_by("keys").aggregate([ - ... ("values", "sum"), - ... ("keys", "count") - ... ]) - pyarrow.Table - keys: string - values_sum: int64 - keys_count: int64 - ---- - keys: [["a","b","c"]] - values_sum: [[3,7,5]] - keys_count: [[2,2,1]] - - Count the number of non-null values for column "values" - over the grouped column "keys": - - >>> import pyarrow.compute as pc - >>> t.group_by(["keys"]).aggregate([ - ... ("values", "count", pc.CountOptions(mode="only_valid")) - ... ]) - pyarrow.Table - keys: string - values_count: int64 - ---- - keys: [["a","b","c"]] - values_count: [[2,2,1]] - - Get a single row for each group in column "keys": - - >>> t.group_by("keys").aggregate([]) - pyarrow.Table - keys: string - ---- - keys: [["a","b","c"]] - """ - def _table(self) -> Table: ... - @property - def _use_threads(self) -> bool: ... - -def concat_batches( - recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None -) -> RecordBatch: - """ - Concatenate pyarrow.RecordBatch objects. - - All recordbatches must share the same Schema, - the operation implies a copy of the data to merge - the arrays of the different RecordBatches. - - Parameters - ---------- - recordbatches : iterable of pyarrow.RecordBatch objects - Pyarrow record batches to concatenate into a single RecordBatch. - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool. - - Examples - -------- - >>> import pyarrow as pa - >>> t1 = pa.record_batch( - ... [ - ... pa.array([2, 4, 5, 100]), - ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), - ... ], - ... names=["n_legs", "animals"], - ... ) - >>> t2 = pa.record_batch( - ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] - ... 
) - >>> pa.concat_batches([t1, t2]) - pyarrow.RecordBatch - n_legs: int64 - animals: string - ---- - n_legs: [2,4,5,100,2,4] - animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] - - """ - -__all__ = [ - "ChunkedArray", - "chunked_array", - "_Tabular", - "RecordBatch", - "table_to_blocks", - "Table", - "record_batch", - "table", - "concat_tables", - "TableGroupBy", - "concat_batches", -] diff --git a/pyarrow-stubs/__lib_pxi/tensor.pyi b/pyarrow-stubs/__lib_pxi/tensor.pyi deleted file mode 100644 index d849abd0f1f..00000000000 --- a/pyarrow-stubs/__lib_pxi/tensor.pyi +++ /dev/null @@ -1,688 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -import numpy as np - -from pyarrow.lib import _Weakrefable -from scipy.sparse import coo_matrix, csr_matrix -from sparse import COO - -class Tensor(_Weakrefable): - """ - A n-dimensional array a.k.a Tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - - type: int32 - shape: (2, 3) - strides: (12, 4) - """ - - @classmethod - def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Create a Tensor from a numpy array. - - Parameters - ---------- - obj : numpy.ndarray - The source numpy array - dim_names : list, optional - Names of each dimension of the Tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - - type: int32 - shape: (2, 3) - strides: (12, 4) - """ - def to_numpy(self) -> np.ndarray: - """ - Convert arrow::Tensor to numpy.ndarray with zero copy - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.to_numpy() - array([[ 2, 2, 4], - [ 4, 5, 100]], dtype=int32) - """ - def equals(self, other: Tensor) -> bool: - """ - Return true if the tensors contains exactly equal data. - - Parameters - ---------- - other : Tensor - The other tensor to compare for equality. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) - >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) - >>> tensor.equals(tensor) - True - >>> tensor.equals(tensor2) - False - """ - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.dim_name(0) - 'dim1' - >>> tensor.dim_name(1) - 'dim2' - """ - @property - def dim_names(self) -> list[str]: - """ - Names of this tensor dimensions. 
- - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.dim_names - ['dim1', 'dim2'] - """ - @property - def is_mutable(self) -> bool: - """ - Is this tensor mutable or immutable. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.is_mutable - True - """ - @property - def is_contiguous(self) -> bool: - """ - Is this tensor contiguous in memory. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.is_contiguous - True - """ - @property - def ndim(self) -> int: - """ - The dimension (n) of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.ndim - 2 - """ - @property - def size(self) -> str: - """ - The size of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.size - 6 - """ - @property - def shape(self) -> tuple[int, ...]: - """ - The shape of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.shape - (2, 3) - """ - @property - def strides(self) -> tuple[int, ...]: - """ - Strides of this tensor. - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) - >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) - >>> tensor.strides - (12, 4) - """ - -class SparseCOOTensor(_Weakrefable): - @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCOOTensor - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCOOTensor - """ - - @classmethod - def from_numpy( - cls, - data: np.ndarray, - coords: np.ndarray, - shape: tuple[int, ...], - dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCOOTensor from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the rows. - coords : numpy.ndarray - Coordinates of the data. - shape : tuple - Shape of the tensor. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor - - Parameters - ---------- - obj : scipy.sparse.csr_matrix - The scipy matrix that should be converted. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self: - """ - Convert pydata/sparse.COO to arrow::SparseCOOTensor. 
- - Parameters - ---------- - obj : pydata.sparse.COO - The sparse multidimensional array that should be converted. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCOOTensor. - - Parameters - ---------- - obj : Tensor - The tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. - """ - def to_scipy(self) -> coo_matrix: - """ - Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. - """ - def to_pydata_sparse(self) -> COO: - """ - Convert arrow::SparseCOOTensor to pydata/sparse.COO. - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCOOTensor to arrow::Tensor. - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data. - - Parameters - ---------- - other : SparseCOOTensor - The other tensor to compare for equality. - """ - @property - def is_mutable(self) -> bool: ... - @property - def ndim(self) -> int: ... - @property - def size(self) -> str: ... - @property - def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Returns - ------- - str - """ - @property - def dim_names(self) -> list[str]: ... - @property - def non_zero_length(self) -> int: ... - @property - def has_canonical_format(self) -> bool: ... - -class SparseCSRMatrix(_Weakrefable): - """ - A sparse CSR matrix. - """ - - @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSRMatrix - - Parameters - ---------- - obj : numpy.ndarray - The dense numpy array that should be converted. - dim_names : list, optional - The names of the dimensions. - - Returns - ------- - pyarrow.SparseCSRMatrix - """ - @classmethod - def from_numpy( - cls, - data: np.ndarray, - indptr: np.ndarray, - indices: np.ndarray, - shape: tuple[int, ...], - dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSRMatrix from numpy.ndarrays. - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse matrix. - indptr : numpy.ndarray - Range of the rows, - The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - indices : numpy.ndarray - Column indices of the corresponding non-zero values. - shape : tuple - Shape of the matrix. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. - - Parameters - ---------- - obj : scipy.sparse.csr_matrix - The scipy matrix that should be converted. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSRMatrix. - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. - """ - def to_scipy(self) -> csr_matrix: - """ - Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. 
- """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSRMatrix to arrow::Tensor. - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data. - - Parameters - ---------- - other : SparseCSRMatrix - The other tensor to compare for equality. - """ - @property - def is_mutable(self) -> bool: ... - @property - def ndim(self) -> int: ... - @property - def size(self) -> str: ... - @property - def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Returns - ------- - str - """ - @property - def dim_names(self) -> list[str]: ... - @property - def non_zero_length(self) -> int: ... - -class SparseCSCMatrix(_Weakrefable): - """ - A sparse CSC matrix. - """ - - @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCSCMatrix - """ - @classmethod - def from_numpy( - cls, - data: np.ndarray, - indptr: np.ndarray, - indices: np.ndarray, - shape: tuple[int, ...], - dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSCMatrix from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse matrix. - indptr : numpy.ndarray - Range of the rows, - The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - indices : numpy.ndarray - Column indices of the corresponding non-zero values. - shape : tuple - Shape of the matrix. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: - """ - Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : scipy.sparse.csc_matrix - The scipy matrix that should be converted. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSCMatrix - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy - """ - def to_scipy(self) -> csr_matrix: - """ - Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSCMatrix to arrow::Tensor - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data - - Parameters - ---------- - other : SparseCSCMatrix - The other tensor to compare for equality. - """ - @property - def is_mutable(self) -> bool: ... - @property - def ndim(self) -> int: ... - @property - def size(self) -> str: ... - @property - def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Returns - ------- - str - """ - @property - def dim_names(self) -> list[str]: ... - @property - def non_zero_length(self) -> int: ... - -class SparseCSFTensor(_Weakrefable): - """ - A sparse CSF tensor. 
- - CSF is a generalization of compressed sparse row (CSR) index. - - CSF index recursively compresses each dimension of a tensor into a set - of prefix trees. Each path from a root to leaf forms one tensor - non-zero index. CSF is implemented with two arrays of buffers and one - arrays of integers. - """ - - @classmethod - def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: - """ - Convert numpy.ndarray to arrow::SparseCSFTensor - - Parameters - ---------- - obj : numpy.ndarray - Data used to populate the rows. - dim_names : list[str], optional - Names of the dimensions. - - Returns - ------- - pyarrow.SparseCSFTensor - """ - @classmethod - def from_numpy( - cls, - data: np.ndarray, - indptr: np.ndarray, - indices: np.ndarray, - shape: tuple[int, ...], - dim_names: list[str] | None = None, - ) -> Self: - """ - Create arrow::SparseCSFTensor from numpy.ndarrays - - Parameters - ---------- - data : numpy.ndarray - Data used to populate the sparse tensor. - indptr : numpy.ndarray - The sparsity structure. - Each two consecutive dimensions in a tensor correspond to - a buffer in indices. - A pair of consecutive values at `indptr[dim][i]` - `indptr[dim][i + 1]` signify a range of nodes in - `indices[dim + 1]` who are children of `indices[dim][i]` node. - indices : numpy.ndarray - Stores values of nodes. - Each tensor dimension corresponds to a buffer in indptr. - shape : tuple - Shape of the matrix. - axis_order : list, optional - the sequence in which dimensions were traversed to - produce the prefix tree. - dim_names : list, optional - Names of the dimensions. - """ - @classmethod - def from_tensor(cls, obj: Tensor) -> Self: - """ - Convert arrow::Tensor to arrow::SparseCSFTensor - - Parameters - ---------- - obj : Tensor - The dense tensor that should be converted. - """ - def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy - """ - def to_tensor(self) -> Tensor: - """ - Convert arrow::SparseCSFTensor to arrow::Tensor - """ - def equals(self, other: Self) -> bool: - """ - Return true if sparse tensors contains exactly equal data - - Parameters - ---------- - other : SparseCSFTensor - The other tensor to compare for equality. - """ - @property - def is_mutable(self) -> bool: ... - @property - def ndim(self) -> int: ... - @property - def size(self) -> str: ... - @property - def shape(self) -> tuple[int, ...]: ... - def dim_name(self, i: int) -> str: - """ - Returns the name of the i-th tensor dimension. - - Parameters - ---------- - i : int - The physical index of the tensor dimension. - - Returns - ------- - str - """ - @property - def dim_names(self) -> list[str]: ... - @property - def non_zero_length(self) -> int: ... 
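# Editor's sketch (illustrative addition, not part of the original stub): the
# sparse tensor classes above describe their conversions only in prose, so a
# minimal round-trip example, assuming pyarrow, numpy and scipy are installed:
import numpy as np
import pyarrow as pa
from scipy.sparse import csr_matrix

dense = np.array([[0, 1, 0], [2, 0, 3]], dtype=np.int64)
sp = csr_matrix(dense)

# scipy CSR -> Arrow SparseCSRMatrix, and back again.
arrow_csr = pa.SparseCSRMatrix.from_scipy(sp, dim_names=["row", "col"])
assert (arrow_csr.to_scipy() != sp).nnz == 0  # identical non-zero structure and values

# A dense Arrow Tensor can also be re-encoded as a sparse COO tensor.
tensor = pa.Tensor.from_numpy(dense)
coo = pa.SparseCOOTensor.from_tensor(tensor)
assert coo.non_zero_length == 3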
- -__all__ = [ - "Tensor", - "SparseCOOTensor", - "SparseCSRMatrix", - "SparseCSCMatrix", - "SparseCSFTensor", -] diff --git a/pyarrow-stubs/__lib_pxi/types.pyi b/pyarrow-stubs/__lib_pxi/types.pyi deleted file mode 100644 index 7fe6c36e332..00000000000 --- a/pyarrow-stubs/__lib_pxi/types.pyi +++ /dev/null @@ -1,4413 +0,0 @@ -import datetime as dt -import sys - -from collections.abc import Mapping, Sequence -from decimal import Decimal - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -from typing import Any, Generic, Iterable, Iterator, Literal, overload - -import numpy as np -import pandas as pd - -from pyarrow._stubs_typing import SupportArrowSchema -from pyarrow.lib import ( - Array, - ChunkedArray, - ExtensionArray, - MemoryPool, - MonthDayNano, - Table, -) -from typing_extensions import TypeVar, deprecated - -from .io import Buffer -from .scalar import ExtensionScalar - -_AsPyType = TypeVar("_AsPyType") -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) - -class _Weakrefable: ... -class _Metadata(_Weakrefable): ... - -class DataType(_Weakrefable): - """ - Base class of all Arrow data types. - - Each data type is an *instance* of this class. - - Examples - -------- - Instance of int64 type: - - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - """ - def field(self, i: int) -> Field: - """ - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Field - """ - @property - def id(self) -> int: ... - @property - def bit_width(self) -> int: - """ - Bit width for fixed width type. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().bit_width - 64 - """ - @property - def byte_width(self) -> int: - """ - Byte width for fixed width type. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().byte_width - 8 - """ - @property - def num_fields(self) -> int: - """ - The number of child fields. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> pa.int64().num_fields - 0 - >>> pa.list_(pa.string()) - ListType(list) - >>> pa.list_(pa.string()).num_fields - 1 - >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) - >>> struct.num_fields - 2 - """ - @property - def num_buffers(self) -> int: - """ - Number of data buffers required to construct Array type - excluding children. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().num_buffers - 2 - >>> pa.string().num_buffers - 3 - """ - def __hash__(self) -> int: ... - def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: - """ - Return true if type is equivalent to passed value. - - Parameters - ---------- - other : DataType or string convertible to DataType - check_metadata : bool - Whether nested Field metadata equality should be checked as well. - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().equals(pa.string()) - False - >>> pa.int64().equals(pa.int64()) - True - """ - def to_pandas_dtype(self) -> np.generic: - """ - Return the equivalent NumPy / Pandas dtype. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.int64().to_pandas_dtype() - - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. 
- """ - @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import DataType from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. - """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ - @classmethod - def _import_from_c_capsule(cls, schema) -> Self: - """ - Import a DataType from a ArrowSchema PyCapsule - - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. - """ - -class _BasicDataType(DataType, Generic[_AsPyType]): ... -class NullType(_BasicDataType[None]): ... -class BoolType(_BasicDataType[bool]): ... -class UInt8Type(_BasicDataType[int]): ... -class Int8Type(_BasicDataType[int]): ... -class UInt16Type(_BasicDataType[int]): ... -class Int16Type(_BasicDataType[int]): ... -class Uint32Type(_BasicDataType[int]): ... -class Int32Type(_BasicDataType[int]): ... -class UInt64Type(_BasicDataType[int]): ... -class Int64Type(_BasicDataType[int]): ... -class Float16Type(_BasicDataType[float]): ... -class Float32Type(_BasicDataType[float]): ... -class Float64Type(_BasicDataType[float]): ... -class Date32Type(_BasicDataType[dt.date]): ... -class Date64Type(_BasicDataType[dt.date]): ... -class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... -class StringType(_BasicDataType[str]): ... -class LargeStringType(_BasicDataType[str]): ... -class StringViewType(_BasicDataType[str]): ... -class BinaryType(_BasicDataType[bytes]): ... -class LargeBinaryType(_BasicDataType[bytes]): ... -class BinaryViewType(_BasicDataType[bytes]): ... - -_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) -_Tz = TypeVar("_Tz", str, None, default=None) - -class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): - """ - Concrete class for timestamp data types. - - Examples - -------- - >>> import pyarrow as pa - - Create an instance of timestamp type: - - >>> pa.timestamp("us") - TimestampType(timestamp[us]) - - Create an instance of timestamp type with timezone: - - >>> pa.timestamp("s", tz="UTC") - TimestampType(timestamp[s, tz=UTC]) - """ - @property - def unit(self) -> _Unit: - """ - The timestamp unit ('s', 'ms', 'us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.timestamp("us") - >>> t.unit - 'us' - """ - @property - def tz(self) -> _Tz: - """ - The timestamp time zone, if any, or None. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.timestamp("s", tz="UTC") - >>> t.tz - 'UTC' - """ - -_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) - -class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): - """ - Concrete class for time32 data types. - - Supported time unit resolutions are 's' [second] - and 'ms' [millisecond]. - - Examples - -------- - Create an instance of time32 type: - - >>> import pyarrow as pa - >>> pa.time32("ms") - Time32Type(time32[ms]) - """ - @property - def unit(self) -> _Time32Unit: - """ - The time unit ('s' or 'ms'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.time32("ms") - >>> t.unit - 'ms' - """ - -_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) - -class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): - """ - Concrete class for time64 data types. - - Supported time unit resolutions are 'us' [microsecond] - and 'ns' [nanosecond]. 
- - Examples - -------- - Create an instance of time64 type: - - >>> import pyarrow as pa - >>> pa.time64("us") - Time64Type(time64[us]) - """ - @property - def unit(self) -> _Time64Unit: - """ - The time unit ('us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.time64("us") - >>> t.unit - 'us' - """ - -class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): - """ - Concrete class for duration data types. - - Examples - -------- - Create an instance of duration type: - - >>> import pyarrow as pa - >>> pa.duration("s") - DurationType(duration[s]) - """ - @property - def unit(self) -> _Unit: - """ - The duration unit ('s', 'ms', 'us' or 'ns'). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.duration("s") - >>> t.unit - 's' - """ - -class FixedSizeBinaryType(_BasicDataType[Decimal]): - """ - Concrete class for fixed-size binary data types. - - Examples - -------- - Create an instance of fixed-size binary type: - - >>> import pyarrow as pa - >>> pa.binary(3) - FixedSizeBinaryType(fixed_size_binary[3]) - """ - -_Precision = TypeVar("_Precision", default=Any) -_Scale = TypeVar("_Scale", default=Any) - -class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal32 data types. - - Examples - -------- - Create an instance of decimal32 type: - - >>> import pyarrow as pa - >>> pa.decimal32(5, 2) - Decimal32Type(decimal32(5, 2)) - """ - @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal32(5, 2) - >>> t.precision - 5 - """ - @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal32(5, 2) - >>> t.scale - 2 - """ - -class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal64 data types. - - Examples - -------- - Create an instance of decimal64 type: - - >>> import pyarrow as pa - >>> pa.decimal64(5, 2) - Decimal64Type(decimal64(5, 2)) - """ - @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal64(5, 2) - >>> t.precision - 5 - """ - @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal64(5, 2) - >>> t.scale - 2 - """ - -class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal128 data types. - - Examples - -------- - Create an instance of decimal128 type: - - >>> import pyarrow as pa - >>> pa.decimal128(5, 2) - Decimal128Type(decimal128(5, 2)) - """ - @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal128(5, 2) - >>> t.precision - 5 - """ - @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal128(5, 2) - >>> t.scale - 2 - """ - -class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): - """ - Concrete class for decimal256 data types. 
- - Examples - -------- - Create an instance of decimal256 type: - - >>> import pyarrow as pa - >>> pa.decimal256(76, 38) - Decimal256Type(decimal256(76, 38)) - """ - @property - def precision(self) -> _Precision: - """ - The decimal precision, in number of decimal digits (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal256(76, 38) - >>> t.precision - 76 - """ - @property - def scale(self) -> _Scale: - """ - The decimal scale (an integer). - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.decimal256(76, 38) - >>> t.scale - 38 - """ - -class ListType(DataType, Generic[_DataTypeT]): - """ - Concrete class for list data types. - - Examples - -------- - Create an instance of ListType: - - >>> import pyarrow as pa - >>> pa.list_(pa.string()) - ListType(list) - """ - @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.string()).value_field - pyarrow.Field - """ - @property - def value_type(self) -> _DataTypeT: - """ - The data type of list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.string()).value_type - DataType(string) - """ - -class LargeListType(DataType, Generic[_DataTypeT]): - """ - Concrete class for large list data types - (like ListType, but with 64-bit offsets). - - Examples - -------- - Create an instance of LargeListType: - - >>> import pyarrow as pa - >>> pa.large_list(pa.string()) - LargeListType(large_list) - """ - @property - def value_field(self) -> Field[_DataTypeT]: ... - @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list(pa.string()).value_type - DataType(string) - """ - -class ListViewType(DataType, Generic[_DataTypeT]): - """ - Concrete class for list view data types. - - Examples - -------- - Create an instance of ListViewType: - - >>> import pyarrow as pa - >>> pa.list_view(pa.string()) - ListViewType(list_view) - """ - @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_view(pa.string()).value_field - pyarrow.Field - """ - @property - def value_type(self) -> _DataTypeT: - """ - The data type of list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_view(pa.string()).value_type - DataType(string) - """ - -class LargeListViewType(DataType, Generic[_DataTypeT]): - """ - Concrete class for large list view data types - (like ListViewType, but with 64-bit offsets). - - Examples - -------- - Create an instance of LargeListViewType: - - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()) - LargeListViewType(large_list_view) - """ - @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for large list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()).value_field - pyarrow.Field - """ - @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list view values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.large_list_view(pa.string()).value_type - DataType(string) - """ - -class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): - """ - Concrete class for fixed size list data types. 
- - Examples - -------- - Create an instance of FixedSizeListType: - - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2) - FixedSizeListType(fixed_size_list[2]) - """ - @property - def value_field(self) -> Field[_DataTypeT]: - """ - The field for list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).value_field - pyarrow.Field - """ - @property - def value_type(self) -> _DataTypeT: - """ - The data type of large list values. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).value_type - DataType(int32) - """ - @property - def list_size(self) -> _Size: - """ - The size of the fixed size lists. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.list_(pa.int32(), 2).list_size - 2 - """ - -class DictionaryMemo(_Weakrefable): - """ - Tracking container for dictionary-encoded fields. - """ - -_IndexT = TypeVar( - "_IndexT", - UInt8Type, - Int8Type, - UInt16Type, - Int16Type, - Uint32Type, - Int32Type, - UInt64Type, - Int64Type, -) -_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) -_ValueT = TypeVar("_ValueT", bound=DataType) -_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) - -class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): - """ - Concrete class for dictionary data types. - - Examples - -------- - Create an instance of dictionary type: - - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()) - DictionaryType(dictionary) - """ - - @property - def ordered(self) -> _Ordered: - """ - Whether the dictionary is ordered, i.e. whether the ordering of values - in the dictionary is important. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()).ordered - False - """ - @property - def index_type(self) -> _IndexT: - """ - The data type of dictionary indices (a signed integer type). - - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int16(), pa.utf8()).index_type - DataType(int16) - """ - @property - def value_type(self) -> _BasicValueT: - """ - The dictionary value type. - - The dictionary values are found in an instance of DictionaryArray. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.dictionary(pa.int16(), pa.utf8()).value_type - DataType(string) - """ - -_K = TypeVar("_K", bound=DataType) - -class MapType(DataType, Generic[_K, _ValueT, _Ordered]): - """ - Concrete class for map data types. - - Examples - -------- - Create an instance of MapType: - - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()) - MapType(map) - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) - MapType(map) - """ - - @property - def key_field(self) -> Field[_K]: - """ - The field for keys in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).key_field - pyarrow.Field - """ - @property - def key_type(self) -> _K: - """ - The data type of keys in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).key_type - DataType(string) - """ - @property - def item_field(self) -> Field[_ValueT]: - """ - The field for items in the map entries. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).item_field - pyarrow.Field - """ - @property - def item_type(self) -> _ValueT: - """ - The data type of items in the map entries. 
- - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()).item_type - DataType(int32) - """ - @property - def keys_sorted(self) -> _Ordered: - """ - Should the entries be sorted according to keys. - - Examples - -------- - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted - True - """ - -_Size = TypeVar("_Size", default=int) - -class StructType(DataType): - """ - Concrete class for struct data types. - - ``StructType`` supports direct indexing using ``[...]`` (implemented via - ``__getitem__``) to access its fields. - It will return the struct field with the given index or name. - - Examples - -------- - >>> import pyarrow as pa - - Accessing fields using direct indexing: - - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) - >>> struct_type[0] - pyarrow.Field - >>> struct_type["y"] - pyarrow.Field - - Accessing fields using ``field()``: - - >>> struct_type.field(1) - pyarrow.Field - >>> struct_type.field("x") - pyarrow.Field - - # Creating a schema from the struct type's fields: - >>> pa.schema(list(struct_type)) - x: int32 - y: string - """ - def get_field_index(self, name: str) -> int: - """ - Return index of the unique field with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - index : int - The index of the field with the given name; -1 if the - name isn't found or there are several fields with the given - name. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) - - Index of the field with a name 'y': - - >>> struct_type.get_field_index("y") - 1 - - Index of the field that does not exist: - - >>> struct_type.get_field_index("z") - -1 - """ - def field(self, i: int | str) -> Field: - """ - Select a field by its column name or numeric index. - - Parameters - ---------- - i : int or str - - Returns - ------- - pyarrow.Field - - Examples - -------- - - >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) - - Select the second field: - - >>> struct_type.field(1) - pyarrow.Field - - Select the field named 'x': - - >>> struct_type.field("x") - pyarrow.Field - """ - def get_all_field_indices(self, name: str) -> list[int]: - """ - Return sorted list of indices for the fields with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - indices : List[int] - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) - >>> struct_type.get_all_field_indices("x") - [0] - """ - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[Field]: ... - __getitem__ = field # pyright: ignore[reportUnknownVariableType] - @property - def names(self) -> list[str]: - """ - Lists the field names. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) - >>> struct_type.names - ['a', 'b', 'c'] - """ - @property - def fields(self) -> list[Field]: - """ - Lists all fields within the StructType. - - Examples - -------- - >>> import pyarrow as pa - >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) - >>> struct_type.fields - [pyarrow.Field, pyarrow.Field, pyarrow.Field] - """ - -class UnionType(DataType): - """ - Base class for union data types. 
- - Examples - -------- - Create an instance of a dense UnionType using ``pa.union``: - - >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_DENSE, - ... ), - ... ) - (DenseUnionType(dense_union),) - - Create an instance of a dense UnionType using ``pa.dense_union``: - - >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - DenseUnionType(dense_union) - - Create an instance of a sparse UnionType using ``pa.union``: - - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_SPARSE, - ... ), - ... ) - (SparseUnionType(sparse_union),) - - Create an instance of a sparse UnionType using ``pa.sparse_union``: - - >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - SparseUnionType(sparse_union) - """ - @property - def mode(self) -> Literal["sparse", "dense"]: - """ - The mode of the union ("dense" or "sparse"). - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - >>> union.mode - 'sparse' - """ - @property - def type_codes(self) -> list[int]: - """ - The type code to indicate each data type in this union. - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - >>> union.type_codes - [0, 1] - """ - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[Field]: ... - def field(self, i: int) -> Field: - """ - Return a child field by its numeric index. - - Parameters - ---------- - i : int - - Returns - ------- - pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - >>> union[0] - pyarrow.Field - """ - __getitem__ = field # pyright: ignore[reportUnknownVariableType] - -class SparseUnionType(UnionType): - """ - Concrete class for sparse union types. - - Examples - -------- - Create an instance of a sparse UnionType using ``pa.union``: - - >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_SPARSE, - ... ), - ... ) - (SparseUnionType(sparse_union),) - - Create an instance of a sparse UnionType using ``pa.sparse_union``: - - >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - SparseUnionType(sparse_union) - """ - @property - def mode(self) -> Literal["sparse"]: ... - -class DenseUnionType(UnionType): - """ - Concrete class for dense union types. - - Examples - -------- - Create an instance of a dense UnionType using ``pa.union``: - - >>> import pyarrow as pa - >>> ( - ... pa.union( - ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], - ... mode=pa.lib.UnionMode_DENSE, - ... ), - ... ) - (DenseUnionType(dense_union),) - - Create an instance of a dense UnionType using ``pa.dense_union``: - - >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) - DenseUnionType(dense_union) - """ - - @property - def mode(self) -> Literal["dense"]: ... - -_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) - -class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): - """ - Concrete class for run-end encoded types. - """ - @property - def run_end_type(self) -> _RunEndType: ... - @property - def value_type(self) -> _BasicValueT: ... 
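To show how the union and run-end-encoded annotations above map onto runtime objects, here is a small sketch; pa.run_end_encoded assumes a pyarrow build recent enough to ship that factory:

import pyarrow as pa

# Union types report their mode and type codes and support child-field lookup.
children = [pa.field("a", pa.binary(10)), pa.field("b", pa.string())]
sparse = pa.sparse_union(children)
dense = pa.dense_union(children)
assert sparse.mode == "sparse" and dense.mode == "dense"
assert sparse.type_codes == [0, 1]
assert sparse.field(1).name == "b"

# Run-end encoded types pair a run-end integer type with a value type.
ree = pa.run_end_encoded(pa.int32(), pa.utf8())
assert ree.run_end_type == pa.int32()
assert ree.value_type == pa.utf8()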
- -_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) - -class BaseExtensionType(DataType): - """ - Concrete base class for extension types. - """ - def __arrow_ext_class__(self) -> type[ExtensionArray]: - """ - The associated array extension class - """ - def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: - """ - The associated scalar class - """ - @property - def extension_name(self) -> str: - """ - The extension type name. - """ - @property - def storage_type(self) -> DataType: - """ - The underlying storage type. - """ - def wrap_array(self, storage: _StorageT) -> _StorageT: ... - -class ExtensionType(BaseExtensionType): - """ - Concrete base class for Python-defined extension types. - - Parameters - ---------- - storage_type : DataType - The underlying storage type for the extension type. - extension_name : str - A unique name distinguishing this extension type. The name will be - used when deserializing IPC data. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... ("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Create an instance of RationalType extension type: - - >>> rational_type = RationalType(pa.int32()) - - Inspect the extension type: - - >>> rational_type.extension_name - 'my_package.rational' - >>> rational_type.storage_type - StructType(struct) - - Wrap an array as an extension array: - - >>> storage_array = pa.array( - ... [ - ... {"numer": 10, "denom": 17}, - ... {"numer": 20, "denom": 13}, - ... ], - ... type=rational_type.storage_type, - ... ) - >>> rational_array = rational_type.wrap_array(storage_array) - >>> rational_array - - -- is_valid: all not null - -- child 0 type: int32 - [ - 10, - 20 - ] - -- child 1 type: int32 - [ - 17, - 13 - ] - - Or do the same with creating an ExtensionArray: - - >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) - >>> rational_array - - -- is_valid: all not null - -- child 0 type: int32 - [ - 10, - 20 - ] - -- child 1 type: int32 - [ - 17, - 13 - ] - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - - Note that even though we registered the concrete type - ``RationalType(pa.int64())``, PyArrow will be able to deserialize - ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer - will reference the name ``my_package.rational`` and the ``@classmethod`` - ``__arrow_ext_deserialize__``. - """ - - def __init__(self, storage_type: DataType, extension_name: str) -> None: ... - def __arrow_ext_serialize__(self) -> bytes: - """ - Serialized representation of metadata to reconstruct the type object. 
- - This method should return a bytes object, and those serialized bytes - are stored in the custom metadata of the Field holding an extension - type in an IPC message. - The bytes are passed to ``__arrow_ext_deserialize`` and should hold - sufficient information to reconstruct the data type instance. - """ - @classmethod - def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: - """ - Return an extension type instance from the storage type and serialized - metadata. - - This method should return an instance of the ExtensionType subclass - that matches the passed storage type and serialized metadata (the - return value of ``__arrow_ext_serialize__``). - """ - -class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): - """ - Concrete class for fixed shape tensor extension type. - - Examples - -------- - Create an instance of fixed shape tensor extension type: - - >>> import pyarrow as pa - >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) - FixedShapeTensorType(extension) - - Create an instance of fixed shape tensor extension type with - permutation: - - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - """ - @property - def value_type(self) -> _ValueT: - """ - Data type of an individual tensor. - """ - @property - def shape(self) -> list[int]: - """ - Shape of the tensors. - """ - @property - def dim_names(self) -> list[str] | None: - """ - Explicit names of the dimensions. - """ - @property - def permutation(self) -> list[int] | None: - """ - Indices of the dimensions ordering. - """ - -class Bool8Type(BaseExtensionType): - """ - Concrete class for bool8 extension type. - - Bool8 is an alternate representation for boolean - arrays using 8 bits instead of 1 bit per value. The underlying - storage type is int8. - - Examples - -------- - Create an instance of bool8 extension type: - - >>> import pyarrow as pa - >>> pa.bool8() - Bool8Type(extension) - """ - -class UuidType(BaseExtensionType): - """ - Concrete class for UUID extension type. - """ - -class JsonType(BaseExtensionType): - """ - Concrete class for JSON extension type. - - Examples - -------- - Define the extension type for JSON array - - >>> import pyarrow as pa - >>> json_type = pa.json_(pa.large_utf8()) - - Create an extension array - - >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] - >>> storage = pa.array(arr, pa.large_utf8()) - >>> pa.ExtensionArray.from_storage(json_type, storage) - - [ - null, - "{ "id":30, "values":["a", "b"] }" - ] - """ - -class OpaqueType(BaseExtensionType): - """ - Concrete class for opaque extension type. - - Opaque is a placeholder for a type from an external (often non-Arrow) - system that could not be interpreted. - - Examples - -------- - Create an instance of opaque extension type: - - >>> import pyarrow as pa - >>> pa.opaque(pa.int32(), "geometry", "postgis") - OpaqueType(extension) - """ - @property - def type_name(self) -> str: - """ - The name of the type in the external system. - """ - @property - def vendor_name(self) -> str: - """ - The name of the external system. - """ - -@deprecated( - "This class is deprecated and its deserialization is disabled by default. " - ":class:`ExtensionType` is recommended instead." -) -class PyExtensionType(ExtensionType): - """ - Concrete base class for Python-defined extension types based on pickle - for (de)serialization. - - .. warning:: - This class is deprecated and its deserialization is disabled by default. 
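The canonical extension types stubbed above (fixed shape tensor, opaque, and friends) can be inspected like this; a sketch only, and it assumes a pyarrow release new enough to provide these factories:

import pyarrow as pa

# fixed_shape_tensor mirrors the docstring example above; opaque records the
# external type and vendor names of a foreign type that could not be interpreted.
tensor_ty = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1])
assert tensor_ty.value_type == pa.int8()
assert tensor_ty.shape == [2, 2, 3]
assert tensor_ty.permutation == [0, 2, 1]

opaque_ty = pa.opaque(pa.int32(), "geometry", "postgis")
assert opaque_ty.type_name == "geometry"
assert opaque_ty.vendor_name == "postgis"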
- :class:`ExtensionType` is recommended instead. - - Parameters - ---------- - storage_type : DataType - The storage type for which the extension is built. - """ - def __init__(self, storage_type: DataType) -> None: ... - @classmethod - def set_auto_load(cls, value: bool) -> None: - """ - Enable or disable auto-loading of serialized PyExtensionType instances. - - Parameters - ---------- - value : bool - Whether to enable auto-loading. - """ - -class UnknownExtensionType(PyExtensionType): # type: ignore - """ - A concrete class for Python-defined extension types that refer to - an unknown Python implementation. - - Parameters - ---------- - storage_type : DataType - The storage type for which the extension is built. - serialized : bytes - The serialised output. - """ - def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... - -def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore - """ - Register a Python extension type. - - Registration is based on the extension name (so different registered types - need unique extension names). Registration needs an extension type - instance, but then works for any instance of the same subclass regardless - of parametrization of the type. - - Parameters - ---------- - ext_type : BaseExtensionType instance - The ExtensionType subclass to register. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... ("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - """ - -def unregister_extension_type(type_name: str) -> None: - """ - Unregister a Python extension type. - - Parameters - ---------- - type_name : str - The name of the ExtensionType subclass to unregister. - - Examples - -------- - Define a RationalType extension type subclassing ExtensionType: - - >>> import pyarrow as pa - >>> class RationalType(pa.ExtensionType): - ... def __init__(self, data_type: pa.DataType): - ... if not pa.types.is_integer(data_type): - ... raise TypeError(f"data_type must be an integer type not {data_type}") - ... super().__init__( - ... pa.struct( - ... [ - ... ("numer", data_type), - ... ("denom", data_type), - ... ], - ... ), - ... # N.B. This name does _not_ reference `data_type` so deserialization - ... # will work for _any_ integer `data_type` after registration - ... "my_package.rational", - ... ) - ... def __arrow_ext_serialize__(self) -> bytes: - ... # No parameters are necessary - ... return b"" - ... @classmethod - ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass - ... 
return RationalType(storage_type[0].type) - - Register the extension type: - - >>> pa.register_extension_type(RationalType(pa.int64())) - - Unregister the extension type: - - >>> pa.unregister_extension_type("my_package.rational") - """ - -class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): - """ - KeyValueMetadata - - Parameters - ---------- - __arg0__ : dict - A dict of the key-value metadata - **kwargs : optional - additional key-value metadata - """ - def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... - def equals(self, other: KeyValueMetadata) -> bool: ... - def __len__(self) -> int: ... - def __contains__(self, __key: object) -> bool: ... - def __getitem__(self, __key: Any) -> Any: ... - def __iter__(self) -> Iterator[bytes]: ... - def get_all(self, key: str) -> list[bytes]: ... - def to_dict(self) -> dict[bytes, bytes]: - """ - Convert KeyValueMetadata to dict. If a key occurs twice, the value for - the first one is returned - """ - -def ensure_metadata( - meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False -) -> KeyValueMetadata | None: ... - -class Field(_Weakrefable, Generic[_DataTypeT]): - """ - A named field, with a data type, nullability, and optional metadata. - - Notes - ----- - Do not use this class's constructor directly; use pyarrow.field - - Examples - -------- - Create an instance of pyarrow.Field: - - >>> import pyarrow as pa - >>> pa.field("key", pa.int32()) - pyarrow.Field - >>> pa.field("key", pa.int32(), nullable=False) - pyarrow.Field - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) - >>> field - pyarrow.Field - >>> field.metadata - {b'key': b'Something important'} - - Use the field to create a struct type: - - >>> pa.struct([field]) - StructType(struct) - """ - - def equals(self, other: Field, check_metadata: bool = False) -> bool: - """ - Test if this field is equal to the other - - Parameters - ---------- - other : pyarrow.Field - check_metadata : bool, default False - Whether Field metadata equality should be checked as well. - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field("key", pa.int32()) - >>> f2 = pa.field("key", pa.int32(), nullable=False) - >>> f1.equals(f2) - False - >>> f1.equals(f1) - True - """ - def __hash__(self) -> int: ... - @property - def nullable(self) -> bool: - """ - The field nullability. - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field("key", pa.int32()) - >>> f2 = pa.field("key", pa.int32(), nullable=False) - >>> f1.nullable - True - >>> f2.nullable - False - """ - @property - def name(self) -> str: - """ - The field name. - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) - >>> field.name - 'key' - """ - @property - def metadata(self) -> dict[bytes, bytes] | None: - """ - The field metadata (if any is set). - - Returns - ------- - metadata : dict or None - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) - >>> field.metadata - {b'key': b'Something important'} - """ - @property - def type(self) -> _DataTypeT: ... 
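As a hedged illustration of the Field and KeyValueMetadata signatures annotated here (not part of the patch, just the public pyarrow behaviour the stubs describe):

import pyarrow as pa

# Field metadata keys and values are stored as bytes regardless of how they are passed.
field = pa.field("key", pa.int32(), nullable=False, metadata={"origin": "sensor"})
assert field.name == "key"
assert field.nullable is False
assert field.type == pa.int32()
assert field.metadata == {b"origin": b"sensor"}

# KeyValueMetadata is a bytes -> bytes mapping; it accepts a dict and/or keyword pairs.
kv = pa.KeyValueMetadata({"a": "1"}, b="2")
assert kv.to_dict() == {b"a": b"1", b"b": b"2"}
assert kv.equals(pa.KeyValueMetadata({"a": "1", "b": "2"}))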
- def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: - """ - Add metadata as dict of string keys and values to Field - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) - - Create new field by adding metadata to existing one: - - >>> field_new = field.with_metadata({"key": "Something important"}) - >>> field_new - pyarrow.Field - >>> field_new.metadata - {b'key': b'Something important'} - """ - def remove_metadata(self) -> Self: - """ - Create new field without metadata, if any - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) - >>> field.metadata - {b'key': b'Something important'} - - Create new field by removing the metadata from the existing one: - - >>> field_new = field.remove_metadata() - >>> field_new.metadata - """ - def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: - """ - A copy of this field with the replaced type - - Parameters - ---------- - new_type : pyarrow.DataType - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) - >>> field - pyarrow.Field - - Create new field by replacing type of an existing one: - - >>> field_new = field.with_type(pa.int64()) - >>> field_new - pyarrow.Field - """ - def with_name(self, name: str) -> Self: - """ - A copy of this field with the replaced name - - Parameters - ---------- - name : str - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) - >>> field - pyarrow.Field - - Create new field by replacing the name of an existing one: - - >>> field_new = field.with_name("lock") - >>> field_new - pyarrow.Field - """ - def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: - """ - A copy of this field with the replaced nullability - - Parameters - ---------- - nullable : bool - - Returns - ------- - field: pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> field = pa.field("key", pa.int32()) - >>> field - pyarrow.Field - >>> field.nullable - True - - Create new field by replacing the nullability of an existing one: - - >>> field_new = field.with_nullable(False) - >>> field_new - pyarrow.Field - >>> field_new.nullable - False - """ - def flatten(self) -> list[Field]: - """ - Flatten this field. If a struct field, individual child fields - will be returned with their names prefixed by the parent's name. - - Returns - ------- - fields : List[pyarrow.Field] - - Examples - -------- - >>> import pyarrow as pa - >>> f1 = pa.field("bar", pa.float64(), nullable=False) - >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) - >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) - - Flatten a struct field: - - >>> ff - pyarrow.Field not null> - >>> ff.flatten() - [pyarrow.Field, pyarrow.Field] - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. 
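The with_* helpers and flatten() stubbed above all return copies; a brief sketch against the public pyarrow API:

import pyarrow as pa

field = pa.field("key", pa.int32())
renamed = field.with_name("lock")
retyped = field.with_type(pa.int64())
required = field.with_nullable(False)
assert (renamed.name, retyped.type, required.nullable) == ("lock", pa.int64(), False)
assert field == pa.field("key", pa.int32())  # the original Field is left untouched

# flatten() prefixes struct child names with the parent field name.
parent = pa.field("ff", pa.struct([pa.field("bar", pa.float64()), pa.field("foo", pa.int32())]))
assert [f.name for f in parent.flatten()] == ["ff.bar", "ff.foo"]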
- """ - @classmethod - def _import_from_c(cls, in_ptr: int) -> Self: - """ - Import Field from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. - """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ - @classmethod - def _import_from_c_capsule(cls, schema) -> Self: - """ - Import a Field from a ArrowSchema PyCapsule - - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. - """ - -class Schema(_Weakrefable): - """ - A named collection of types a.k.a schema. A schema defines the - column names and types in a record batch or table data structure. - They also contain metadata about the columns. For example, schemas - converted from Pandas contain metadata about their original Pandas - types so they can be converted back to the same types. - - Warnings - -------- - Do not call this class's constructor directly. Instead use - :func:`pyarrow.schema` factory function which makes a new Arrow - Schema object. - - Examples - -------- - Create a new Arrow Schema object: - - >>> import pyarrow as pa - >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) - some_int: int32 - some_string: string - - Create Arrow Schema with metadata: - - >>> pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - - def __len__(self) -> int: ... - def __getitem__(self, key: str) -> Field: ... - _field = __getitem__ # pyright: ignore[reportUnknownVariableType] - def __iter__(self) -> Iterator[Field]: ... - def __hash__(self) -> int: ... - def __sizeof__(self) -> int: ... - @property - def pandas_metadata(self) -> dict: - """ - Return deserialized-from-JSON pandas metadata field (if it exists) - - Examples - -------- - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_legs": [2, 4, 5, 100], - ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> schema = pa.Table.from_pandas(df).schema - - Select pandas metadata field from Arrow Schema: - - >>> schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... - """ - @property - def names(self) -> list[str]: - """ - The schema's field names. - - Returns - ------- - list of str - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Get the names of the schema's fields: - - >>> schema.names - ['n_legs', 'animals'] - """ - @property - def types(self) -> list[DataType]: - """ - The schema's field types. - - Returns - ------- - list of DataType - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Get the types of the schema's fields: - - >>> schema.types - [DataType(int64), DataType(string)] - """ - @property - def metadata(self) -> dict[bytes, bytes]: - """ - The schema's metadata (if any is set). - - Returns - ------- - metadata: dict or None - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... 
metadata={"n_legs": "Number of legs per animal"}, - ... ) - - Get the metadata of the schema's fields: - - >>> schema.metadata - {b'n_legs': b'Number of legs per animal'} - """ - def empty_table(self) -> Table: - """ - Provide an empty table according to the schema. - - Returns - ------- - table: pyarrow.Table - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Create an empty table with schema's fields: - - >>> schema.empty_table() - pyarrow.Table - n_legs: int64 - animals: string - ---- - n_legs: [[]] - animals: [[]] - """ - def equals(self, other: Schema, check_metadata: bool = False) -> bool: - """ - Test if this schema is equal to the other - - Parameters - ---------- - other : pyarrow.Schema - check_metadata : bool, default False - Key/value metadata must be equal too - - Returns - ------- - is_equal : bool - - Examples - -------- - >>> import pyarrow as pa - >>> schema1 = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) - - Test two equal schemas: - - >>> schema1.equals(schema1) - True - - Test two unequal schemas: - - >>> schema1.equals(schema2) - False - """ - @classmethod - def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: - """ - Returns implied schema from dataframe - - Parameters - ---------- - df : pandas.DataFrame - preserve_index : bool, default True - Whether to store the index as an additional column (or columns, for - MultiIndex) in the resulting `Table`. - The default of None will store the index as a column, except for - RangeIndex which is stored as metadata only. Use - ``preserve_index=True`` to force it to be stored as a column. - - Returns - ------- - pyarrow.Schema - - Examples - -------- - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) - - Create an Arrow Schema from the schema of a pandas dataframe: - - >>> pa.Schema.from_pandas(df) - int: int64 - str: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, ... - """ - def field(self, i: int | str | bytes) -> Field: - """ - Select a field by its column name or numeric index. - - Parameters - ---------- - i : int or string - - Returns - ------- - pyarrow.Field - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Select the second field: - - >>> schema.field(1) - pyarrow.Field - - Select the field of the column named 'n_legs': - - >>> schema.field("n_legs") - pyarrow.Field - """ - @deprecated("Use 'field' instead") - def field_by_name(self, name: str) -> Field: - """ - DEPRECATED - - Parameters - ---------- - name : str - - Returns - ------- - field: pyarrow.Field - """ - def get_field_index(self, name: str) -> int: - """ - Return index of the unique field with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - index : int - The index of the field with the given name; -1 if the - name isn't found or there are several fields with the given - name. 
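A short sketch of the Schema lookup methods annotated above, including the documented -1 result for ambiguous names (illustrative only):

import pyarrow as pa

schema = pa.schema(
    [
        pa.field("n_legs", pa.int64()),
        pa.field("animals", pa.string()),
        pa.field("animals", pa.bool_()),
    ]
)
assert schema.field(0).name == "n_legs"
assert schema.field("n_legs").type == pa.int64()
assert schema.get_field_index("n_legs") == 0
assert schema.get_field_index("animals") == -1  # ambiguous: two fields share the name
assert schema.get_all_field_indices("animals") == [1, 2]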
- - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Get the index of the field named 'animals': - - >>> schema.get_field_index("animals") - 1 - - Index in case of several fields with the given name: - - >>> schema = pa.schema( - ... [ - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... pa.field("animals", pa.bool_()), - ... ], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> schema.get_field_index("animals") - -1 - """ - def get_all_field_indices(self, name: str) -> list[int]: - """ - Return sorted list of indices for the fields with the given name. - - Parameters - ---------- - name : str - The name of the field to look up. - - Returns - ------- - indices : List[int] - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema( - ... [ - ... pa.field("n_legs", pa.int64()), - ... pa.field("animals", pa.string()), - ... pa.field("animals", pa.bool_()), - ... ] - ... ) - - Get the indexes of the fields named 'animals': - - >>> schema.get_all_field_indices("animals") - [1, 2] - """ - def append(self, field: Field) -> Schema: - """ - Append a field at the end of the schema. - - In contrast to Python's ``list.append()`` it does return a new - object, leaving the original Schema unmodified. - - Parameters - ---------- - field : Field - - Returns - ------- - schema: Schema - New object with appended field. - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Append a field 'extra' at the end of the schema: - - >>> schema_new = schema.append(pa.field("extra", pa.bool_())) - >>> schema_new - n_legs: int64 - animals: string - extra: bool - - Original schema is unmodified: - - >>> schema - n_legs: int64 - animals: string - """ - def insert(self, i: int, field: Field) -> Schema: - """ - Add a field at position i to the schema. - - Parameters - ---------- - i : int - field : Field - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Insert a new field on the second position: - - >>> schema.insert(1, pa.field("extra", pa.bool_())) - n_legs: int64 - extra: bool - animals: string - """ - def remove(self, i: int) -> Schema: - """ - Remove the field at index i from the schema. - - Parameters - ---------- - i : int - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Remove the second field of the schema: - - >>> schema.remove(1) - n_legs: int64 - """ - def set(self, i: int, field: Field) -> Schema: - """ - Replace a field at position i in the schema. 
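Because append/insert/remove/set all return new Schema objects, they can be chained without mutating the original; a minimal sketch of the methods stubbed here:

import pyarrow as pa

schema = pa.schema([("n_legs", pa.int64()), ("animals", pa.string())])
extended = schema.append(pa.field("extra", pa.bool_()))
reordered = extended.insert(0, pa.field("id", pa.int32()))
trimmed = reordered.remove(3)
replaced = trimmed.set(1, pa.field("legs", pa.int16()))
assert schema.names == ["n_legs", "animals"]          # original unchanged
assert replaced.names == ["id", "legs", "animals"]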
- - Parameters - ---------- - i : int - field : Field - - Returns - ------- - schema: Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Replace the second field of the schema with a new field 'extra': - - >>> schema.set(1, pa.field("replaced", pa.bool_())) - n_legs: int64 - replaced: bool - """ - @deprecated("Use 'with_metadata' instead") - def add_metadata(self, metadata: dict) -> Schema: - """ - DEPRECATED - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - """ - def with_metadata(self, metadata: dict) -> Schema: - """ - Add metadata as dict of string keys and values to Schema - - Parameters - ---------- - metadata : dict - Keys and values must be string-like / coercible to bytes - - Returns - ------- - schema : pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Add metadata to existing schema field: - - >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - """ - def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: - """ - Write Schema to Buffer as encapsulated IPC message - - Parameters - ---------- - memory_pool : MemoryPool, default None - Uses default memory pool if not specified - - Returns - ------- - serialized : Buffer - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) - - Write schema to Buffer: - - >>> schema.serialize() - - """ - def remove_metadata(self) -> Schema: - """ - Create new schema without metadata, if any - - Returns - ------- - schema : pyarrow.Schema - - Examples - -------- - >>> import pyarrow as pa - >>> schema = pa.schema( - ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], - ... metadata={"n_legs": "Number of legs per animal"}, - ... ) - >>> schema - n_legs: int64 - animals: string - -- schema metadata -- - n_legs: 'Number of legs per animal' - - Create a new schema with removing the metadata from the original: - - >>> schema.remove_metadata() - n_legs: int64 - animals: string - """ - def to_string( - self, - truncate_metadata: bool = True, - show_field_metadata: bool = True, - show_schema_metadata: bool = True, - ) -> str: - """ - Return human-readable representation of Schema - - Parameters - ---------- - truncate_metadata : boolean, default True - Limit metadata key/value display to a single line of ~80 characters - or less - show_field_metadata : boolean, default True - Display Field-level KeyValueMetadata - show_schema_metadata : boolean, default True - Display Schema-level KeyValueMetadata - - Returns - ------- - str : the formatted output - """ - def _export_to_c(self, out_ptr: int) -> None: - """ - Export to a C ArrowSchema struct, given its pointer. - - Be careful: if you don't pass the ArrowSchema struct to a consumer, - its memory will leak. This is a low-level function intended for - expert users. - """ - @classmethod - def _import_from_c(cls, in_ptr: int) -> Schema: - """ - Import Schema from a C ArrowSchema struct, given its pointer. - - This is a low-level function intended for expert users. 
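The metadata and serialization helpers annotated above round-trip as sketched below; pa.ipc.read_schema is assumed to accept the Buffer produced by serialize(), which matches its documented behaviour:

import pyarrow as pa
import pyarrow.ipc

schema = pa.schema([("n_legs", pa.int64()), ("animals", pa.string())])
annotated = schema.with_metadata({"n_legs": "Number of legs per animal"})
assert annotated.metadata == {b"n_legs": b"Number of legs per animal"}
assert annotated.remove_metadata().metadata is None

# serialize() writes an encapsulated IPC schema message that can be read back.
buf = annotated.serialize()
assert pa.ipc.read_schema(buf).equals(annotated)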
- """ - def __arrow_c_schema__(self) -> Any: - """ - Export to a ArrowSchema PyCapsule - - Unlike _export_to_c, this will not leak memory if the capsule is not used. - """ - @staticmethod - def _import_from_c_capsule(schema: Any) -> Schema: - """ - Import a Schema from a ArrowSchema PyCapsule - - Parameters - ---------- - schema : PyCapsule - A valid PyCapsule with name 'arrow_schema' containing an - ArrowSchema pointer. - """ - -def unify_schemas( - schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" -) -> Schema: - """ - Unify schemas by merging fields by name. - - The resulting schema will contain the union of fields from all schemas. - Fields with the same name will be merged. Note that two fields with - different types will fail merging by default. - - - The unified field will inherit the metadata from the schema where - that field is first defined. - - The first N fields in the schema will be ordered the same as the - N fields in the first schema. - - The resulting schema will inherit its metadata from the first input - schema. - - Parameters - ---------- - schemas : list of Schema - Schemas to merge into a single one. - promote_options : str, default default - Accepts strings "default" and "permissive". - Default: null and only null can be unified with another type. - Permissive: types are promoted to the greater common denominator. - - Returns - ------- - Schema - - Raises - ------ - ArrowInvalid : - If any input schema contains fields with duplicate names. - If Fields of the same name are not mergeable. - """ - -@overload -def field(name: SupportArrowSchema) -> Field[Any]: ... -@overload -def field( - name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None -) -> Field[_DataTypeT]: ... -def field(*args, **kwargs): - """ - Create a pyarrow.Field instance. - - Parameters - ---------- - name : str or bytes - Name of the field. - Alternatively, you can also pass an object that implements the Arrow - PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). - type : pyarrow.DataType or str - Arrow datatype of the field or a string matching one. - nullable : bool, default True - Whether the field's values are nullable. - metadata : dict, default None - Optional field metadata, the keys and values must be coercible to - bytes. - - Returns - ------- - field : pyarrow.Field - - Examples - -------- - Create an instance of pyarrow.Field: - - >>> import pyarrow as pa - >>> pa.field("key", pa.int32()) - pyarrow.Field - >>> pa.field("key", pa.int32(), nullable=False) - pyarrow.Field - - >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) - >>> field - pyarrow.Field - >>> field.metadata - {b'key': b'Something important'} - - Use the field to create a struct type: - - >>> pa.struct([field]) - StructType(struct) - - A str can also be passed for the type parameter: - - >>> pa.field("key", "int32") - pyarrow.Field - """ - -def null() -> NullType: - """ - Create instance of null type. - - Examples - -------- - Create an instance of a null type: - - >>> import pyarrow as pa - >>> pa.null() - DataType(null) - >>> print(pa.null()) - null - - Create a ``Field`` type with a null type and a name: - - >>> pa.field("null_field", pa.null()) - pyarrow.Field - """ - -def bool_() -> BoolType: - """ - Create instance of boolean type. 
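A hedged sketch of unify_schemas as annotated above; the promote_options keyword assumes a pyarrow version that supports permissive promotion:

import pyarrow as pa

s1 = pa.schema([("id", pa.int32()), ("name", pa.string())])
s2 = pa.schema([("id", pa.int64()), ("score", pa.float64())])

# "permissive" promotes compatible types (int32 + int64 -> int64) instead of failing.
unified = pa.unify_schemas([s1, s2], promote_options="permissive")
assert unified.names == ["id", "name", "score"]
assert unified.field("id").type == pa.int64()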
- - Examples - -------- - Create an instance of a boolean type: - - >>> import pyarrow as pa - >>> pa.bool_() - DataType(bool) - >>> print(pa.bool_()) - bool - - Create a ``Field`` type with a boolean type - and a name: - - >>> pa.field("bool_field", pa.bool_()) - pyarrow.Field - """ - -def uint8() -> UInt8Type: - """ - Create instance of unsigned int8 type. - - Examples - -------- - Create an instance of unsigned int8 type: - - >>> import pyarrow as pa - >>> pa.uint8() - DataType(uint8) - >>> print(pa.uint8()) - uint8 - - Create an array with unsigned int8 type: - - >>> pa.array([0, 1, 2], type=pa.uint8()) - - [ - 0, - 1, - 2 - ] - """ - -def int8() -> Int8Type: - """ - Create instance of signed int8 type. - - Examples - -------- - Create an instance of int8 type: - - >>> import pyarrow as pa - >>> pa.int8() - DataType(int8) - >>> print(pa.int8()) - int8 - - Create an array with int8 type: - - >>> pa.array([0, 1, 2], type=pa.int8()) - - [ - 0, - 1, - 2 - ] - """ - -def uint16() -> UInt16Type: - """ - Create instance of unsigned uint16 type. - - Examples - -------- - Create an instance of unsigned int16 type: - - >>> import pyarrow as pa - >>> pa.uint16() - DataType(uint16) - >>> print(pa.uint16()) - uint16 - - Create an array with unsigned int16 type: - - >>> pa.array([0, 1, 2], type=pa.uint16()) - - [ - 0, - 1, - 2 - ] - """ - -def int16() -> Int16Type: - """ - Create instance of signed int16 type. - - Examples - -------- - Create an instance of int16 type: - - >>> import pyarrow as pa - >>> pa.int16() - DataType(int16) - >>> print(pa.int16()) - int16 - - Create an array with int16 type: - - >>> pa.array([0, 1, 2], type=pa.int16()) - - [ - 0, - 1, - 2 - ] - """ - -def uint32() -> Uint32Type: - """ - Create instance of unsigned uint32 type. - - Examples - -------- - Create an instance of unsigned int32 type: - - >>> import pyarrow as pa - >>> pa.uint32() - DataType(uint32) - >>> print(pa.uint32()) - uint32 - - Create an array with unsigned int32 type: - - >>> pa.array([0, 1, 2], type=pa.uint32()) - - [ - 0, - 1, - 2 - ] - """ - -def int32() -> Int32Type: - """ - Create instance of signed int32 type. - - Examples - -------- - Create an instance of int32 type: - - >>> import pyarrow as pa - >>> pa.int32() - DataType(int32) - >>> print(pa.int32()) - int32 - - Create an array with int32 type: - - >>> pa.array([0, 1, 2], type=pa.int32()) - - [ - 0, - 1, - 2 - ] - """ - -def int64() -> Int64Type: - """ - Create instance of signed int64 type. - - Examples - -------- - Create an instance of int64 type: - - >>> import pyarrow as pa - >>> pa.int64() - DataType(int64) - >>> print(pa.int64()) - int64 - - Create an array with int64 type: - - >>> pa.array([0, 1, 2], type=pa.int64()) - - [ - 0, - 1, - 2 - ] - """ - -def uint64() -> UInt64Type: - """ - Create instance of unsigned uint64 type. 
- - Examples - -------- - Create an instance of unsigned int64 type: - - >>> import pyarrow as pa - >>> pa.uint64() - DataType(uint64) - >>> print(pa.uint64()) - uint64 - - Create an array with unsigned uint64 type: - - >>> pa.array([0, 1, 2], type=pa.uint64()) - - [ - 0, - 1, - 2 - ] - """ - -def tzinfo_to_string(tz: dt.tzinfo) -> str: - """ - Converts a time zone object into a string indicating the name of a time - zone, one of: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - tz : datetime.tzinfo - Time zone object - - Returns - ------- - name : str - Time zone name - """ - -def string_to_tzinfo(name: str) -> dt.tzinfo: - """ - Convert a time zone name into a time zone object. - - Supported input strings are: - * As used in the Olson time zone database (the "tz database" or - "tzdata"), such as "America/New_York" - * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - - Parameters - ---------- - name: str - Time zone name. - - Returns - ------- - tz : datetime.tzinfo - Time zone object - """ - -@overload -def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... -@overload -def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... -def timestamp(*args, **kwargs): - """ - Create instance of timestamp type with resolution and optional time zone. - - Parameters - ---------- - unit : str - one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' - [nanosecond] - tz : str, default None - Time zone name. None indicates time zone naive - - Examples - -------- - Create an instance of timestamp type: - - >>> import pyarrow as pa - >>> pa.timestamp("us") - TimestampType(timestamp[us]) - >>> pa.timestamp("s", tz="America/New_York") - TimestampType(timestamp[s, tz=America/New_York]) - >>> pa.timestamp("s", tz="+07:30") - TimestampType(timestamp[s, tz=+07:30]) - - Use timestamp type when creating a scalar object: - - >>> from datetime import datetime - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) - - >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) - - - Returns - ------- - timestamp_type : TimestampType - """ - -def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: - """ - Create instance of 32-bit time (time of day) type with unit resolution. - - Parameters - ---------- - unit : str - one of 's' [second], or 'ms' [millisecond] - - Returns - ------- - type : pyarrow.Time32Type - - Examples - -------- - >>> import pyarrow as pa - >>> pa.time32("s") - Time32Type(time32[s]) - >>> pa.time32("ms") - Time32Type(time32[ms]) - """ - -def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: - """ - Create instance of 64-bit time (time of day) type with unit resolution. - - Parameters - ---------- - unit : str - One of 'us' [microsecond], or 'ns' [nanosecond]. - - Returns - ------- - type : pyarrow.Time64Type - - Examples - -------- - >>> import pyarrow as pa - >>> pa.time64("us") - Time64Type(time64[us]) - >>> pa.time64("ns") - Time64Type(time64[ns]) - """ - -def duration(unit: _Unit) -> DurationType[_Unit]: - """ - Create instance of a duration type with unit resolution. - - Parameters - ---------- - unit : str - One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or - 'ns' [nanosecond]. 
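The temporal factories stubbed above carry their unit (and optional timezone) on the returned type; a small illustrative sketch:

from datetime import datetime

import pyarrow as pa

ts_ty = pa.timestamp("us", tz="UTC")
assert ts_ty.unit == "us" and ts_ty.tz == "UTC"
assert pa.time32("ms").unit == "ms"
assert pa.time64("ns").unit == "ns"
assert pa.duration("s").unit == "s"

# A timezone-aware scalar keeps both the unit and the zone.
assert pa.scalar(datetime(2012, 1, 1), type=ts_ty).type == ts_ty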
- - Returns - ------- - type : pyarrow.DurationType - - Examples - -------- - Create an instance of duration type: - - >>> import pyarrow as pa - >>> pa.duration("us") - DurationType(duration[us]) - >>> pa.duration("s") - DurationType(duration[s]) - - Create an array with duration type: - - >>> pa.array([0, 1, 2], type=pa.duration("s")) - - [ - 0, - 1, - 2 - ] - """ - -def month_day_nano_interval() -> MonthDayNanoIntervalType: - """ - Create instance of an interval type representing months, days and - nanoseconds between two dates. - - Examples - -------- - Create an instance of an month_day_nano_interval type: - - >>> import pyarrow as pa - >>> pa.month_day_nano_interval() - DataType(month_day_nano_interval) - - Create a scalar with month_day_nano_interval type: - - >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) - - """ - -def date32() -> Date32Type: - """ - Create instance of 32-bit date (days since UNIX epoch 1970-01-01). - - Examples - -------- - Create an instance of 32-bit date type: - - >>> import pyarrow as pa - >>> pa.date32() - DataType(date32[day]) - - Create a scalar with 32-bit date type: - - >>> from datetime import date - >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) - - """ - -def date64() -> Date64Type: - """ - Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). - - Examples - -------- - Create an instance of 64-bit date type: - - >>> import pyarrow as pa - >>> pa.date64() - DataType(date64[ms]) - - Create a scalar with 64-bit date type: - - >>> from datetime import datetime - >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) - - """ - -def float16() -> Float16Type: - """ - Create half-precision floating point type. - - Examples - -------- - Create an instance of float16 type: - - >>> import pyarrow as pa - >>> pa.float16() - DataType(halffloat) - >>> print(pa.float16()) - halffloat - - Create an array with float16 type: - - >>> arr = np.array([1.5, np.nan], dtype=np.float16) - >>> a = pa.array(arr, type=pa.float16()) - >>> a - - [ - 15872, - 32256 - ] - - Note that unlike other float types, if you convert this array - to a python list, the types of its elements will be ``np.float16`` - - >>> [type(val) for val in a.to_pylist()] - [, ] - """ - -def float32() -> Float32Type: - """ - Create single-precision floating point type. - - Examples - -------- - Create an instance of float32 type: - - >>> import pyarrow as pa - >>> pa.float32() - DataType(float) - >>> print(pa.float32()) - float - - Create an array with float32 type: - - >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) - - [ - 0, - 1, - 2 - ] - """ - -def float64() -> Float64Type: - """ - Create double-precision floating point type. - - Examples - -------- - Create an instance of float64 type: - - >>> import pyarrow as pa - >>> pa.float64() - DataType(double) - >>> print(pa.float64()) - double - - Create an array with float64 type: - - >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) - - [ - 0, - 1, - 2 - ] - """ - -@overload -def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... -@overload -def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... -def decimal32(*args, **kwargs): - """ - Create decimal type with precision and scale and 32-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. 
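For the date and interval factories above, a brief sketch of how their scalars behave; illustrative only, following the docstring examples:

from datetime import date

import pyarrow as pa

# date32 counts days since the UNIX epoch.
d = pa.scalar(date(2012, 1, 1), type=pa.date32())
assert d.as_py() == date(2012, 1, 1)

# month_day_nano_interval scalars accept a (months, days, nanoseconds) triple.
mdn = pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()).as_py()
assert (mdn.months, mdn.days, mdn.nanoseconds) == (1, 15, -30)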
The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - As an example, ``decimal32(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 32-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. - - ``decimal32(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 32-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 9 significant digits, consider - using ``decimal64``, ``decimal128``, or ``decimal256``. - - Parameters - ---------- - precision : int - Must be between 1 and 9 - scale : int - - Returns - ------- - decimal_type : Decimal32Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal32(5, 2) - Decimal32Type(decimal32(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal("123.45") - >>> pa.array([a], pa.decimal32(5, 2)) - - [ - 123.45 - ] - """ - -@overload -def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... -@overload -def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... -def decimal64(*args, **kwargs): - """ - Create decimal type with precision and scale and 64-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - As an example, ``decimal64(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 64-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. - - ``decimal64(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 64-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 18 significant digits, consider - using ``decimal128``, or ``decimal256``. - - Parameters - ---------- - precision : int - Must be between 1 and 18 - scale : int - - Returns - ------- - decimal_type : Decimal64Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal64(5, 2) - Decimal64Type(decimal64(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal("123.45") - >>> pa.array([a], pa.decimal64(5, 2)) - - [ - 123.45 - ] - """ - -@overload -def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... -@overload -def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... -def decimal128(*args, **kwargs): - """ - Create decimal type with precision and scale and 128-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - As an example, ``decimal128(7, 3)`` can exactly represent the numbers - 1234.567 and -1234.567 (encoded internally as the 128-bit integers - 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. 
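The precision/scale split described here can be checked directly on the type object; a sketch using decimal128, which is available in all supported pyarrow versions:

import decimal

import pyarrow as pa

# decimal128(7, 3): 7 significant digits, 3 of them after the decimal point.
dec_ty = pa.decimal128(7, 3)
assert dec_ty.precision == 7 and dec_ty.scale == 3

arr = pa.array([decimal.Decimal("1234.567"), decimal.Decimal("-1234.567")], type=dec_ty)
assert arr.type == dec_ty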
- - ``decimal128(5, -3)`` can exactly represent the number 12345000 - (encoded internally as the 128-bit integer 12345), but neither - 123450000 nor 1234500. - - If you need a precision higher than 38 significant digits, consider - using ``decimal256``. - - Parameters - ---------- - precision : int - Must be between 1 and 38 - scale : int - - Returns - ------- - decimal_type : Decimal128Type - - Examples - -------- - Create an instance of decimal type: - - >>> import pyarrow as pa - >>> pa.decimal128(5, 2) - Decimal128Type(decimal128(5, 2)) - - Create an array with decimal type: - - >>> import decimal - >>> a = decimal.Decimal("123.45") - >>> pa.array([a], pa.decimal128(5, 2)) - - [ - 123.45 - ] - """ - -@overload -def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... -@overload -def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... -def decimal256(*args, **kwargs): - """ - Create decimal type with precision and scale and 256-bit width. - - Arrow decimals are fixed-point decimal numbers encoded as a scaled - integer. The precision is the number of significant digits that the - decimal type can represent; the scale is the number of digits after - the decimal point (note the scale can be negative). - - For most use cases, the maximum precision offered by ``decimal128`` - is sufficient, and it will result in a more compact and more efficient - encoding. ``decimal256`` is useful if you need a precision higher - than 38 significant digits. - - Parameters - ---------- - precision : int - Must be between 1 and 76 - scale : int - - Returns - ------- - decimal_type : Decimal256Type - """ - -def string() -> StringType: - """ - Create UTF8 variable-length string type. - - Examples - -------- - Create an instance of a string type: - - >>> import pyarrow as pa - >>> pa.string() - DataType(string) - - and use the string type to create an array: - - >>> pa.array(["foo", "bar", "baz"], type=pa.string()) - - [ - "foo", - "bar", - "baz" - ] - """ - -utf8 = string -""" -Alias for string(). - -Examples --------- -Create an instance of a string type: - ->>> import pyarrow as pa ->>> pa.utf8() -DataType(string) - -and use the string type to create an array: - ->>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) - -[ - "foo", - "bar", - "baz" -] -""" - -@overload -def binary(length: Literal[-1] = ...) -> BinaryType: ... -@overload -def binary(length: int) -> FixedSizeBinaryType: ... -def binary(length): - """ - Create variable-length or fixed size binary type. - - Parameters - ---------- - length : int, optional, default -1 - If length == -1 then return a variable length binary type. If length is - greater than or equal to 0 then return a fixed size binary type of - width `length`. - - Examples - -------- - Create an instance of a variable-length binary type: - - >>> import pyarrow as pa - >>> pa.binary() - DataType(binary) - - and use the variable-length binary type to create an array: - - >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) - - [ - 666F6F, - 626172, - 62617A - ] - - Create an instance of a fixed-size binary type: - - >>> pa.binary(3) - FixedSizeBinaryType(fixed_size_binary[3]) - - and use the fixed-length binary type to create an array: - - >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) - - [ - 666F6F, - 626172, - 62617A - ] - """ - -def large_binary() -> LargeBinaryType: - """ - Create large variable-length binary type. - - This data type may not be supported by all Arrow implementations. 
Unless - you need to represent data larger than 2GB, you should prefer binary(). - - Examples - -------- - Create an instance of large variable-length binary type: - - >>> import pyarrow as pa - >>> pa.large_binary() - DataType(large_binary) - - and use the type to create an array: - - >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) - - [ - 666F6F, - 626172, - 62617A - ] - """ - -def large_string() -> LargeStringType: - """ - Create large UTF8 variable-length string type. - - This data type may not be supported by all Arrow implementations. Unless - you need to represent data larger than 2GB, you should prefer string(). - - Examples - -------- - Create an instance of large UTF8 variable-length binary type: - - >>> import pyarrow as pa - >>> pa.large_string() - DataType(large_string) - - and use the type to create an array: - - >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) - - [ - "foo", - "bar", - ... - "foo", - "bar" - ] - """ - -large_utf8 = large_string -""" -Alias for large_string(). - -Examples --------- -Create an instance of large UTF8 variable-length binary type: - ->>> import pyarrow as pa ->>> pa.large_utf8() -DataType(large_string) - -and use the type to create an array: - ->>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) - -[ - "foo", - "bar", - ... - "foo", - "bar" -] -""" - -def binary_view() -> BinaryViewType: - """ - Create a variable-length binary view type. - - Examples - -------- - Create an instance of a string type: - - >>> import pyarrow as pa - >>> pa.binary_view() - DataType(binary_view) - """ - -def string_view() -> StringViewType: - """ - Create UTF8 variable-length string view type. - - Examples - -------- - Create an instance of a string type: - - >>> import pyarrow as pa - >>> pa.string_view() - DataType(string_view) - """ - -@overload -def list_( - value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... -) -> ListType[_DataTypeT]: ... -@overload -def list_( - value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size -) -> FixedSizeListType[_DataTypeT, _Size]: ... -def list_(*args, **kwargs): - """ - Create ListType instance from child data type or field. - - Parameters - ---------- - value_type : DataType or Field - list_size : int, optional, default -1 - If length == -1 then return a variable length list type. If length is - greater than or equal to 0 then return a fixed size list type. - - Returns - ------- - list_type : DataType - - Examples - -------- - Create an instance of ListType: - - >>> import pyarrow as pa - >>> pa.list_(pa.string()) - ListType(list) - >>> pa.list_(pa.int32(), 2) - FixedSizeListType(fixed_size_list[2]) - - Use the ListType to create a scalar: - - >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) - - - or an array: - - >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) - - [ - [ - 1, - 2 - ], - [ - 3, - 4 - ] - ] - """ - -def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: - """ - Create LargeListType instance from child data type or field. - - This data type may not be supported by all Arrow implementations. - Unless you need to represent data larger than 2**31 elements, you should - prefer list_(). 
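The list-type family annotated above shares a value_type accessor; a hedged sketch, noting that list_view and large_list_view require a pyarrow build that implements the newer view layouts:

import pyarrow as pa

assert pa.list_(pa.string()).value_type == pa.string()
assert pa.list_(pa.int32(), 2).list_size == 2
assert pa.large_list(pa.int8()).value_type == pa.int8()
assert pa.list_view(pa.string()).value_type == pa.string()
assert pa.large_list_view(pa.int8()).value_type == pa.int8()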
- - Parameters - ---------- - value_type : DataType or Field - - Returns - ------- - list_type : DataType - - Examples - -------- - Create an instance of LargeListType: - - >>> import pyarrow as pa - >>> pa.large_list(pa.int8()) - LargeListType(large_list) - - Use the LargeListType to create an array: - - >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) - - [ - [ - -1, - 3 - ], - [ - -1, - 3 - ], - ... - """ - -def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: - """ - Create ListViewType instance from child data type or field. - - This data type may not be supported by all Arrow implementations - because it is an alternative to the ListType. - - Parameters - ---------- - value_type : DataType or Field - - Returns - ------- - list_view_type : DataType - - Examples - -------- - Create an instance of ListViewType: - - >>> import pyarrow as pa - >>> pa.list_view(pa.string()) - ListViewType(list_view) - """ - -def large_list_view( - value_type: _DataTypeT | Field[_DataTypeT], -) -> LargeListViewType[_DataTypeT]: - """ - Create LargeListViewType instance from child data type or field. - - This data type may not be supported by all Arrow implementations - because it is an alternative to the ListType. - - Parameters - ---------- - value_type : DataType or Field - - Returns - ------- - list_view_type : DataType - - Examples - -------- - Create an instance of LargeListViewType: - - >>> import pyarrow as pa - >>> pa.large_list_view(pa.int8()) - LargeListViewType(large_list_view) - """ - -@overload -def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... -@overload -def map_( - key_type: _K, item_type: _ValueT, key_sorted: _Ordered -) -> MapType[_K, _ValueT, _Ordered]: ... -def map_(*args, **kwargs): - """ - Create MapType instance from key and item data types or fields. - - Parameters - ---------- - key_type : DataType or Field - item_type : DataType or Field - keys_sorted : bool - - Returns - ------- - map_type : DataType - - Examples - -------- - Create an instance of MapType: - - >>> import pyarrow as pa - >>> pa.map_(pa.string(), pa.int32()) - MapType(map) - >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) - MapType(map) - - Use MapType to create an array: - - >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]] - >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) - - [ - keys: - [ - "a", - "b" - ] - values: - [ - 1, - 2 - ], - keys: - [ - "c" - ] - values: - [ - 3 - ] - ] - """ - -@overload -def dictionary( - index_type: _IndexT, value_type: _BasicValueT -) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... -@overload -def dictionary( - index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered -) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... -def dictionary(*args, **kwargs): - """ - Dictionary (categorical, or simply encoded) type. - - Parameters - ---------- - index_type : DataType - value_type : DataType - ordered : bool - - Returns - ------- - type : DictionaryType - - Examples - -------- - Create an instance of dictionary type: - - >>> import pyarrow as pa - >>> pa.dictionary(pa.int64(), pa.utf8()) - DictionaryType(dictionary) - - Use dictionary type to create an array: - - >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8())) - - ... 
- -- dictionary: - [ - "a", - "b", - "d" - ] - -- indices: - [ - 0, - 1, - null, - 2 - ] - """ - -def struct( - fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] - | Mapping[str, Field[Any]], -) -> StructType: - """ - Create StructType instance from fields. - - A struct is a nested type parameterized by an ordered sequence of types - (which can all be distinct), called its fields. - - Parameters - ---------- - fields : iterable of Fields or tuples, or mapping of strings to DataTypes - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - - Examples - -------- - Create an instance of StructType from an iterable of tuples: - - >>> import pyarrow as pa - >>> fields = [ - ... ("f1", pa.int32()), - ... ("f2", pa.string()), - ... ] - >>> struct_type = pa.struct(fields) - >>> struct_type - StructType(struct) - - Retrieve a field from a StructType: - - >>> struct_type[0] - pyarrow.Field - >>> struct_type["f1"] - pyarrow.Field - - Create an instance of StructType from an iterable of Fields: - - >>> fields = [ - ... pa.field("f1", pa.int32()), - ... pa.field("f2", pa.string(), nullable=False), - ... ] - >>> pa.struct(fields) - StructType(struct) - - Returns - ------- - type : DataType - """ - -def sparse_union( - child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> SparseUnionType: - """ - Create SparseUnionType from child fields. - - A sparse union is a nested type where each logical value is taken from - a single child. A buffer of 8-bit type ids indicates which child - a given logical value is to be taken from. - - In a sparse union, each child array should have the same length as the - union array, regardless of the actual number of union values that - refer to it. - - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - type_codes : list of integers, default None - - Returns - ------- - type : SparseUnionType - """ - -def dense_union( - child_fields: list[Field[Any]], type_codes: list[int] | None = None -) -> DenseUnionType: - """ - Create DenseUnionType from child fields. - - A dense union is a nested type where each logical value is taken from - a single child, at a specific offset. A buffer of 8-bit type ids - indicates which child a given logical value is to be taken from, - and a buffer of 32-bit offsets indicates at which physical position - in the given child array the logical value is to be taken from. - - Unlike a sparse union, a dense union allows encoding only the child array - values which are actually referred to by the union array. This is - counterbalanced by the additional footprint of the offsets buffer, and - the additional indirection cost when looking up values. - - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - type_codes : list of integers, default None - - Returns - ------- - type : DenseUnionType - """ - -@overload -def union( - child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None -) -> SparseUnionType: ... -@overload -def union( - child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None -) -> DenseUnionType: ... -def union(*args, **kwargs): - """ - Create UnionType from child fields. - - A union is a nested type where each logical value is taken from a - single child. 
A buffer of 8-bit type ids indicates which child - a given logical value is to be taken from. - - Unions come in two flavors: sparse and dense - (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). - - Parameters - ---------- - child_fields : sequence of Field values - Each field must have a UTF8-encoded name, and these field names are - part of the type metadata. - mode : str - Must be 'sparse' or 'dense' - type_codes : list of integers, default None - - Returns - ------- - type : UnionType - """ - -def run_end_encoded( - run_end_type: _RunEndType, value_type: _BasicValueT -) -> RunEndEncodedType[_RunEndType, _BasicValueT]: - """ - Create RunEndEncodedType from run-end and value types. - - Parameters - ---------- - run_end_type : pyarrow.DataType - The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. - value_type : pyarrow.DataType - The type of the values array. - - Returns - ------- - type : RunEndEncodedType - """ - -def json_(storage_type: DataType = ...) -> JsonType: - """ - Create instance of JSON extension type. - - Parameters - ---------- - storage_type : DataType, default pyarrow.string() - The underlying data type. Can be on of the following types: - string, large_string, string_view. - - Returns - ------- - type : JsonType - - Examples - -------- - Create an instance of JSON extension type: - - >>> import pyarrow as pa - >>> pa.json_(pa.utf8()) - JsonType(extension) - - Use the JSON type to create an array: - - >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) - - [ - "{"a": 1}", - "{"b": 2}" - ] - """ - -def uuid() -> UuidType: - """ - Create UuidType instance. - - Returns - ------- - type : UuidType - """ - -def fixed_shape_tensor( - value_type: _ValueT, - shape: Sequence[int], - dim_names: Sequence[str] | None = None, - permutation: Sequence[int] | None = None, -) -> FixedShapeTensorType[_ValueT]: - """ - Create instance of fixed shape tensor extension type with shape and optional - names of tensor dimensions and indices of the desired logical - ordering of dimensions. - - Parameters - ---------- - value_type : DataType - Data type of individual tensor elements. - shape : tuple or list of integers - The physical shape of the contained tensors. - dim_names : tuple or list of strings, default None - Explicit names to tensor dimensions. - permutation : tuple or list integers, default None - Indices of the desired ordering of the original dimensions. - The indices contain a permutation of the values ``[0, 1, .., N-1]`` where - N is the number of dimensions. The permutation indicates which dimension - of the logical layout corresponds to which dimension of the physical tensor. - For more information on this parameter see - :ref:`fixed_shape_tensor_extension`. 
- - Examples - -------- - Create an instance of fixed shape tensor extension type: - - >>> import pyarrow as pa - >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) - >>> tensor_type - FixedShapeTensorType(extension) - - Inspect the data type: - - >>> tensor_type.value_type - DataType(int32) - >>> tensor_type.shape - [2, 2] - - Create a table with fixed shape tensor extension array: - - >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] - >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) - >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) - >>> pa.table([tensor], names=["tensor_array"]) - pyarrow.Table - tensor_array: extension - ---- - tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] - - Create an instance of fixed shape tensor extension type with names - of tensor dimensions: - - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) - >>> tensor_type.dim_names - ['C', 'H', 'W'] - - Create an instance of fixed shape tensor extension type with - permutation: - - >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - - Returns - ------- - type : FixedShapeTensorType - """ - -def bool8() -> Bool8Type: - """ - Create instance of bool8 extension type. - - Examples - -------- - Create an instance of bool8 extension type: - - >>> import pyarrow as pa - >>> type = pa.bool8() - >>> type - Bool8Type(extension) - - Inspect the data type: - - >>> type.storage_type - DataType(int8) - - Create a table with a bool8 array: - - >>> arr = [-1, 0, 1, 2, None] - >>> storage = pa.array(arr, pa.int8()) - >>> other = pa.ExtensionArray.from_storage(type, storage) - >>> pa.table([other], names=["unknown_col"]) - pyarrow.Table - unknown_col: extension - ---- - unknown_col: [[-1,0,1,2,null]] - - Returns - ------- - type : Bool8Type - """ - -def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: - """ - Create instance of opaque extension type. - - Parameters - ---------- - storage_type : DataType - The underlying data type. - type_name : str - The name of the type in the external system. - vendor_name : str - The name of the external system. - - Examples - -------- - Create an instance of an opaque extension type: - - >>> import pyarrow as pa - >>> type = pa.opaque(pa.binary(), "other", "jdbc") - >>> type - OpaqueType(extension) - - Inspect the data type: - - >>> type.storage_type - DataType(binary) - >>> type.type_name - 'other' - >>> type.vendor_name - 'jdbc' - - Create a table with an opaque array: - - >>> arr = [None, b"foobar"] - >>> storage = pa.array(arr, pa.binary()) - >>> other = pa.ExtensionArray.from_storage(type, storage) - >>> pa.table([other], names=["unknown_col"]) - pyarrow.Table - unknown_col: extension - ---- - unknown_col: [[null,666F6F626172]] - - Returns - ------- - type : OpaqueType - """ - -@overload -def type_for_alias(name: Literal["null"]) -> NullType: ... -@overload -def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... -@overload -def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... -@overload -def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... -@overload -def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... -@overload -def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... -@overload -def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... -@overload -def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... 
-@overload -def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... -@overload -def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... -@overload -def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... -@overload -def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... -@overload -def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... -@overload -def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... -@overload -def type_for_alias(name: Literal["binary"]) -> BinaryType: ... -@overload -def type_for_alias( - name: Literal["large_string", "large_str", "large_utf8"], -) -> LargeStringType: ... -@overload -def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... -@overload -def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... -@overload -def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... -@overload -def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... -@overload -def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... -@overload -def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... -@overload -def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... -@overload -def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... -@overload -def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... -@overload -def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... -@overload -def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... -@overload -def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... -@overload -def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... -@overload -def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... -@overload -def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... -@overload -def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... -def type_for_alias(name): - """ - Return DataType given a string alias if one exists. - - Parameters - ---------- - name : str - The alias of the DataType that should be retrieved. - - Returns - ------- - type : DataType - """ - -@overload -def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... -@overload -def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... -@overload -def ensure_type(ty: Literal["null"]) -> NullType: ... -@overload -def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... -@overload -def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... -@overload -def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... -@overload -def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... -@overload -def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... -@overload -def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... -@overload -def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... -@overload -def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... 
-@overload -def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... -@overload -def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... -@overload -def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... -@overload -def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... -@overload -def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... -@overload -def ensure_type(ty: Literal["binary"]) -> BinaryType: ... -@overload -def ensure_type( - ty: Literal["large_string", "large_str", "large_utf8"], -) -> LargeStringType: ... -@overload -def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... -@overload -def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... -@overload -def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... -@overload -def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... -@overload -def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... -@overload -def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... -@overload -def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... -@overload -def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... -@overload -def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... -@overload -def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... -@overload -def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... -@overload -def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... -@overload -def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... -@overload -def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... -@overload -def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... -@overload -def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... -def schema( - fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], - metadata: dict[bytes | str, bytes | str] | None = None, -) -> Schema: - """ - Construct pyarrow.Schema from collection of fields. - - Parameters - ---------- - fields : iterable of Fields or tuples, or mapping of strings to DataTypes - Can also pass an object that implements the Arrow PyCapsule Protocol - for schemas (has an ``__arrow_c_schema__`` method). - metadata : dict, default None - Keys and values must be coercible to bytes. - - Examples - -------- - Create a Schema from iterable of tuples: - - >>> import pyarrow as pa - >>> pa.schema( - ... [ - ... ("some_int", pa.int32()), - ... ("some_string", pa.string()), - ... pa.field("some_required_string", pa.string(), nullable=False), - ... ] - ... ) - some_int: int32 - some_string: string - some_required_string: string not null - - Create a Schema from iterable of Fields: - - >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) - some_int: int32 - some_string: string - - DataTypes can also be passed as strings. 
The following is equivalent to the - above example: - - >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) - some_int: int32 - some_string: string - - Or more concisely: - - >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) - some_int: int32 - some_string: string - - Returns - ------- - schema : pyarrow.Schema - """ - -def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: - """ - Convert NumPy dtype to pyarrow.DataType. - - Parameters - ---------- - dtype : the numpy dtype to convert - - - Examples - -------- - Create a pyarrow DataType from NumPy dtype: - - >>> import pyarrow as pa - >>> import numpy as np - >>> pa.from_numpy_dtype(np.dtype("float16")) - DataType(halffloat) - >>> pa.from_numpy_dtype("U") - DataType(string) - >>> pa.from_numpy_dtype(bool) - DataType(bool) - >>> pa.from_numpy_dtype(np.str_) - DataType(string) - """ - -def is_boolean_value(obj: Any) -> bool: - """ - Check if the object is a boolean. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_integer_value(obj: Any) -> bool: - """ - Check if the object is an integer. - - Parameters - ---------- - obj : object - The object to check - """ - -def is_float_value(obj: Any) -> bool: - """ - Check if the object is a float. - - Parameters - ---------- - obj : object - The object to check - """ - -__all__ = [ - "_Weakrefable", - "_Metadata", - "DataType", - "_BasicDataType", - "NullType", - "BoolType", - "UInt8Type", - "Int8Type", - "UInt16Type", - "Int16Type", - "Uint32Type", - "Int32Type", - "UInt64Type", - "Int64Type", - "Float16Type", - "Float32Type", - "Float64Type", - "Date32Type", - "Date64Type", - "MonthDayNanoIntervalType", - "StringType", - "LargeStringType", - "StringViewType", - "BinaryType", - "LargeBinaryType", - "BinaryViewType", - "TimestampType", - "Time32Type", - "Time64Type", - "DurationType", - "FixedSizeBinaryType", - "Decimal32Type", - "Decimal64Type", - "Decimal128Type", - "Decimal256Type", - "ListType", - "LargeListType", - "ListViewType", - "LargeListViewType", - "FixedSizeListType", - "DictionaryMemo", - "DictionaryType", - "MapType", - "StructType", - "UnionType", - "SparseUnionType", - "DenseUnionType", - "RunEndEncodedType", - "BaseExtensionType", - "ExtensionType", - "FixedShapeTensorType", - "Bool8Type", - "UuidType", - "JsonType", - "OpaqueType", - "PyExtensionType", - "UnknownExtensionType", - "register_extension_type", - "unregister_extension_type", - "KeyValueMetadata", - "ensure_metadata", - "Field", - "Schema", - "unify_schemas", - "field", - "null", - "bool_", - "uint8", - "int8", - "uint16", - "int16", - "uint32", - "int32", - "int64", - "uint64", - "tzinfo_to_string", - "string_to_tzinfo", - "timestamp", - "time32", - "time64", - "duration", - "month_day_nano_interval", - "date32", - "date64", - "float16", - "float32", - "float64", - "decimal32", - "decimal64", - "decimal128", - "decimal256", - "string", - "utf8", - "binary", - "large_binary", - "large_string", - "large_utf8", - "binary_view", - "string_view", - "list_", - "large_list", - "list_view", - "large_list_view", - "map_", - "dictionary", - "struct", - "sparse_union", - "dense_union", - "union", - "run_end_encoded", - "json_", - "uuid", - "fixed_shape_tensor", - "bool8", - "opaque", - "type_for_alias", - "ensure_type", - "schema", - "from_numpy_dtype", - "is_boolean_value", - "is_integer_value", - "is_float_value", -] diff --git a/pyarrow-stubs/_azurefs.pyi b/pyarrow-stubs/_azurefs.pyi deleted file mode 100644 index 317943ce20f..00000000000 --- 
a/pyarrow-stubs/_azurefs.pyi +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Literal - -from ._fs import FileSystem - -class AzureFileSystem(FileSystem): - """ - Azure Blob Storage backed FileSystem implementation - - This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. - Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. Azurite emulator is - also supported. Note: `/` is the only supported delimiter. - - The storage account is considered the root of the filesystem. When enabled, containers - will be created or deleted during relevant directory operations. Obviously, this also - requires authentication with the additional permissions. - - By default `DefaultAzureCredential `__ - is used for authentication. This means it will try several types of authentication - and go with the first one that works. If any authentication parameters are provided when - initialising the FileSystem, they will be used instead of the default credential. - - Parameters - ---------- - account_name : str - Azure Blob Storage account name. This is the globally unique identifier for the - storage account. - account_key : str, default None - Account key of the storage account. If sas_token and account_key are None the - default credential will be used. The parameters account_key and sas_token are - mutually exclusive. - blob_storage_authority : str, default None - hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful - for connecting to a local emulator, like Azurite. - dfs_storage_authority : str, default None - hostname[:port] of the Data Lake Gen 2 Service. Defaults to - `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. - blob_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. - dfs_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like Azurite. - sas_token : str, default None - SAS token for the storage account, used as an alternative to account_key. If sas_token - and account_key are None the default credential will be used. The parameters - account_key and sas_token are mutually exclusive. - - Examples - -------- - >>> from pyarrow import fs - >>> azure_fs = fs.AzureFileSystem(account_name="myaccount") - >>> azurite_fs = fs.AzureFileSystem( - ... account_name="devstoreaccount1", - ... account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", - ... blob_storage_authority="127.0.0.1:10000", - ... dfs_storage_authority="127.0.0.1:10000", - ... blob_storage_scheme="http", - ... dfs_storage_scheme="http", - ... ) - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - - def __init__( - self, - account_name: str, - account_key: str | None = None, - blob_storage_authority: str | None = None, - dfs_storage_authority: str | None = None, - blob_storage_schema: Literal["http", "https"] = "https", - dfs_storage_schema: Literal["http", "https"] = "https", - sas_token: str | None = None, - ) -> None: ... 
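A minimal sketch (not part of the patch) of how the type-factory and alias helpers stubbed above behave when exercised against the real pyarrow package; all variable names below are illustrative only:

import numpy as np
import pyarrow as pa

# Parametric factories covered by the stubs above.
dec_t = pa.decimal128(5, 2)                        # Decimal128Type(decimal128(5, 2))
pairs = pa.list_(pa.int32(), 2)                    # fixed-size list, per the list_ overloads
mapping = pa.map_(pa.string(), pa.int32(), keys_sorted=True)
categories = pa.dictionary(pa.int64(), pa.utf8(), ordered=False)

# String aliases resolve through type_for_alias.
assert pa.type_for_alias("f8") == pa.float64()

# schema() accepts (name, type) tuples, Fields, or string aliases.
sch = pa.schema([("id", "int64"), ("price", dec_t), pa.field("pair", pairs)])
assert pa.from_numpy_dtype(np.dtype("float32")) == pa.float32()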
diff --git a/pyarrow-stubs/_compute.pyi b/pyarrow-stubs/_compute.pyi deleted file mode 100644 index 3d61ae42787..00000000000 --- a/pyarrow-stubs/_compute.pyi +++ /dev/null @@ -1,1721 +0,0 @@ -from typing import ( - Any, - Callable, - Iterable, - Literal, - Sequence, - TypeAlias, - TypedDict, - overload, -) - -from . import lib - -_Order: TypeAlias = Literal["ascending", "descending"] -_Placement: TypeAlias = Literal["at_start", "at_end"] - -class Kernel(lib._Weakrefable): - """ - A kernel object. - - Kernels handle the execution of a Function for a certain signature. - """ - -class Function(lib._Weakrefable): - """ - A compute function. - - A function implements a certain logical computation over a range of - possible input signatures. Each signature accepts a range of input - types and is implemented by a given Kernel. - - Functions can be of different kinds: - - * "scalar" functions apply an item-wise computation over all items - of their inputs. Each item in the output only depends on the values - of the inputs at the same position. Examples: addition, comparisons, - string predicates... - - * "vector" functions apply a collection-wise computation, such that - each item in the output may depend on the values of several items - in each input. Examples: dictionary encoding, sorting, extracting - unique values... - - * "scalar_aggregate" functions reduce the dimensionality of the inputs by - applying a reduction function. Examples: sum, min_max, mode... - - * "hash_aggregate" functions apply a reduction function to an input - subdivided by grouping criteria. They may not be directly called. - Examples: hash_sum, hash_min_max... - - * "meta" functions dispatch to other functions. - """ - @property - def arity(self) -> int: - """ - The function arity. - - If Ellipsis (i.e. `...`) is returned, the function takes a variable - number of arguments. - """ - @property - def kind( - self, - ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: - """ - The function kind. - """ - @property - def name(self) -> str: - """ - The function name. - """ - @property - def num_kernels(self) -> int: - """ - The number of kernels implementing this function. - """ - def call( - self, - args: Iterable, - options: FunctionOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - length: int | None = None, - ) -> Any: - """ - Call the function on the given arguments. - - Parameters - ---------- - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. - options : FunctionOptions, optional - Options instance for executing this function. This should have - the right concrete options type. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If - not passed, will be inferred from passed data. - """ - -class FunctionOptions(lib._Weakrefable): - def serialize(self) -> lib.Buffer: ... - @classmethod - def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... - -class FunctionRegistry(lib._Weakrefable): - def get_function(self, name: str) -> Function: - """ - Look up a function by name in the registry. - - Parameters - ---------- - name : str - The name of the function to lookup - """ - - def list_functions(self) -> list[str]: - """ - Return all function names in the registry. - """ - -class HashAggregateFunction(Function): ... -class HashAggregateKernel(Kernel): ... 
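A hedged sketch (illustrative, not part of the deleted file) of how the Function and FunctionRegistry stubs above are exercised: look a kernel-backed function up in the global registry, inspect it, and call it directly:

import pyarrow as pa
import pyarrow.compute as pc

registry = pc.function_registry()
add = registry.get_function("add")            # a scalar function with two arguments
print(add.name, add.kind, add.arity, add.num_kernels)

# Function.call accepts the same argument kinds as the pc.add wrapper.
result = add.call([pa.array([1, 2, 3]), pa.array([10, 20, 30])])
assert result.equals(pa.array([11, 22, 33]))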
-class ScalarAggregateFunction(Function): ... -class ScalarAggregateKernel(Kernel): ... -class ScalarFunction(Function): ... -class ScalarKernel(Kernel): ... -class VectorFunction(Function): ... -class VectorKernel(Kernel): ... - -# ==================== _compute.pyx Option classes ==================== -class ArraySortOptions(FunctionOptions): - """ - Options for the `array_sort_indices` function. - - Parameters - ---------- - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". - null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - """ - def __init__( - self, - order: _Order = "ascending", - null_placement: _Placement = "at_end", - ) -> None: ... - -class AssumeTimezoneOptions(FunctionOptions): - """ - Options for the `assume_timezone` function. - - Parameters - ---------- - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - """ - - def __init__( - self, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - ) -> None: ... - -class CastOptions(FunctionOptions): - """ - Options for the `cast` function. - - Parameters - ---------- - target_type : DataType, optional - The PyArrow type to cast to. - allow_int_overflow : bool, default False - Whether integer overflow is allowed when casting. - allow_time_truncate : bool, default False - Whether time precision truncation is allowed when casting. - allow_time_overflow : bool, default False - Whether date/time range overflow is allowed when casting. - allow_decimal_truncate : bool, default False - Whether decimal precision truncation is allowed when casting. - allow_float_truncate : bool, default False - Whether floating-point precision truncation is allowed when casting. - allow_invalid_utf8 : bool, default False - Whether producing invalid utf8 data is allowed when casting. - """ - - allow_int_overflow: bool - allow_time_truncate: bool - allow_time_overflow: bool - allow_decimal_truncate: bool - allow_float_truncate: bool - allow_invalid_utf8: bool - - def __init__( - self, - target_type: lib.DataType | None = None, - *, - allow_int_overflow: bool | None = None, - allow_time_truncate: bool | None = None, - allow_time_overflow: bool | None = None, - allow_decimal_truncate: bool | None = None, - allow_float_truncate: bool | None = None, - allow_invalid_utf8: bool | None = None, - ) -> None: ... - @staticmethod - def safe(target_type: lib.DataType | None = None) -> CastOptions: ... - @staticmethod - def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... - def is_safe(self) -> bool: ... - -class CountOptions(FunctionOptions): - """ - Options for the `count` function. - - Parameters - ---------- - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - """ - def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... - -class CumulativeOptions(FunctionOptions): - """ - Options for `cumulative_*` functions. 
- - - cumulative_sum - - cumulative_sum_checked - - cumulative_prod - - cumulative_prod_checked - - cumulative_max - - cumulative_min - - Parameters - ---------- - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... - -class CumulativeSumOptions(FunctionOptions): - """ - Options for `cumulative_sum` function. - - Parameters - ---------- - start : Scalar, default None - Starting value for sum computation - skip_nulls : bool, default False - When false, the first encountered null is propagated. - """ - def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... - -class DayOfWeekOptions(FunctionOptions): - """ - Options for the `day_of_week` function. - - Parameters - ---------- - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - """ - - def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... - -class DictionaryEncodeOptions(FunctionOptions): - """ - Options for dictionary encoding. - - Parameters - ---------- - null_encoding : str, default "mask" - How to encode nulls in the input. - Accepted values are "mask" (null inputs emit a null in the indices - array), "encode" (null inputs emit a non-null index pointing to - a null value in the dictionary array). - """ - def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... - -class RunEndEncodeOptions(FunctionOptions): - """ - Options for run-end encoding. - - Parameters - ---------- - run_end_type : DataType, default pyarrow.int32() - The data type of the run_ends array. - - Accepted values are pyarrow.{int16(), int32(), int64()}. - """ - # TODO: default is DataType(int32) - def __init__(self, run_end_type: lib.DataType = ...) -> None: ... - -class ElementWiseAggregateOptions(FunctionOptions): - """ - Options for element-wise aggregate functions. - - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - """ - def __init__(self, *, skip_nulls: bool = True) -> None: ... - -class ExtractRegexOptions(FunctionOptions): - """ - Options for the `extract_regex` function. - - Parameters - ---------- - pattern : str - Regular expression with named capture fields. - """ - def __init__(self, pattern: str) -> None: ... - -class ExtractRegexSpanOptions(FunctionOptions): - """ - Options for the `extract_regex_span` function. - - Parameters - ---------- - pattern : str - Regular expression with named capture fields. - """ - def __init__(self, pattern: str) -> None: ... - -class FilterOptions(FunctionOptions): - """ - Options for selecting with a boolean filter. - - Parameters - ---------- - null_selection_behavior : str, default "drop" - How to handle nulls in the selection filter. - Accepted values are "drop", "emit_null". - """ - - def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... - -class IndexOptions(FunctionOptions): - """ - Options for the `index` function. 
- - Parameters - ---------- - value : Scalar - The value to search for. - """ - def __init__(self, value: lib.Scalar) -> None: ... - -class JoinOptions(FunctionOptions): - """ - Options for the `binary_join_element_wise` function. - - Parameters - ---------- - null_handling : str, default "emit_null" - How to handle null values in the inputs. - Accepted values are "emit_null", "skip", "replace". - null_replacement : str, default "" - Replacement string to emit for null inputs if `null_handling` - is "replace". - """ - @overload - def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... - @overload - def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... - -class ListSliceOptions(FunctionOptions): - """ - Options for list array slicing. - - Parameters - ---------- - start : int - Index to start slicing inner list elements (inclusive). - stop : Optional[int], default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. (NotImplemented) - step : int, default 1 - Slice step. - return_fixed_size_list : Optional[bool], default None - Whether to return a FixedSizeListArray. If true _and_ stop is after - a list element's length, nulls will be appended to create the - requested slice size. The default of `None` will return the same - type which was passed in. - """ - def __init__( - self, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - ) -> None: ... - -class ListFlattenOptions(FunctionOptions): - """ - Options for `list_flatten` function - - Parameters - ---------- - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - """ - def __init__(self, recursive: bool = False) -> None: ... - -class MakeStructOptions(FunctionOptions): - """ - Options for the `make_struct` function. - - Parameters - ---------- - field_names : sequence of str - Names of the struct fields to create. - field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - """ - def __init__( - self, - field_names: Sequence[str] = (), - *, - field_nullability: Sequence[bool] | None = None, - field_metadata: Sequence[lib.KeyValueMetadata] | None = None, - ) -> None: ... - -class MapLookupOptions(FunctionOptions): - """ - Options for the `map_lookup` function. - - Parameters - ---------- - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - """ - # TODO: query_key: Scalar or Object can be converted to Scalar - def __init__( - self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] - ) -> None: ... - -class MatchSubstringOptions(FunctionOptions): - """ - Options for looking for a substring. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - """ - - def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... - -class ModeOptions(FunctionOptions): - """ - Options for the `mode` function. - - Parameters - ---------- - n : int, default 1 - Number of distinct most-common values to return. 
- skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... - -class NullOptions(FunctionOptions): - """ - Options for the `is_null` function. - - Parameters - ---------- - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - """ - def __init__(self, *, nan_is_null: bool = False) -> None: ... - -class PadOptions(FunctionOptions): - """ - Options for padding strings. - - Parameters - ---------- - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - """ - def __init__( - self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True - ) -> None: ... - -class PairwiseOptions(FunctionOptions): - """ - Options for `pairwise` functions. - - Parameters - ---------- - period : int, default 1 - Period for applying the period function. - """ - def __init__(self, period: int = 1) -> None: ... - -class PartitionNthOptions(FunctionOptions): - """ - Options for the `partition_nth_indices` function. - - Parameters - ---------- - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - """ - def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... - -class WinsorizeOptions(FunctionOptions): - """ - Options for the `winsorize` function. - - Parameters - ---------- - lower_limit : float, between 0 and 1 - The quantile below which all values are replaced with the quantile's value. - upper_limit : float, between 0 and 1 - The quantile above which all values are replaced with the quantile's value. - """ - def __init__(self, lower_limit: float, upper_limit: float) -> None: ... - -class QuantileOptions(FunctionOptions): - """ - Options for the `quantile` function. - - Parameters - ---------- - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. - Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. 
- """ - def __init__( - self, - q: float | Sequence[float], - *, - interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", - skip_nulls: bool = True, - min_count: int = 0, - ) -> None: ... - -class RandomOptions(FunctionOptions): - """ - Options for random generation. - - Parameters - ---------- - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - """ - def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... - -class RankOptions(FunctionOptions): - """ - Options for the `rank` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. - - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. - """ - def __init__( - self, - sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", - *, - null_placement: _Placement = "at_end", - tiebreaker: Literal["min", "max", "first", "dense"] = "first", - ) -> None: ... - -class RankQuantileOptions(FunctionOptions): - """ - Options for the `rank_quantile` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. - null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - """ - - def __init__( - self, - sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", - *, - null_placement: _Placement = "at_end", - ) -> None: ... - -class PivotWiderOptions(FunctionOptions): - """ - Options for the `pivot_wider` function. - - Parameters - ---------- - key_names : sequence of str - The pivot key names expected in the pivot key column. - For each entry in `key_names`, a column with the same name is emitted - in the struct output. - unexpected_key_behavior : str, default "ignore" - The behavior when pivot keys not in `key_names` are encountered. - Accepted values are "ignore", "raise". - If "ignore", unexpected keys are silently ignored. - If "raise", unexpected keys raise a KeyError. 
- """ - def __init__( - self, - key_names: Sequence[str], - *, - unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", - ) -> None: ... - -class ReplaceSliceOptions(FunctionOptions): - """ - Options for replacing slices. - - Parameters - ---------- - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - """ - def __init__(self, start: int, stop: int, replacement: str) -> None: ... - -class ReplaceSubstringOptions(FunctionOptions): - """ - Options for replacing matched substrings. - - Parameters - ---------- - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - """ - def __init__( - self, pattern: str, replacement: str, *, max_replacements: int | None = None - ) -> None: ... - -_RoundMode: TypeAlias = Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", -] - -class RoundBinaryOptions(FunctionOptions): - """ - Options for rounding numbers when ndigits is provided by a second array - - Parameters - ---------- - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ - def __init__( - self, - round_mode: _RoundMode = "half_to_even", - ) -> None: ... - -class RoundOptions(FunctionOptions): - """ - Options for rounding numbers. - - Parameters - ---------- - ndigits : int, default 0 - Number of fractional digits to round to. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ - def __init__( - self, - ndigits: int = 0, - round_mode: _RoundMode = "half_to_even", - ) -> None: ... - -_DateTimeUint: TypeAlias = Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", -] - -class RoundTemporalOptions(FunctionOptions): - """ - Options for rounding temporal values. - - Parameters - ---------- - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. 
- - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - """ - def __init__( - self, - multiple: int = 1, - unit: _DateTimeUint = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - ) -> None: ... - -class RoundToMultipleOptions(FunctionOptions): - """ - Options for rounding numbers to a multiple. - - Parameters - ---------- - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - """ - def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... - -class ScalarAggregateOptions(FunctionOptions): - """ - Options for scalar aggregations. - - Parameters - ---------- - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - """ - def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... - -class SelectKOptions(FunctionOptions): - """ - Options for top/bottom k-selection. - - Parameters - ---------- - k : int - Number of leading values to select in sorted order - (i.e. the largest values if sort order is "descending", - the smallest otherwise). - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - """ - - def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... - -class SetLookupOptions(FunctionOptions): - """ - Options for the `is_in` and `index_in` functions. - - Parameters - ---------- - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - """ - def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... - -class SliceOptions(FunctionOptions): - """ - Options for slicing. - - Parameters - ---------- - start : int - Index to start slicing at (inclusive). 
- stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - """ - - def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... - -class SortOptions(FunctionOptions): - """ - Options for the `sort_indices` function. - - Parameters - ---------- - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - null_placement : str, default "at_end" - Where nulls in input should be sorted, only applying to - columns/fields mentioned in `sort_keys`. - Accepted values are "at_start", "at_end". - """ - def __init__( - self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" - ) -> None: ... - -class SplitOptions(FunctionOptions): - """ - Options for splitting on whitespace. - - Parameters - ---------- - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - """ - - def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... - -class SplitPatternOptions(FunctionOptions): - """ - Options for splitting on a string pattern. - - Parameters - ---------- - pattern : str - String pattern to split on. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - """ - def __init__( - self, pattern: str, *, max_splits: int | None = None, reverse: bool = False - ) -> None: ... - -class StrftimeOptions(FunctionOptions): - """ - Options for the `strftime` function. - - Parameters - ---------- - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - """ - def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... - -class StrptimeOptions(FunctionOptions): - """ - Options for the `strptime` function. - - Parameters - ---------- - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - """ - def __init__( - self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False - ) -> None: ... - -class StructFieldOptions(FunctionOptions): - """ - Options for the `struct_field` function. - - Parameters - ---------- - indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - """ - def __init__( - self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int - ) -> None: ... 
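A hedged sketch (illustrative only) of how the FunctionOptions subclasses stubbed above are passed to compute kernels, here through call_function with the options classes named in the stubs:

import pyarrow as pa
import pyarrow.compute as pc

# Parse strings to timestamps with StrptimeOptions.
strings = pa.array(["2024-01-02", "2024-03-04"])
ts = pc.call_function("strptime", [strings], pc.StrptimeOptions("%Y-%m-%d", unit="s"))

# Sort a table with SortOptions.
tbl = pa.table({"x": [3, 1, 2]})
order = pc.call_function("sort_indices", [tbl], pc.SortOptions(sort_keys=[("x", "descending")]))

# CastOptions.safe() leaves every allow_* flag False.
narrowed = pc.call_function("cast", [pa.array([1, 2, 3], pa.int64())], pc.CastOptions.safe(pa.int32()))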
-
-class TakeOptions(FunctionOptions):
- """
- Options for the `take` and `array_take` functions.
-
- Parameters
- ----------
- boundscheck : boolean, default True
- Whether to check indices are within bounds. If False and an
- index is out of bounds, behavior is undefined (the process
- may crash).
- """
- def __init__(self, boundscheck: bool = True) -> None: ...
-
-class TDigestOptions(FunctionOptions):
- """
- Options for the `tdigest` function.
-
- Parameters
- ----------
- q : double or sequence of double, default 0.5
- Probability levels of the quantiles to approximate. All values must be
- in [0, 1].
- delta : int, default 100
- Compression parameter for the T-digest algorithm.
- buffer_size : int, default 500
- Buffer size for the T-digest algorithm.
- skip_nulls : bool, default True
- Whether to skip (ignore) nulls in the input.
- If False, any null in the input forces the output to null.
- min_count : int, default 0
- Minimum number of non-null values in the input. If the number
- of non-null values is below `min_count`, the output is null.
- """
- def __init__(
- self,
- q: float | Sequence[float] = 0.5,
- *,
- delta: int = 100,
- buffer_size: int = 500,
- skip_nulls: bool = True,
- min_count: int = 0,
- ) -> None: ...
-
-class TrimOptions(FunctionOptions):
- """
- Options for trimming characters from strings.
-
- Parameters
- ----------
- characters : str
- Individual characters to be trimmed from the string.
- """
- def __init__(self, characters: str) -> None: ...
-
-class Utf8NormalizeOptions(FunctionOptions):
- """
- Options for the `utf8_normalize` function.
-
- Parameters
- ----------
- form : str
- Unicode normalization form.
- Accepted values are "NFC", "NFKC", "NFD", "NFKD".
- """
-
- def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ...
-
-class VarianceOptions(FunctionOptions):
- """
- Options for the `variance` and `stddev` functions.
-
- Parameters
- ----------
- ddof : int, default 0
- Number of degrees of freedom.
- skip_nulls : bool, default True
- Whether to skip (ignore) nulls in the input.
- If False, any null in the input forces the output to null.
- min_count : int, default 0
- Minimum number of non-null values in the input. If the number
- of non-null values is below `min_count`, the output is null.
- """
- def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ...
-
-class SkewOptions(FunctionOptions):
- """
- Options for the `skew` and `kurtosis` functions.
-
- Parameters
- ----------
- skip_nulls : bool, default True
- Whether to skip (ignore) nulls in the input.
- If False, any null in the input forces the output to null.
- biased : bool, default True
- Whether the calculated value is biased.
- If False, the value computed includes a correction factor to reduce bias.
- min_count : int, default 0
- Minimum number of non-null values in the input. If the number
- of non-null values is below `min_count`, the output is null.
- """
- def __init__(
- self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0
- ) -> None: ...
-
-class WeekOptions(FunctionOptions):
- """
- Options for the `week` function.
-
- Parameters
- ----------
- week_starts_monday : bool, default True
- If True, weeks start on Monday; if False, on Sunday.
- count_from_zero : bool, default False
- If True, dates at the start of a year that fall into the last week
- of the previous year emit 0.
- If False, they emit 52 or 53 (the week number of the last week
- of the previous year).
- first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - """ - def __init__( - self, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - ) -> None: ... - -# ==================== _compute.pyx Functions ==================== - -def call_function( - name: str, - args: list, - options: FunctionOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - length: int | None = None, -) -> Any: - """ - Call a named function. - - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to call. - args : list - The arguments to the function. - options : optional - options provided to the function. - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - length : int, optional - Batch size for execution, for nullary (no argument) functions. If not - passed, inferred from data. - """ - -def function_registry() -> FunctionRegistry: ... -def get_function(name: str) -> Function: - """ - Get a function by name. - - The function is looked up in the global registry - (as returned by `function_registry()`). - - Parameters - ---------- - name : str - The name of the function to lookup - """ - -def list_functions() -> list[str]: - """ - Return all function names in the global registry. - """ - -# ==================== _compute.pyx Udf ==================== - -def call_tabular_function( - function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None -) -> lib.RecordBatchReader: - """ - Get a record batch iterator from a tabular function. - - Parameters - ---------- - function_name : str - Name of the function. - args : iterable - The arguments to pass to the function. Accepted types depend - on the specific function. Currently, only an empty args is supported. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - """ - -class _FunctionDoc(TypedDict): - summary: str - description: str - -def register_scalar_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined scalar function. - - This API is EXPERIMENTAL. - - A scalar function is a function that executes elementwise - operations on arrays or scalars, i.e. a scalar function must - be computed row-by-row with no state where each output row - is computed only from its corresponding input row. - In other words, all argument arrays have the same length, - and the output array is of the same length as the arguments. - Scalar functions are the only functions allowed in query engine - expressions. - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. 
- function_name : str
- Name of the function. There should only be one function
- registered with this name in the function registry.
- function_doc : dict
- A dictionary object with keys "summary" (str),
- and "description" (str).
- in_types : Dict[str, DataType]
- A dictionary mapping function argument names to
- their respective DataType.
- The argument names will be used to generate
- documentation for the function. The number of
- arguments specified here determines the function
- arity.
- out_type : DataType
- Output type of the function.
- func_registry : FunctionRegistry
- Optional function registry to use instead of the default global one.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> import pyarrow.compute as pc
- >>>
- >>> func_doc = {}
- >>> func_doc["summary"] = "simple udf"
- >>> func_doc["description"] = "add a constant to a scalar"
- >>>
- >>> def add_constant(ctx, array):
- ... return pc.add(array, 1, memory_pool=ctx.memory_pool)
- >>>
- >>> func_name = "py_add_func"
- >>> in_types = {"array": pa.int64()}
- >>> out_type = pa.int64()
- >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type)
- >>>
- >>> func = pc.get_function(func_name)
- >>> func.name
- 'py_add_func'
- >>> answer = pc.call_function(func_name, [pa.array([20])])
- >>> answer
-
- [
- 21
- ]
- """
-
-def register_tabular_function(
- func: Callable,
- function_name: str,
- function_doc: _FunctionDoc,
- in_types: dict[str, lib.DataType],
- out_type: lib.DataType,
- func_registry: FunctionRegistry | None = None,
-) -> None:
- """
- Register a user-defined tabular function.
-
- This API is EXPERIMENTAL.
-
- A tabular function is one accepting a context argument of type
- UdfContext and returning a generator of struct arrays.
- The in_types argument must be empty and the out_type argument
- specifies a schema. Each struct array must have field types
- corresponding to the schema.
-
- Parameters
- ----------
- func : callable
- A callable implementing the user-defined function.
- The only argument is the context argument of type
- UdfContext. It must return a callable that
- returns on each invocation a StructArray matching
- the out_type, where an empty array indicates end.
- function_name : str
- Name of the function. There should only be one function
- registered with this name in the function registry.
- function_doc : dict
- A dictionary object with keys "summary" (str),
- and "description" (str).
- in_types : Dict[str, DataType]
- Must be an empty dictionary (reserved for future use).
- out_type : Union[Schema, DataType]
- Schema of the function's output, or a corresponding flat struct type.
- func_registry : FunctionRegistry
- Optional function registry to use instead of the default global one.
- """
-
-def register_aggregate_function(
- func: Callable,
- function_name: str,
- function_doc: _FunctionDoc,
- in_types: dict[str, lib.DataType],
- out_type: lib.DataType,
- func_registry: FunctionRegistry | None = None,
-) -> None:
- """
- Register a user-defined non-decomposable aggregate function.
-
- This API is EXPERIMENTAL.
-
- A non-decomposable aggregation function is a function that executes
- aggregate operations on the whole data that it is aggregating.
- In other words, a non-decomposable aggregate function cannot be
- split into consume/merge/finalize steps.
-
- This is often used with ordered or segmented aggregation where groups
- can be emitted before accumulating all of the input data.
- - Note that currently the size of any input column cannot exceed 2 GB - for a single segment (all groups combined). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return a Scalar matching the - out_type. - To define a varargs function, pass a callable that takes - *args. The in_type needs to match in type of inputs when - the function gets called. - function_name : str - Name of the function. This name must be unique, i.e., - there should only be one function registered with - this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). - in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import numpy as np - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "simple median udf" - >>> func_doc["description"] = "compute median" - >>> - >>> def compute_median(ctx, array): - ... return pa.scalar(np.median(array)) - >>> - >>> func_name = "py_compute_median" - >>> in_types = {"array": pa.int64()} - >>> out_type = pa.float64() - >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) - >>> - >>> func = pc.get_function(func_name) - >>> func.name - 'py_compute_median' - >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) - >>> answer - - >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) - >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) - >>> result - pyarrow.Table - k: int64 - v_py_compute_median: double - ---- - k: [[1,2]] - v_py_compute_median: [[15,35]] - """ - -def register_vector_function( - func: Callable, - function_name: str, - function_doc: _FunctionDoc, - in_types: dict[str, lib.DataType], - out_type: lib.DataType, - func_registry: FunctionRegistry | None = None, -) -> None: - """ - Register a user-defined vector function. - - This API is EXPERIMENTAL. - - A vector function is a function that executes vector - operations on arrays. Vector function is often used - when compute doesn't fit other more specific types of - functions (e.g., scalar and aggregate). - - Parameters - ---------- - func : callable - A callable implementing the user-defined function. - The first argument is the context argument of type - UdfContext. - Then, it must take arguments equal to the number of - in_types defined. It must return an Array or Scalar - matching the out_type. It must return a Scalar if - all arguments are scalar, else it must return an Array. - - To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs - arguments. - function_name : str - Name of the function. There should only be one function - registered with this name in the function registry. - function_doc : dict - A dictionary object with keys "summary" (str), - and "description" (str). 
- in_types : Dict[str, DataType] - A dictionary mapping function argument names to - their respective DataType. - The argument names will be used to generate - documentation for the function. The number of - arguments specified here determines the function - arity. - out_type : DataType - Output type of the function. - func_registry : FunctionRegistry - Optional function registry to use instead of the default global one. - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> - >>> func_doc = {} - >>> func_doc["summary"] = "percent rank" - >>> func_doc["description"] = "compute percent rank" - >>> - >>> def list_flatten_udf(ctx, x): - ... return pc.list_flatten(x) - >>> - >>> func_name = "list_flatten_udf" - >>> in_types = {"array": pa.list_(pa.int64())} - >>> out_type = pa.int64() - >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type) - >>> - >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) - >>> answer - - [ - 1, - 2, - 3, - 4 - ] - """ - -class UdfContext: - """ - Per-invocation function context/state. - - This object will always be the first argument to a user-defined - function. It should not be used outside of a call to the function. - """ - - @property - def batch_length(self) -> int: - """ - The common length of all input arguments (int). - - In the case that all arguments are scalars, this value - is used to pass the "actual length" of the arguments, - e.g. because the scalar values are encoding a column - with a constant value. - """ - @property - def memory_pool(self) -> lib.MemoryPool: - """ - A memory pool for allocations (:class:`MemoryPool`). - - This is the memory pool supplied by the user when they invoked - the function and it should be used in any calls to arrow that the - UDF makes if that call accepts a memory_pool. - """ - -# ==================== _compute.pyx Expression ==================== -class Expression(lib._Weakrefable): - """ - A logical expression to be evaluated against some input. - - To create an expression: - - - Use the factory function ``pyarrow.compute.scalar()`` to create a - scalar (not necessary when combined, see example below). - - Use the factory function ``pyarrow.compute.field()`` to reference - a field (column in table). - - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. - - Combine expressions using python operators ``&`` (logical and), - ``|`` (logical or) and ``~`` (logical not). - Note: python keywords ``and``, ``or`` and ``not`` cannot be used - to combine expressions. - - Create expression predicates using Expression methods such as - ``pyarrow.compute.Expression.isin()``. - - Examples - -------- - - >>> import pyarrow.compute as pc - >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7) - 7))> - >>> pc.field("a") != 3 - - >>> pc.field("a").isin([1, 2, 3]) - - """ - - @staticmethod - def from_substrait(buffer: bytes | lib.Buffer) -> Expression: - """ - Deserialize an expression from Substrait - - The serialized message must be an ExtendedExpression message that has - only a single expression. The name of the expression and the schema - the expression was bound to will be ignored. Use - pyarrow.substrait.deserialize_expressions if this information is needed - or if the message might contain multiple expressions. 
- - Parameters - ---------- - message : bytes or Buffer or a protobuf Message - The Substrait message to deserialize - - Returns - ------- - Expression - The deserialized expression - """ - def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: - """ - Serialize the expression using Substrait - - The expression will be serialized as an ExtendedExpression message that has a - single expression named "expression" - - Parameters - ---------- - schema : Schema - The input schema the expression will be bound to - allow_arrow_extensions : bool, default False - If False then only functions that are part of the core Substrait function - definitions will be allowed. Set this to True to allow pyarrow-specific functions - but the result may not be accepted by other compute libraries. - - Returns - ------- - Buffer - A buffer containing the serialized Protobuf plan. - """ - def __invert__(self) -> Expression: ... - def __and__(self, other) -> Expression: ... - def __or__(self, other) -> Expression: ... - def __add__(self, other) -> Expression: ... - def __mul__(self, other) -> Expression: ... - def __sub__(self, other) -> Expression: ... - def __eq__(self, value: object) -> Expression: ... # type: ignore[override] - def __ne__(self, value: object) -> Expression: ... # type: ignore[override] - def __gt__(self, value: object) -> Expression: ... # type: ignore[override] - def __lt__(self, value: object) -> Expression: ... # type: ignore[override] - def __ge__(self, value: object) -> Expression: ... # type: ignore[override] - def __le__(self, value: object) -> Expression: ... # type: ignore[override] - def __truediv__(self, other) -> Expression: ... - def is_valid(self) -> bool: - """ - Check whether the expression is not-null (valid). - - This creates a new expression equivalent to calling the - `is_valid` compute function on this expression. - - Returns - ------- - is_valid : Expression - """ - def is_null(self, nan_is_null: bool = False) -> Expression: - """ - Check whether the expression is null. - - This creates a new expression equivalent to calling the - `is_null` compute function on this expression. - - Parameters - ---------- - nan_is_null : boolean, default False - Whether floating-point NaNs are considered null. - - Returns - ------- - is_null : Expression - """ - def is_nan(self) -> Expression: - """ - Check whether the expression is NaN. - - This creates a new expression equivalent to calling the - `is_nan` compute function on this expression. - - Returns - ------- - is_nan : Expression - """ - def cast( - self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None - ) -> Expression: - """ - Explicitly set or change the expression's data type. - - This creates a new expression equivalent to calling the - `cast` compute function on this expression. - - Parameters - ---------- - type : DataType, default None - Type to cast array to. - safe : boolean, default True - Whether to check for conversion errors such as overflow. - options : CastOptions, default None - Additional checks pass by CastOptions - - Returns - ------- - cast : Expression - """ - def isin(self, values: lib.Array | Iterable) -> Expression: - """ - Check whether the expression is contained in values. - - This creates a new expression equivalent to calling the - `is_in` compute function on this expression. - - Parameters - ---------- - values : Array or iterable - The values to check for. 
-
- Returns
- -------
- isin : Expression
- A new expression that, when evaluated, checks whether
- this expression's value is contained in `values`.
- """
-
-# ==================== _compute.py ====================
diff --git a/pyarrow-stubs/_csv.pyi b/pyarrow-stubs/_csv.pyi
deleted file mode 100644
index 2f49f8c9a6c..00000000000
--- a/pyarrow-stubs/_csv.pyi
+++ /dev/null
@@ -1,641 +0,0 @@
-from dataclasses import dataclass, field
-from typing import IO, Any, Callable, Literal
-
-from _typeshed import StrPath
-
-from . import lib
-
-@dataclass(kw_only=True)
-class ReadOptions(lib._Weakrefable):
- """
- Options for reading CSV files.
-
- Parameters
- ----------
- use_threads : bool, optional (default True)
- Whether to use multiple threads to accelerate reading
- block_size : int, optional
- How many bytes to process at a time from the input stream.
- This will determine multi-threading granularity as well as
- the size of individual record batches or table chunks.
- Minimum valid value for block size is 1
- skip_rows : int, optional (default 0)
- The number of rows to skip before the column names (if any)
- and the CSV data.
- skip_rows_after_names : int, optional (default 0)
- The number of rows to skip after the column names.
- This number can be larger than the number of rows in one
- block, and empty rows are counted.
- The order of application is as follows:
- - `skip_rows` is applied (if non-zero);
- - column names are read (unless `column_names` is set);
- - `skip_rows_after_names` is applied (if non-zero).
- column_names : list, optional
- The column names of the target table. If empty, fall back on
- `autogenerate_column_names`.
- autogenerate_column_names : bool, optional (default False)
- Whether to autogenerate column names if `column_names` is empty.
- If true, column names will be of the form "f0", "f1"...
- If false, column names will be read from the first CSV row
- after `skip_rows`.
- encoding : str, optional (default 'utf8')
- The character encoding of the CSV data. Columns that cannot
- decode using this encoding can still be read as Binary.
- - Examples - -------- - - Defining an example data: - - >>> import io - >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" - >>> print(s) - 1,2,3 - Flamingo,2,2022-03-01 - Horse,4,2022-03-02 - Brittle stars,5,2022-03-03 - Centipede,100,2022-03-04 - - Ignore the first numbered row and substitute it with defined - or autogenerated column names: - - >>> from pyarrow import csv - >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - f0: string - f1: int64 - f2: date32[day] - ---- - f0: [["Flamingo","Horse","Brittle stars","Centipede"]] - f1: [[2,4,5,100]] - f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - - Remove the first 2 rows of the data: - - >>> read_options = csv.ReadOptions(skip_rows_after_names=2) - >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) - pyarrow.Table - 1: string - 2: int64 - 3: date32[day] - ---- - 1: [["Brittle stars","Centipede"]] - 2: [[5,100]] - 3: [[2022-03-03,2022-03-04]] - """ - - use_threads: bool = field(default=True, kw_only=False) - block_size: int | None = None - skip_rows: int = 0 - skip_rows_after_names: int = 0 - column_names: list[str] | None = None - autogenerate_column_names: bool = False - encoding: str = "utf8" - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class ParseOptions(lib._Weakrefable): - """ - Options for parsing CSV files. - - Parameters - ---------- - delimiter : 1-character string, optional (default ',') - The character delimiting individual cells in the CSV data. - quote_char : 1-character string or False, optional (default '"') - The character used optionally for quoting CSV values - (False if quoting is not allowed). - double_quote : bool, optional (default True) - Whether two quotes in a quoted CSV value denote a single quote - in the data. - escape_char : 1-character string or False, optional (default False) - The character used optionally for escaping special characters - (False if escaping is not allowed). - newlines_in_values : bool, optional (default False) - Whether newline characters are allowed in CSV values. - Setting this to True reduces the performance of multi-threaded - CSV reading. - ignore_empty_lines : bool, optional (default True) - Whether empty lines are ignored in CSV input. - If False, an empty line is interpreted as containing a single empty - value (assuming a one-column CSV file). - invalid_row_handler : callable, optional (default None) - If not None, this object is called for each CSV row that fails - parsing (because of a mismatching number of columns). - It should accept a single InvalidRow argument and return either - "skip" or "error" depending on the desired outcome. - - Examples - -------- - - Defining an example file from bytes object: - - >>> import io - >>> s = ( - ... "animals;n_legs;entry\\n" - ... "Flamingo;2;2022-03-01\\n" - ... "# Comment here:\\n" - ... "Horse;4;2022-03-02\\n" - ... "Brittle stars;5;2022-03-03\\n" - ... "Centipede;100;2022-03-04" - ... 
) - >>> print(s) - animals;n_legs;entry - Flamingo;2;2022-03-01 - # Comment here: - Horse;4;2022-03-02 - Brittle stars;5;2022-03-03 - Centipede;100;2022-03-04 - >>> source = io.BytesIO(s.encode()) - - Read the data from a file skipping rows with comments - and defining the delimiter: - - >>> from pyarrow import csv - >>> def skip_comment(row): - ... if row.text.startswith("# "): - ... return "skip" - ... else: - ... return "error" - >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) - >>> csv.read_csv(source, parse_options=parse_options) - pyarrow.Table - animals: string - n_legs: int64 - entry: date32[day] - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede"]] - n_legs: [[2,4,5,100]] - entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] - """ - - delimiter: str = field(default=",", kw_only=False) - quote_char: str | Literal[False] = '"' - double_quote: bool = True - escape_char: str | Literal[False] = False - newlines_in_values: bool = False - ignore_empty_lines: bool = True - invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class ConvertOptions(lib._Weakrefable): - """ - Options for converting CSV data. - - Parameters - ---------- - check_utf8 : bool, optional (default True) - Whether to check UTF8 validity of string columns. - column_types : pyarrow.Schema or dict, optional - Explicitly map column names to column types. Passing this argument - disables type inference on the defined columns. - null_values : list, optional - A sequence of strings that denote nulls in the data - (defaults are appropriate in most cases). Note that by default, - string columns are not checked for null values. To enable - null checking for those, specify ``strings_can_be_null=True``. - true_values : list, optional - A sequence of strings that denote true booleans in the data - (defaults are appropriate in most cases). - false_values : list, optional - A sequence of strings that denote false booleans in the data - (defaults are appropriate in most cases). - decimal_point : 1-character string, optional (default '.') - The character used as decimal point in floating-point and decimal - data. - strings_can_be_null : bool, optional (default False) - Whether string / binary columns can have null values. - If true, then strings in null_values are considered null for - string columns. - If false, then all strings are valid string values. - quoted_strings_can_be_null : bool, optional (default True) - Whether quoted values can be null. - If true, then strings in "null_values" are also considered null - when they appear quoted in the CSV file. Otherwise, quoted values - are never considered null. - include_columns : list, optional - The names of columns to include in the Table. - If empty, the Table will include all columns from the CSV file. - If not empty, only these columns will be included, in this order. - include_missing_columns : bool, optional (default False) - If false, columns in `include_columns` but not in the CSV file will - error out. - If true, columns in `include_columns` but not in the CSV file will - produce a column of nulls (whose type is selected using - `column_types`, or null by default). - This option is ignored if `include_columns` is empty. - auto_dict_encode : bool, optional (default False) - Whether to try to automatically dict-encode string / binary data. 
- If true, then when type inference detects a string or binary column,
- it is dict-encoded up to `auto_dict_max_cardinality` distinct values
- (per chunk), after which it switches to regular encoding.
- This setting is ignored for non-inferred columns (those in
- `column_types`).
- auto_dict_max_cardinality : int, optional
- The maximum dictionary cardinality for `auto_dict_encode`.
- This value is per chunk.
- timestamp_parsers : list, optional
- A sequence of strptime()-compatible format strings, tried in order
- when attempting to infer or convert timestamp values (the special
- value ISO8601() can also be given). By default, a fast built-in
- ISO-8601 parser is used.
-
- Examples
- --------
-
- Defining an example data:
-
- >>> import io
- >>> s = (
- ... "animals,n_legs,entry,fast\\n"
- ... "Flamingo,2,01/03/2022,Yes\\n"
- ... "Horse,4,02/03/2022,Yes\\n"
- ... "Brittle stars,5,03/03/2022,No\\n"
- ... "Centipede,100,04/03/2022,No\\n"
- ... ",6,05/03/2022,"
- ... )
- >>> print(s)
- animals,n_legs,entry,fast
- Flamingo,2,01/03/2022,Yes
- Horse,4,02/03/2022,Yes
- Brittle stars,5,03/03/2022,No
- Centipede,100,04/03/2022,No
- ,6,05/03/2022,
-
- Change the type of a column:
-
- >>> import pyarrow as pa
- >>> from pyarrow import csv
- >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()})
- >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
- pyarrow.Table
- animals: string
- n_legs: double
- entry: string
- fast: string
- ----
- animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
- n_legs: [[2,4,5,100,6]]
- entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]]
- fast: [["Yes","Yes","No","No",""]]
-
- Define a date parsing format to get a timestamp type column
- (in case dates are not in ISO format and not converted by default):
-
- >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"])
- >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
- pyarrow.Table
- animals: string
- n_legs: int64
- entry: timestamp[s]
- fast: string
- ----
- animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
- n_legs: [[2,4,5,100,6]]
- entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]]
- fast: [["Yes","Yes","No","No",""]]
-
- Specify a subset of columns to be read:
-
- >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"])
- >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
- pyarrow.Table
- animals: string
- n_legs: int64
- ----
- animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
- n_legs: [[2,4,5,100,6]]
-
- List additional column to be included as a null typed column:
-
- >>> convert_options = csv.ConvertOptions(
- ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True
- ... )
- >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
- pyarrow.Table
- animals: string
- n_legs: int64
- location: null
- ----
- animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
- n_legs: [[2,4,5,100,6]]
- location: [5 nulls]
-
- Define columns as dictionary type (by default only the
- string/binary columns are dictionary encoded):
-
- >>> convert_options = csv.ConvertOptions(
- ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True
- ...
) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: dictionary - n_legs: int64 - entry: timestamp[s] - fast: dictionary - ---- - animals: [ -- dictionary: - ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: - [0,1,2,3,4]] - n_legs: [[2,4,5,100,6]] - entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] - fast: [ -- dictionary: - ["Yes","No",""] -- indices: - [0,0,1,1,2]] - - Set upper limit for the number of categories. If the categories - is more than the limit, the conversion to dictionary will not - happen: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] - - Set empty strings to missing values: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["animals", "n_legs"], strings_can_be_null=True - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - animals: string - n_legs: int64 - ---- - animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] - n_legs: [[2,4,5,100,6]] - - Define values to be True and False when converting a column - into a bool type: - - >>> convert_options = csv.ConvertOptions( - ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] - ... ) - >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) - pyarrow.Table - fast: bool - ---- - fast: [[true,true,false,false,null]] - """ - - check_utf8: bool = field(default=True, kw_only=False) - column_types: lib.Schema | dict | None = None - null_values: list[str] | None = None - true_values: list[str] | None = None - false_values: list[str] | None = None - decimal_point: str = "." - strings_can_be_null: bool = False - quoted_strings_can_be_null: bool = True - include_columns: list[str] | None = None - include_missing_columns: bool = False - auto_dict_encode: bool = False - auto_dict_max_cardinality: int | None = None - timestamp_parsers: list[str] | None = None - - def validate(self) -> None: ... - -@dataclass(kw_only=True) -class WriteOptions(lib._Weakrefable): - """ - Options for writing CSV files. - - Parameters - ---------- - include_header : bool, optional (default True) - Whether to write an initial header line with column names - batch_size : int, optional (default 1024) - How many rows to process together when converting and writing - CSV data - delimiter : 1-character string, optional (default ",") - The character delimiting individual cells in the CSV data. - quoting_style : str, optional (default "needed") - Whether to quote values, and if so, which quoting style to use. - The following values are accepted: - - - "needed" (default): only enclose values in quotes when needed. - - "all_valid": enclose all valid values in quotes; nulls are not quoted. - - "none": do not enclose any values in quotes; values containing - special characters (such as quotes, cell delimiters or line endings) - will raise an error. - """ - - include_header: bool = field(default=True, kw_only=False) - batch_size: int = 1024 - delimiter: str = "," - quoting_style: Literal["needed", "all_valid", "none"] = "needed" - - def validate(self) -> None: ... 
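Taken together, the reader options above and this WriteOptions class cover a full CSV round trip. A small sketch of how they compose (the in-memory sink and sample column values are illustrative only, not part of the stubs):

import io
import pyarrow as pa
from pyarrow import csv

table = pa.table({"animals": ["Flamingo", "Horse"], "n_legs": [2, 4]})

# Write with a non-default delimiter ...
sink = io.BytesIO()
csv.write_csv(table, sink, csv.WriteOptions(include_header=True, delimiter=";"))

# ... and read it back, overriding the inferred type of one column.
roundtrip = csv.read_csv(
    io.BytesIO(sink.getvalue()),
    parse_options=csv.ParseOptions(delimiter=";"),
    convert_options=csv.ConvertOptions(column_types={"n_legs": pa.int8()}),
)
assert roundtrip.column("n_legs").type == pa.int8()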
- -@dataclass -class InvalidRow(lib._Weakrefable): - """ - Description of an invalid row in a CSV file. - - Parameters - ---------- - expected_columns : int - The expected number of columns in the row. - actual_columns : int - The actual number of columns in the row. - number : int or None - The physical row number if known, otherwise None. - text : str - The contents of the row. - """ - - expected_columns: int - actual_columns: int - number: int | None - text: str - -class CSVWriter(lib._CRecordBatchWriter): - """ - Writer to create a CSV file. - - Parameters - ---------- - sink : str, path, pyarrow.OutputStream or file-like object - The location where to write the CSV data. - schema : pyarrow.Schema - The schema of the data to be written. - write_options : pyarrow.csv.WriteOptions - Options to configure writing the CSV data. - memory_pool : MemoryPool, optional - Pool for temporary allocations. - """ - - def __init__( - self, - # TODO: OutputStream - sink: StrPath | IO[Any], - schema: lib.Schema, - write_options: WriteOptions | None = None, - *, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class CSVStreamingReader(lib.RecordBatchReader): ... - -ISO8601: lib._Weakrefable - -def open_csv( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - convert_options: ConvertOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> CSVStreamingReader: - """ - Open a streaming reader of CSV data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.csv.ReadOptions, optional - Options for the CSV reader (see pyarrow.csv.ReadOptions constructor - for defaults) - parse_options : pyarrow.csv.ParseOptions, optional - Options for the CSV parser - (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options : pyarrow.csv.ConvertOptions, optional - Options for converting CSV data - (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.csv.CSVStreamingReader` - """ - -def read_csv( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - convert_options: ConvertOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Table: - """ - Read a Table from a stream of CSV data. - - Parameters - ---------- - input_file : string, path or file-like object - The location of CSV data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. 
- read_options : pyarrow.csv.ReadOptions, optional
- Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
- for defaults)
- parse_options : pyarrow.csv.ParseOptions, optional
- Options for the CSV parser
- (see pyarrow.csv.ParseOptions constructor for defaults)
- convert_options : pyarrow.csv.ConvertOptions, optional
- Options for converting CSV data
- (see pyarrow.csv.ConvertOptions constructor for defaults)
- memory_pool : MemoryPool, optional
- Pool to allocate Table memory from
-
- Returns
- -------
- :class:`pyarrow.Table`
- Contents of the CSV file as an in-memory table.
-
- Examples
- --------
-
- Defining an example file from bytes object:
-
- >>> import io
- >>> s = (
- ... "animals,n_legs,entry\\n"
- ... "Flamingo,2,2022-03-01\\n"
- ... "Horse,4,2022-03-02\\n"
- ... "Brittle stars,5,2022-03-03\\n"
- ... "Centipede,100,2022-03-04"
- ... )
- >>> print(s)
- animals,n_legs,entry
- Flamingo,2,2022-03-01
- Horse,4,2022-03-02
- Brittle stars,5,2022-03-03
- Centipede,100,2022-03-04
- >>> source = io.BytesIO(s.encode())
-
- Reading from the file
-
- >>> from pyarrow import csv
- >>> csv.read_csv(source)
- pyarrow.Table
- animals: string
- n_legs: int64
- entry: date32[day]
- ----
- animals: [["Flamingo","Horse","Brittle stars","Centipede"]]
- n_legs: [[2,4,5,100]]
- entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]]
- """
-
-def write_csv(
- data: lib.RecordBatch | lib.Table,
- output_file: StrPath | lib.NativeFile | IO[Any],
- write_options: WriteOptions | None = None,
- memory_pool: lib.MemoryPool | None = None,
-) -> None:
- """
- Write record batch or table to a CSV file.
-
- Parameters
- ----------
- data : pyarrow.RecordBatch or pyarrow.Table
- The data to write.
- output_file : string, path, pyarrow.NativeFile, or file-like object
- The location where to write the CSV data.
- write_options : pyarrow.csv.WriteOptions
- Options to configure writing the CSV data.
- memory_pool : MemoryPool, optional
- Pool for temporary allocations.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> from pyarrow import csv
-
- >>> legs = pa.array([2, 4, 5, 100])
- >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])
- >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"])
- >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"])
-
- >>> csv.write_csv(table, "animals.csv")
-
- >>> write_options = csv.WriteOptions(include_header=False)
- >>> csv.write_csv(table, "animals.csv", write_options=write_options)
-
- >>> write_options = csv.WriteOptions(delimiter=";")
- >>> csv.write_csv(table, "animals.csv", write_options=write_options)
- """
diff --git a/pyarrow-stubs/_cuda.pyi b/pyarrow-stubs/_cuda.pyi
deleted file mode 100644
index ad52b2f380f..00000000000
--- a/pyarrow-stubs/_cuda.pyi
+++ /dev/null
@@ -1,556 +0,0 @@
-from typing import Any
-
-import cuda # type: ignore[import-not-found]
-
-from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found]
-
-from . import lib
-from ._stubs_typing import ArrayLike
-
-class Context(lib._Weakrefable):
- """
- CUDA driver context.
- """
-
- def __init__(self, device_number: int = 0, handle: int | None = None) -> None:
- """
- Create a CUDA driver context for a particular device.
-
- If a CUDA context handle is passed, it is wrapped, otherwise
- a default CUDA context for the given device is requested.
- - Parameters - ---------- - device_number : int (default 0) - Specify the GPU device for which the CUDA driver context is - requested. - handle : int, optional - Specify CUDA handle for a shared context that has been created - by another library. - """ - @staticmethod - def from_numba(context: _numba_driver.Context | None = None) -> Context: - """ - Create a Context instance from a Numba CUDA context. - - Parameters - ---------- - context : {numba.cuda.cudadrv.driver.Context, None} - A Numba CUDA context instance. - If None, the current Numba context is used. - - Returns - ------- - shared_context : pyarrow.cuda.Context - Context instance. - """ - def to_numba(self) -> _numba_driver.Context: - """ - Convert Context to a Numba CUDA context. - - Returns - ------- - context : numba.cuda.cudadrv.driver.Context - Numba CUDA context instance. - """ - @staticmethod - def get_num_devices() -> int: - """Return the number of GPU devices.""" - @property - def device_number(self) -> int: - """Return context device number.""" - @property - def handle(self) -> int: - """Return pointer to context handle.""" - def synchronize(self) -> None: - """Blocks until the device has completed all preceding requested - tasks. - """ - @property - def bytes_allocated(self) -> int: - """Return the number of allocated bytes.""" - def get_device_address(self, address: int) -> int: - """Return the device address that is reachable from kernels running in - the context - - Parameters - ---------- - address : int - Specify memory address value - - Returns - ------- - device_address : int - Device address accessible from device context - - Notes - ----- - The device address is defined as a memory address accessible - by device. While it is often a device memory address but it - can be also a host memory address, for instance, when the - memory is allocated as host memory (using cudaMallocHost or - cudaHostAlloc) or as managed memory (using cudaMallocManaged) - or the host memory is page-locked (using cudaHostRegister). - """ - def new_buffer(self, nbytes: int) -> CudaBuffer: - """Return new device buffer. - - Parameters - ---------- - nbytes : int - Specify the number of bytes to be allocated. - - Returns - ------- - buf : CudaBuffer - Allocated buffer. - """ - @property - def memory_manager(self) -> lib.MemoryManager: - """ - The default memory manager tied to this context's device. - - Returns - ------- - MemoryManager - """ - @property - def device(self) -> lib.Device: - """ - The device instance associated with this context. - - Returns - ------- - Device - """ - def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: - """ - Create device buffer from address and size as a view. - - The caller is responsible for allocating and freeing the - memory. When `address==size==0` then a new zero-sized buffer - is returned. - - Parameters - ---------- - address : int - Specify the starting address of the buffer. The address can - refer to both device or host memory but it must be - accessible from device after mapping it with - `get_device_address` method. - size : int - Specify the size of device buffer in bytes. - base : {None, object} - Specify object that owns the referenced memory. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device reachable memory. 
- - """ - def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: - """Open existing CUDA IPC memory handle - - Parameters - ---------- - ipc_handle : IpcMemHandle - Specify opaque pointer to CUipcMemHandle (driver API). - - Returns - ------- - buf : CudaBuffer - referencing device buffer - """ - def buffer_from_data( - self, - data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, - offset: int = 0, - size: int = -1, - ) -> CudaBuffer: - """Create device buffer and initialize with data. - - Parameters - ---------- - data : {CudaBuffer, HostBuffer, Buffer, array-like} - Specify data to be copied to device buffer. - offset : int - Specify the offset of input buffer for device data - buffering. Default: 0. - size : int - Specify the size of device buffer in bytes. Default: all - (starting from input offset) - - Returns - ------- - cbuf : CudaBuffer - Device buffer with copied data. - """ - def buffer_from_object(self, obj: Any) -> CudaBuffer: - """Create device buffer view of arbitrary object that references - device accessible memory. - - When the object contains a non-contiguous view of device - accessible memory then the returned device buffer will contain - contiguous view of the memory, that is, including the - intermediate data that is otherwise invisible to the input - object. - - Parameters - ---------- - obj : {object, Buffer, HostBuffer, CudaBuffer, ...} - Specify an object that holds (device or host) address that - can be accessed from device. This includes objects with - types defined in pyarrow.cuda as well as arbitrary objects - that implement the CUDA array interface as defined by numba. - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of device accessible memory. - - """ - -class IpcMemHandle(lib._Weakrefable): - """A serializable container for a CUDA IPC handle.""" - @staticmethod - def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: - """Create IpcMemHandle from opaque buffer (e.g. from another - process) - - Parameters - ---------- - opaque_handle : - a CUipcMemHandle as a const void* - - Returns - ------- - ipc_handle : IpcMemHandle - """ - def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: - """Write IpcMemHandle to a Buffer - - Parameters - ---------- - pool : {MemoryPool, None} - Specify a pool to allocate memory from - - Returns - ------- - buf : Buffer - The serialized buffer. - """ - -class CudaBuffer(lib.Buffer): - """An Arrow buffer with data located in a GPU device. - - To create a CudaBuffer instance, use Context.device_buffer(). - - The memory allocated in a CudaBuffer is freed when the buffer object - is deleted. - """ - - @staticmethod - def from_buffer(buf: lib.Buffer) -> CudaBuffer: - """Convert back generic buffer into CudaBuffer - - Parameters - ---------- - buf : Buffer - Specify buffer containing CudaBuffer - - Returns - ------- - dbuf : CudaBuffer - Resulting device buffer. - """ - @staticmethod - def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: - """Create a CudaBuffer view from numba MemoryPointer instance. - - Parameters - ---------- - mem : numba.cuda.cudadrv.driver.MemoryPointer - - Returns - ------- - cbuf : CudaBuffer - Device buffer as a view of numba MemoryPointer. 
- """ - def to_numba(self) -> _numba_driver.MemoryPointer: - """Return numba memory pointer of CudaBuffer instance.""" - def copy_to_host( - self, - position: int = 0, - nbytes: int = -1, - buf: lib.Buffer | None = None, - memory_pool: lib.MemoryPool | None = None, - resizable: bool = False, - ) -> lib.Buffer: - """Copy memory from GPU device to CPU host - - Caller is responsible for ensuring that all tasks affecting - the memory are finished. Use - - `.context.synchronize()` - - when needed. - - Parameters - ---------- - position : int - Specify the starting position of the source data in GPU - device buffer. Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - the position until host buffer is full). - buf : Buffer - Specify a pre-allocated output buffer in host. Default: None - (allocate new output buffer). - memory_pool : MemoryPool - resizable : bool - Specify extra arguments to allocate_buffer. Used only when - buf is None. - - Returns - ------- - buf : Buffer - Output buffer in host. - - """ - def copy_from_host( - self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 - ) -> int: - """Copy data from host to device. - - The device buffer must be pre-allocated. - - Parameters - ---------- - data : {Buffer, array-like} - Specify data in host. It can be array-like that is valid - argument to py_buffer - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - """ - def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: - """Copy data from device to device. - - Parameters - ---------- - buf : CudaBuffer - Specify source device buffer. - position : int - Specify the starting position of the copy in device buffer. - Default: 0. - nbytes : int - Specify the number of bytes to copy. Default: -1 (all from - source until device buffer, starting from position, is full) - - Returns - ------- - nbytes : int - Number of bytes copied. - - """ - def export_for_ipc(self) -> IpcMemHandle: - """ - Expose this device buffer as IPC memory which can be used in other - processes. - - After calling this function, this device memory will not be - freed when the CudaBuffer is destructed. - - Returns - ------- - ipc_handle : IpcMemHandle - The exported IPC handle - - """ - @property - def context(self) -> Context: - """Returns the CUDA driver context of this buffer.""" - def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: - """Return slice of device buffer - - Parameters - ---------- - offset : int, default 0 - Specify offset from the start of device buffer to slice - length : int, default None - Specify the length of slice (default is until end of device - buffer starting from offset). If the length is larger than - the data available, the returned slice will have a size of - the available data starting from the offset. - - Returns - ------- - sliced : CudaBuffer - Zero-copy slice of device buffer. - - """ - def to_pybytes(self) -> bytes: - """Return device buffer content as Python bytes.""" - -class HostBuffer(lib.Buffer): - """Device-accessible CPU memory created using cudaHostAlloc. - - To create a HostBuffer instance, use - - cuda.new_host_buffer() - """ - @property - def size(self) -> int: ... 
- -class BufferReader(lib.NativeFile): - """File interface for zero-copy read from CUDA buffers. - - Note: Read methods return pointers to device memory. This means - you must be careful using this interface with any Arrow code which - may expect to be able to do anything other than pointer arithmetic - on the returned buffers. - """ - def __init__(self, obj: CudaBuffer) -> None: ... - def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: - """Return a slice view of the underlying device buffer. - - The slice will start at the current reader position and will - have specified size in bytes. - - Parameters - ---------- - nbytes : int, default None - Specify the number of bytes to read. Default: None (read all - remaining bytes). - - Returns - ------- - cbuf : CudaBuffer - New device buffer. - - """ - -class BufferWriter(lib.NativeFile): - """File interface for writing to CUDA buffers. - - By default writes are unbuffered. Use set_buffer_size to enable - buffering. - """ - def __init__(self, obj: CudaBuffer) -> None: ... - def writeat(self, position: int, data: ArrayLike) -> None: - """Write data to buffer starting from position. - - Parameters - ---------- - position : int - Specify device buffer position where the data will be - written. - data : array-like - Specify data, the data instance must implement buffer - protocol. - """ - @property - def buffer_size(self) -> int: - """Returns size of host (CPU) buffer, 0 for unbuffered""" - @buffer_size.setter - def buffer_size(self, buffer_size: int): - """Set CPU buffer size to limit calls to cudaMemcpy - - Parameters - ---------- - buffer_size : int - Specify the size of CPU buffer to allocate in bytes. - """ - @property - def num_bytes_buffered(self) -> int: - """Returns number of bytes buffered on host""" - -def new_host_buffer(size: int, device: int = 0) -> HostBuffer: - """Return buffer with CUDA-accessible memory on CPU host - - Parameters - ---------- - size : int - Specify the number of bytes to be allocated. - device : int - Specify GPU device number. - - Returns - ------- - dbuf : HostBuffer - Allocated host buffer - """ - -def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: - """Write record batch message to GPU device memory - - Parameters - ---------- - batch : RecordBatch - Record batch to write - ctx : Context - CUDA Context to allocate device memory from - - Returns - ------- - dbuf : CudaBuffer - device buffer which contains the record batch message - """ - -def read_message( - source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None -) -> lib.Message: - """Read Arrow IPC message located on GPU device - - Parameters - ---------- - source : {CudaBuffer, cuda.BufferReader} - Device buffer or reader of device buffer. - pool : MemoryPool (optional) - Pool to allocate CPU memory for the metadata - - Returns - ------- - message : Message - The deserialized message, body still on device - """ - -def read_record_batch( - buffer: lib.Buffer, - object: lib.Schema, - *, - dictionary_memo: lib.DictionaryMemo | None = None, - pool: lib.MemoryPool | None = None, -) -> lib.RecordBatch: - """Construct RecordBatch referencing IPC message located on CUDA device. - - While the metadata is copied to host memory for deserialization, - the record batch data remains on the device. 
- - Parameters - ---------- - buffer : - Device buffer containing the complete IPC message - schema : Schema - The schema for the record batch - dictionary_memo : DictionaryMemo, optional - If message contains dictionaries, must pass a populated - DictionaryMemo - pool : MemoryPool (optional) - Pool to allocate metadata from - - Returns - ------- - batch : RecordBatch - Reconstructed record batch, with device pointers - - """ diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi deleted file mode 100644 index af864f9154b..00000000000 --- a/pyarrow-stubs/_dataset.pyi +++ /dev/null @@ -1,2299 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import ( - IO, - Any, - Callable, - Generic, - Iterator, - Literal, - NamedTuple, - TypeVar, - overload, -) - -from _typeshed import StrPath - -from . import _csv, _json, _parquet, lib -from ._fs import FileSelector, FileSystem, SupportedFileSystem -from ._stubs_typing import Indices, JoinType, Order -from .acero import ExecNodeOptions -from .compute import Expression -from .ipc import IpcWriteOptions, RecordBatchReader - -class Dataset(lib._Weakrefable): - """ - Collection of data fragments and potentially child datasets. - - Arrow Datasets allow you to query against data that has been split across - multiple files. This sharding of data may indicate partitioning, which - can accelerate queries that only touch some partitions (files). - """ - - @property - def partition_expression(self) -> Expression: - """ - An Expression which evaluates to true for all data viewed by this - Dataset. - """ - def replace_schema(self, schema: lib.Schema) -> None: - """ - Return a copy of this Dataset with a different schema. - - The copy will view the same Fragments. If the new schema is not - compatible with the original dataset's schema then an error will - be raised. - - Parameters - ---------- - schema : Schema - The new dataset schema. - """ - def get_fragments(self, filter: Expression | None = None): - """Returns an iterator over the fragments in this dataset. - - Parameters - ---------- - filter : Expression, default None - Return fragments matching the optional filter, either using the - partition_expression or internal information like Parquet's - statistics. - - Returns - ------- - fragments : iterator of Fragment - """ - def scanner( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the dataset. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). - - See the :meth:`Scanner.from_dataset` method for further information. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. 
- - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "dataset_scanner.parquet") - - >>> import pyarrow.dataset as ds - >>> dataset = ds.dataset("dataset_scanner.parquet") - - Selecting a subset of the columns: - - >>> dataset.scanner(columns=["year", "n_legs"]).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - ---- - year: [[2020,2022,2021,2022,2019,2021]] - n_legs: [[2,2,4,4,5,100]] - - Projecting selected columns using an expression: - - >>> dataset.scanner( - ... columns={ - ... "n_legs_uint": ds.field("n_legs").cast("uint8"), - ... } - ... 
).to_table() - pyarrow.Table - n_legs_uint: uint8 - ---- - n_legs_uint: [[2,2,4,4,5,100]] - - Filtering rows while scanning: - - >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() - pyarrow.Table - year: int64 - n_legs: int64 - animal: string - ---- - year: [[2022,2021,2022,2021]] - n_legs: [[2,4,4,100]] - animal: [["Parrot","Dog","Horse","Centipede"]] - """ - def to_batches( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the dataset as materialized record batches. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. 
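As a hedged illustration of the streaming read path described above (the directory path and column names are hypothetical):

import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet")
# Stream bounded chunks instead of materializing the whole dataset at once.
for batch in dataset.to_batches(columns=["year", "n_legs"], batch_size=65_536):
    print(batch.num_rows)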
- - Returns - ------- - record_batches : iterator of RecordBatch - """ - def to_table( - self, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Read the dataset to an Arrow table. - - Note that this method reads all the selected data from the dataset - into memory. - - Parameters - ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ - def take( - self, - indices: Indices, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. 
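A small sketch of index-based selection with take(), assuming a hypothetical on-disk Parquet dataset:

import pyarrow as pa
import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet")
# Fetch only rows 0, 2 and 5 (in dataset order), projecting a single column.
subset = dataset.take(pa.array([0, 2, 5]), columns=["year"])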
- - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ - def head( - self, - num_rows: int, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. 
- - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - table : Table - """ - def count_rows( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. 
- fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ - @property - def schema(self) -> lib.Schema: - """The common schema of the full Dataset""" - def filter(self, expression: Expression) -> Self: - """ - Apply a row filter to the dataset. - - Parameters - ---------- - expression : Expression - The filter that should be applied to the dataset. - - Returns - ------- - Dataset - """ - def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: - """ - Sort the Dataset by one or multiple columns. - - Parameters - ---------- - sorting : str or list[tuple(name, order)] - Name of the column to use to sort (ascending), or - a list of multiple sorting conditions where - each entry is a tuple with column name - and sorting order ("ascending" or "descending") - **kwargs : dict, optional - Additional sorting options. - As allowed by :class:`SortOptions` - - Returns - ------- - InMemoryDataset - A new dataset sorted according to the sort keys. - """ - def join( - self, - right_dataset: Dataset, - keys: str | list[str], - right_keys: str | list[str] | None = None, - join_type: JoinType = "left outer", - left_suffix: str | None = None, - right_suffix: str | None = None, - coalesce_keys: bool = True, - use_threads: bool = True, - ) -> InMemoryDataset: - """ - Perform a join between this dataset and another one. - - Result of the join will be a new dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - keys : str or list[str] - The columns from current dataset that should be used as keys - of the join operation left side. - right_keys : str or list[str], default None - The columns from the right_dataset that should be used as keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - join_type : str, default "left outer" - The kind of join that should be performed, one of - ("left semi", "right semi", "left anti", "right anti", - "inner", "left outer", "right outer", "full outer") - left_suffix : str, default None - Which suffix to add to right column names. This prevents confusion - when the columns in left and right datasets have colliding names. - right_suffix : str, default None - Which suffix to add to the left column names. This prevents confusion - when the columns in left and right datasets have colliding names. - coalesce_keys : bool, default True - If the duplicated keys should be omitted from one of the sides - in the join result. - use_threads : bool, default True - Whenever to use multithreading or not. - - Returns - ------- - InMemoryDataset - """ - def join_asof( - self, - right_dataset: Dataset, - on: str, - by: str | list[str], - tolerance: int, - right_on: str | list[str] | None = None, - right_by: str | list[str] | None = None, - ) -> InMemoryDataset: - """ - Perform an asof join between this dataset and another one. 
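A hedged sketch of the join() and sort_by() operations documented above; the two in-memory datasets and their column names are hypothetical:

import pyarrow as pa
import pyarrow.dataset as ds

left = ds.dataset(pa.table({"id": [1, 2, 3], "year": [2020, 2021, 2022]}))
right = ds.dataset(pa.table({"id": [2, 3], "animal": ["Dog", "Horse"]}))

# Both join() and sort_by() materialize their result as an InMemoryDataset.
joined = left.join(right, keys="id", join_type="left outer")
ordered = joined.sort_by([("year", "descending")])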
- - This is similar to a left-join except that we match on nearest key rather - than equal keys. Both datasets must be sorted by the key. This type of join - is most useful for time series data that are not perfectly aligned. - - Optionally match on equivalent keys with "by" before searching with "on". - - Result of the join will be a new Dataset, where further - operations can be applied. - - Parameters - ---------- - right_dataset : dataset - The dataset to join to the current one, acting as the right dataset - in the join operation. - on : str - The column from current dataset that should be used as the "on" key - of the join operation left side. - - An inexact match is used on the "on" key, i.e. a row is considered a - match if and only if left_on - tolerance <= right_on <= left_on. - - The input table must be sorted by the "on" key. Must be a single - field of a common type. - - Currently, the "on" key must be an integer, date, or timestamp type. - by : str or list[str] - The columns from current dataset that should be used as the keys - of the join operation left side. The join operation is then done - only for the matches in these columns. - tolerance : int - The tolerance for inexact "on" key matching. A right row is considered - a match with the left row `right.on - left.on <= tolerance`. The - `tolerance` may be: - - - negative, in which case a past-as-of-join occurs; - - or positive, in which case a future-as-of-join occurs; - - or zero, in which case an exact-as-of-join occurs. - - The tolerance is interpreted in the same units as the "on" key. - right_on : str or list[str], default None - The columns from the right_dataset that should be used as the on key - on the join operation right side. - When ``None`` use the same key name as the left dataset. - right_by : str or list[str], default None - The columns from the right_dataset that should be used as by keys - on the join operation right side. - When ``None`` use the same key names as the left dataset. - - Returns - ------- - InMemoryDataset - """ - -class InMemoryDataset(Dataset): - """ - A Dataset wrapping in-memory data. - - Parameters - ---------- - source : RecordBatch, Table, list, tuple - The data for this dataset. Can be a RecordBatch, Table, list of - RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader - If an iterable is provided, the schema must also be provided. - schema : Schema, optional - Only required if passing an iterable as the source - """ - -class UnionDataset(Dataset): - """ - A Dataset wrapping child datasets. - - Children's schemas must agree with the provided schema. - - Parameters - ---------- - schema : Schema - A known schema to conform to. - children : list of Dataset - One or more input children - """ - - @property - def children(self) -> list[Dataset]: ... - -class FileSystemDataset(Dataset): - """ - A Dataset of file fragments. - - A FileSystemDataset is composed of one or more FileFragment. - - Parameters - ---------- - fragments : list[Fragments] - List of fragments to consume. - schema : Schema - The top-level schema of the Dataset. - format : FileFormat - File format of the fragments, currently only ParquetFileFormat, - IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - FileSystem of the fragments. - root_partition : Expression, optional - The top-level partition of the DataDataset. 
- """ - - def __init__( - self, - fragments: list[Fragment], - schema: lib.Schema, - format: FileFormat, - filesystem: SupportedFileSystem | None = None, - root_partition: Expression | None = None, - ) -> None: ... - @classmethod - def from_paths( - cls, - paths: list[str], - schema: lib.Schema | None = None, - format: FileFormat | None = None, - filesystem: SupportedFileSystem | None = None, - partitions: list[Expression] | None = None, - root_partition: Expression | None = None, - ) -> FileSystemDataset: - """ - A Dataset created from a list of paths on a particular filesystem. - - Parameters - ---------- - paths : list of str - List of file paths to create the fragments from. - schema : Schema - The top-level schema of the DataDataset. - format : FileFormat - File format to create fragments from, currently only - ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. - filesystem : FileSystem - The filesystem which files are from. - partitions : list[Expression], optional - Attach additional partition information for the file paths. - root_partition : Expression, optional - The top-level partition of the DataDataset. - """ - @property - def filesystem(self) -> FileSystem: ... - @property - def partitioning(self) -> Partitioning | None: - """ - The partitioning of the Dataset source, if discovered. - - If the FileSystemDataset is created using the ``dataset()`` factory - function with a partitioning specified, this will return the - finalized Partitioning object from the dataset discovery. In all - other cases, this returns None. - """ - @property - def files(self) -> list[str]: - """List of the files""" - @property - def format(self) -> FileFormat: - """The FileFormat of this source.""" - -class FileWriteOptions(lib._Weakrefable): - @property - def format(self) -> FileFormat: ... - -class FileFormat(lib._Weakrefable): - def inspect( - self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None - ) -> lib.Schema: - """ - Infer the schema of a file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to infer a schema from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - - Returns - ------- - schema : Schema - The schema inferred from the file - """ - def make_fragment( - self, - file: StrPath | IO, - filesystem: SupportedFileSystem | None = None, - partition_expression: Expression | None = None, - *, - file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. Allows - fragment to be potentially skipped while scanning with a filter. - file_size : int, optional - The size of the file in bytes. Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ - def make_write_options(self) -> FileWriteOptions: ... - @property - def default_extname(self) -> str: ... - @property - def default_fragment_scan_options(self) -> FragmentScanOptions: ... 
- @default_fragment_scan_options.setter - def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... - -class Fragment(lib._Weakrefable): - """Fragment of data from a Dataset.""" - @property - def physical_schema(self) -> lib.Schema: - """Return the physical schema of this Fragment. This schema can be - different from the dataset read schema.""" - @property - def partition_expression(self) -> Expression: - """An Expression which evaluates to true for all data viewed by this - Fragment. - """ - def scanner( - self, - schema: lib.Schema | None = None, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Build a scan operation against the fragment. - - Data is not loaded immediately. Instead, this produces a Scanner, - which exposes further operations (e.g. loading all data as a - table, counting rows). - - Parameters - ---------- - schema : Schema - Schema to use for scanning. This is used to unify a Fragment to - its Dataset's schema. If not specified this will use the - Fragment's physical schema which might differ for each Fragment. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. 
- cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - scanner : Scanner - """ - def to_batches( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Iterator[lib.RecordBatch]: - """ - Read the fragment as materialized record batches. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. 
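To make the fragment-level API above concrete, a brief sketch; the hive-partitioned directory and field names are hypothetical:

import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet", partitioning="hive")
for fragment in dataset.get_fragments(filter=ds.field("year") == 2021):
    # Each fragment (typically one file) can be scanned independently.
    table = fragment.to_table(columns=["n_legs"])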
- - Returns - ------- - record_batches : iterator of RecordBatch - """ - def to_table( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Convert this Fragment into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Parameters - ---------- - schema : Schema, optional - Concrete schema to use for scanning. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. 
- - Returns - ------- - table : Table - """ - def take( - self, - indices: Indices, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Select rows of data by index. - - Parameters - ---------- - indices : Array or array-like - The indices of row to select in the dataset. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ - def head( - self, - num_rows: int, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> lib.Table: - """ - Load the first N rows of the fragment. - - Parameters - ---------- - num_rows : int - The number of rows to load. 
- columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - Table - """ - def count_rows( - self, - columns: list[str] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> int: - """ - Count rows matching the scanner filter. - - Parameters - ---------- - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. 
Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - - Returns - ------- - count : int - """ - -class FileFragment(Fragment): - """A Fragment representing a data file.""" - - def open(self) -> lib.NativeFile: - """ - Open a NativeFile of the buffer or file viewed by this fragment. - """ - @property - def path(self) -> str: - """ - The path of the data file viewed by this fragment, if it views a - file. If instead it views a buffer, this will be "". - """ - @property - def filesystem(self) -> FileSystem: - """ - The FileSystem containing the data file viewed by this fragment, if - it views a file. If instead it views a buffer, this will be None. - """ - @property - def buffer(self) -> lib.Buffer: - """ - The buffer viewed by this fragment, if it views a buffer. If - instead it views a file, this will be None. - """ - @property - def format(self) -> FileFormat: - """ - The format of the data file viewed by this fragment. - """ - -class FragmentScanOptions(lib._Weakrefable): - """Scan options specific to a particular fragment and scan operation.""" - - @property - def type_name(self) -> str: ... - -class IpcFileWriteOptions(FileWriteOptions): - @property - def write_options(self) -> IpcWriteOptions: ... - @write_options.setter - def write_options(self, write_options: IpcWriteOptions) -> None: ... - -class IpcFileFormat(FileFormat): - def equals(self, other: IpcFileFormat) -> bool: ... - def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... - @property - def default_extname(self) -> str: ... - -class FeatherFileFormat(IpcFileFormat): ... - -class CsvFileFormat(FileFormat): - """ - FileFormat for CSV files. - - Parameters - ---------- - parse_options : pyarrow.csv.ParseOptions - Options regarding CSV parsing. - default_fragment_scan_options : CsvFragmentScanOptions - Default options for fragments scan. - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. - read_options : pyarrow.csv.ReadOptions - General read options. - """ - def __init__( - self, - parse_options: _csv.ParseOptions | None = None, - default_fragment_scan_options: CsvFragmentScanOptions | None = None, - convert_options: _csv.ConvertOptions | None = None, - read_options: _csv.ReadOptions | None = None, - ) -> None: ... - def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] - @property - def parse_options(self) -> _csv.ParseOptions: ... - @parse_options.setter - def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... - def equals(self, other: CsvFileFormat) -> bool: ... - -class CsvFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for CSV fragments. - - Parameters - ---------- - convert_options : pyarrow.csv.ConvertOptions - Options regarding value conversion. 
- read_options : pyarrow.csv.ReadOptions - General read options. - """ - - convert_options: _csv.ConvertOptions - read_options: _csv.ReadOptions - - def __init__( - self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions - ) -> None: ... - def equals(self, other: CsvFragmentScanOptions) -> bool: ... - -class CsvFileWriteOptions(FileWriteOptions): - write_options: _csv.WriteOptions - -class JsonFileFormat(FileFormat): - """ - FileFormat for JSON files. - - Parameters - ---------- - default_fragment_scan_options : JsonFragmentScanOptions - Default options for fragments scan. - parse_options : pyarrow.json.ParseOptions - Options regarding json parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ - def __init__( - self, - default_fragment_scan_options: JsonFragmentScanOptions | None = None, - parse_options: _json.ParseOptions | None = None, - read_options: _json.ReadOptions | None = None, - ) -> None: ... - def equals(self, other: JsonFileFormat) -> bool: ... - -class JsonFragmentScanOptions(FragmentScanOptions): - """ - Scan-specific options for JSON fragments. - - Parameters - ---------- - parse_options : pyarrow.json.ParseOptions - Options regarding JSON parsing. - read_options : pyarrow.json.ReadOptions - General read options. - """ - - parse_options: _json.ParseOptions - read_options: _json.ReadOptions - def __init__( - self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions - ) -> None: ... - def equals(self, other: JsonFragmentScanOptions) -> bool: ... - -class Partitioning(lib._Weakrefable): - def parse(self, path: str) -> Expression: - """ - Parse a path into a partition expression. - - Parameters - ---------- - path : str - - Returns - ------- - pyarrow.dataset.Expression - """ - def format(self, expr: Expression) -> tuple[str, str]: - """ - Convert a filter expression into a tuple of (directory, filename) using - the current partitioning scheme - - Parameters - ---------- - expr : pyarrow.dataset.Expression - - Returns - ------- - tuple[str, str] - - Examples - -------- - - Specify the Schema for paths like "/2009/June": - - >>> import pyarrow as pa - >>> import pyarrow.dataset as ds - >>> import pyarrow.compute as pc - >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) - >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) - ('1862/Jan', '') - """ - @property - def schema(self) -> lib.Schema: - """The arrow Schema attached to the partitioning.""" - -class PartitioningFactory(lib._Weakrefable): - @property - def type_name(self) -> str: ... - -class KeyValuePartitioning(Partitioning): - @property - def dictionaries(self) -> list[lib.Array | None]: - """ - The unique values for each partition field, if available. - - Those values are only available if the Partitioning object was - created through dataset discovery from a PartitioningFactory, or - if the dictionaries were manually specified in the constructor. - If no dictionary field is available, this returns an empty list. - """ - -class DirectoryPartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The DirectoryPartitioning expects one segment in the file path for each - field in the schema (all fields are required to be present). - For example given schema the path "/2009/11" would - be parsed to ("year"_ == 2009 and "month"_ == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. 
- dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - DirectoryPartitioning - - Examples - -------- - >>> from pyarrow.dataset import DirectoryPartitioning - >>> partitioning = DirectoryPartitioning( - ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("/2009/11/")) - ((year == 2009) and (month == 11)) - """ - - @staticmethod - def discover( - field_names: list[str] | None = None, - infer_dictionary: bool = False, - max_partition_dictionary_size: int = 0, - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a DirectoryPartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - -class HivePartitioning(KeyValuePartitioning): - """ - A Partitioning for "/$key=$value/" nested directories as found in - Apache Hive. - - Multi-level, directory based partitioning scheme originating from - Apache Hive with all data files stored in the leaf directories. Data is - partitioned by static values of a particular column in the schema. - Partition keys are represented in the form $key=$value in directory names. - Field order is ignored, as are missing or unrecognized field names. - - For example, given schema, a possible - path would be "/year=2009/month=11/day=15". - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. 
- null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - If any field is None then this fallback will be used as a label - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - HivePartitioning - - Examples - -------- - >>> from pyarrow.dataset import HivePartitioning - >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) - >>> print(partitioning.parse("/year=2009/month=11/")) - ((year == 2009) and (month == 11)) - - """ - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - null_fallback: str = "__HIVE_DEFAULT_PARTITION__", - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - @staticmethod - def discover( - infer_dictionary: bool = False, - max_partition_dictionary_size: int = 0, - null_fallback="__HIVE_DEFAULT_PARTITION__", - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a HivePartitioning. - - Parameters - ---------- - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain. This can be more efficient when - materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - max_partition_dictionary_size : int, default 0 - Synonymous with infer_dictionary for backwards compatibility with - 1.0: setting this to -1 or None is equivalent to passing - infer_dictionary=True. - null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" - When inferring a schema for partition fields this value will be - replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ - for compatibility with Spark - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - -class FilenamePartitioning(KeyValuePartitioning): - """ - A Partitioning based on a specified Schema. - - The FilenamePartitioning expects one segment in the file name for each - field in the schema (all fields are required to be present) separated - by '_'. For example given schema the name - ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). - - Parameters - ---------- - schema : Schema - The schema that describes the partitions present in the file path. - dictionaries : dict[str, Array] - If the type of any field of `schema` is a dictionary type, the - corresponding entry of `dictionaries` must be an array containing - every value which may be taken by the corresponding column or an - error will be raised in parsing. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - FilenamePartitioning - - Examples - -------- - >>> from pyarrow.dataset import FilenamePartitioning - >>> partitioning = FilenamePartitioning( - ... 
pa.schema([("year", pa.int16()), ("month", pa.int8())]) - ... ) - >>> print(partitioning.parse("2009_11_data.parquet")) - ((year == 2009) and (month == 11)) - """ - - def __init__( - self, - schema: lib.Schema, - dictionaries: dict[str, lib.Array] | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> None: ... - @staticmethod - def discover( - field_names: list[str] | None = None, - infer_dictionary: bool = False, - schema: lib.Schema | None = None, - segment_encoding: Literal["uri", "none"] = "uri", - ) -> PartitioningFactory: - """ - Discover a FilenamePartitioning. - - Parameters - ---------- - field_names : list of str - The names to associate with the values from the subdirectory names. - If schema is given, will be populated from the schema. - infer_dictionary : bool, default False - When inferring a schema for partition fields, yield dictionary - encoded types instead of plain types. This can be more efficient - when materializing virtual columns, and Expressions parsed by the - finished Partitioning will include dictionaries of all unique - inspected values for each field. - schema : Schema, default None - Use this schema instead of inferring a schema from partition - values. Partition values will be validated against this schema - before accumulation into the Partitioning's dictionary. - segment_encoding : str, default "uri" - After splitting paths into segments, decode the segments. Valid - values are "uri" (URI-decode segments) and "none" (leave as-is). - - Returns - ------- - PartitioningFactory - To be used in the FileSystemFactoryOptions. - """ - -class DatasetFactory(lib._Weakrefable): - """ - DatasetFactory is used to create a Dataset, inspect the Schema - of the fragments contained in it, and declare a partitioning. - """ - - root_partition: Expression - def finish(self, schema: lib.Schema | None = None) -> Dataset: - """ - Create a Dataset using the inspected schema or an explicit schema - (if given). - - Parameters - ---------- - schema : Schema, default None - The schema to conform the source to. If None, the inspected - schema is used. - - Returns - ------- - Dataset - """ - def inspect(self) -> lib.Schema: - """ - Inspect all data fragments and return a common Schema. - - Returns - ------- - Schema - """ - def inspect_schemas(self) -> list[lib.Schema]: ... - -class FileSystemFactoryOptions(lib._Weakrefable): - """ - Influences the discovery of filesystem paths. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning/PartitioningFactory, optional - Apply the Partitioning to every discovered Fragment. See Partitioning or - PartitioningFactory documentation. - exclude_invalid_files : bool, optional (default True) - If True, invalid files will be excluded (file format specific check). - This will incur IO for each files in a serial and single threaded - fashion. Disabling this feature will skip the IO, but unsupported - files may be present in the Dataset (resulting in an error at scan - time). - selector_ignore_prefixes : list, optional - When discovering from a Selector (and not from an explicit file list), - ignore files and directories matching any of these prefixes. - By default this is ['.', '_']. 
- """ - - partitioning: Partitioning - partitioning_factory: PartitioningFactory - partition_base_dir: str - exclude_invalid_files: bool - selector_ignore_prefixes: list[str] - - def __init__( - self, - artition_base_dir: str | None = None, - partitioning: Partitioning | PartitioningFactory | None = None, - exclude_invalid_files: bool = True, - selector_ignore_prefixes: list[str] | None = None, - ) -> None: ... - -class FileSystemDatasetFactory(DatasetFactory): - """ - Create a DatasetFactory from a list of paths with schema inspection. - - Parameters - ---------- - filesystem : pyarrow.fs.FileSystem - Filesystem to discover. - paths_or_selector : pyarrow.fs.FileSelector or list of path-likes - Either a Selector object or a list of path-like objects. - format : FileFormat - Currently only ParquetFileFormat and IpcFileFormat are supported. - options : FileSystemFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ - - def __init__( - self, - filesystem: SupportedFileSystem, - paths_or_selector: FileSelector, - format: FileFormat, - options: FileSystemFactoryOptions | None = None, - ) -> None: ... - -class UnionDatasetFactory(DatasetFactory): - """ - Provides a way to inspect/discover a Dataset's expected schema before - materialization. - - Parameters - ---------- - factories : list of DatasetFactory - """ - def __init__(self, factories: list[DatasetFactory]) -> None: ... - -_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) - -class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): - """An iterator over a sequence of record batches.""" - def __iter__(self) -> Self: ... - def __next__(self) -> _RecordBatchT: ... - -class TaggedRecordBatch(NamedTuple): - """ - A combination of a record batch and the fragment it came from. - - Parameters - ---------- - record_batch : RecordBatch - The record batch. - fragment : Fragment - Fragment of the record batch. - """ - - record_batch: lib.RecordBatch - fragment: Fragment - -class TaggedRecordBatchIterator(lib._Weakrefable): - """An iterator over a sequence of record batches with fragments.""" - def __iter__(self) -> Self: ... - def __next__(self) -> TaggedRecordBatch: ... - -class Scanner(lib._Weakrefable): - """A materialized scan operation with context and options bound. - - A scanner is the class that glues the scan tasks, data fragments and data - sources together. - """ - @staticmethod - def from_dataset( - dataset: Dataset, - *, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Dataset, - - Parameters - ---------- - dataset : Dataset - Dataset to scan. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. 
- - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @staticmethod - def from_fragment( - fragment: Fragment, - *, - schema: lib.Schema | None = None, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: - """ - Create Scanner from Fragment, - - Parameters - ---------- - fragment : Fragment - fragment to scan. - schema : Schema, optional - The schema of the fragment. - columns : list[str] or dict[str, Expression], default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). 
- - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @overload - @staticmethod - def from_batches( - source: Iterator[lib.RecordBatch], - *, - schema: lib.Schema, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... - @overload - @staticmethod - def from_batches( - source: RecordBatchReader, - *, - columns: list[str] | dict[str, Expression] | None = None, - filter: Expression | None = None, - batch_size: int = ..., - batch_readahead: int = 16, - fragment_readahead: int = 4, - fragment_scan_options: FragmentScanOptions | None = None, - use_threads: bool = True, - cache_metadata: bool = True, - memory_pool: lib.MemoryPool | None = None, - ) -> Scanner: ... - @staticmethod - def from_batches(*args, **kwargs): - """ - Create a Scanner from an iterator of batches. - - This creates a scanner which can be used only once. It is - intended to support writing a dataset (which takes a scanner) - from a source which can be read only once (e.g. a - RecordBatchReader or generator). - - Parameters - ---------- - source : Iterator or Arrow-compatible stream object - The iterator of Batches. This can be a pyarrow RecordBatchReader, - any object that implements the Arrow PyCapsule Protocol for - streams, or an actual Python iterator of RecordBatches. - schema : Schema - The schema of the batches (required when passing a Python - iterator). - columns : list[str] or dict[str, Expression], default None - The columns to project. 
This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - - The list of columns or expressions may use the special fields - `__batch_index` (the index of the batch within the fragment), - `__fragment_index` (the index of the fragment within the dataset), - `__last_in_fragment` (whether the batch is last in fragment), and - `__filename` (the name of the source file or a description of the - source fragment). - - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 131_072 - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_readahead : int, default 4 - The number of files to read ahead. Increasing this number will increase - RAM usage but could also improve IO utilization. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - cache_metadata : bool, default True - If enabled, metadata may be cached when scanning to speed up - repeated scans. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - """ - @property - def dataset_schema(self) -> lib.Schema: - """The schema with which batches will be read from fragments.""" - @property - def projected_schema(self) -> lib.Schema: - """ - The materialized schema of the data, accounting for projections. - - This is the schema of any data returned from the scanner. - """ - def to_batches(self) -> Iterator[lib.RecordBatch]: - """ - Consume a Scanner in record batches. - - Returns - ------- - record_batches : iterator of RecordBatch - """ - def scan_batches(self) -> TaggedRecordBatchIterator: - """ - Consume a Scanner in record batches with corresponding fragments. - - Returns - ------- - record_batches : iterator of TaggedRecordBatch - """ - def to_table(self) -> lib.Table: - """ - Convert a Scanner into a Table. - - Use this convenience utility with care. This will serially materialize - the Scan result in memory before creating the Table. - - Returns - ------- - Table - """ - def take(self, indices: Indices) -> lib.Table: - """ - Select rows of data by index. - - Will only consume as many batches of the underlying dataset as - needed. Otherwise, this is equivalent to - ``to_table().take(indices)``. 
- - Parameters - ---------- - indices : Array or array-like - indices of rows to select in the dataset. - - Returns - ------- - Table - """ - def head(self, num_rows: int) -> lib.Table: - """ - Load the first N rows of the dataset. - - Parameters - ---------- - num_rows : int - The number of rows to load. - - Returns - ------- - Table - """ - def count_rows(self) -> int: - """ - Count rows matching the scanner filter. - - Returns - ------- - count : int - """ - def to_reader(self) -> RecordBatchReader: - """Consume this scanner as a RecordBatchReader. - - Returns - ------- - RecordBatchReader - """ - -def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: - """ - Extract partition keys (equality constraints between a field and a scalar) - from an expression as a dict mapping the field's name to its value. - - NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning - will be conjunctions of equality conditions and are accessible through this - function. Other subexpressions will be ignored. - - Parameters - ---------- - partition_expression : pyarrow.dataset.Expression - - Returns - ------- - dict - - Examples - -------- - - For example, an expression of - - is converted to {'part': 'A', 'year': 2016} - """ - -class WrittenFile(lib._Weakrefable): - """ - Metadata information about files written as - part of a dataset write operation - - Parameters - ---------- - path : str - Path to the file. - metadata : pyarrow.parquet.FileMetaData, optional - For Parquet files, the Parquet file metadata. - size : int - The size of the file in bytes. - """ - def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ... - -def _filesystemdataset_write( - data: Scanner, - base_dir: StrPath, - basename_template: str, - filesystem: SupportedFileSystem, - partitioning: Partitioning, - file_options: FileWriteOptions, - max_partitions: int, - file_visitor: Callable[[str], None], - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], - max_open_files: int, - max_rows_per_file: int, - min_rows_per_group: int, - max_rows_per_group: int, - create_dir: bool, -): ... - -class _ScanNodeOptions(ExecNodeOptions): - def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... - -class ScanNodeOptions(_ScanNodeOptions): - """ - A Source node which yields batches from a Dataset scan. - - This is the option class for the "scan" node factory. - - This node is capable of applying pushdown projections or filters - to the file readers which reduce the amount of data that needs to - be read (if supported by the file format). But note that this does not - construct associated filter or project nodes to perform the final - filtering or projection. Rather, you may supply the same filter - expression or projection to the scan node that you also supply - to the filter or project node. - - Yielded batches will be augmented with fragment/batch indices when - implicit_ordering=True to enable stable ordering for simple ExecPlans. - - Parameters - ---------- - dataset : pyarrow.dataset.Dataset - The table which acts as the data source. - **kwargs : dict, optional - Scan options. See `Scanner.from_dataset` for possible arguments. - require_sequenced_output : bool, default False - Batches are yielded sequentially, like single-threaded - implicit_ordering : bool, default False - Preserve implicit ordering of data. - """ - - def __init__( - self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs - ) -> None: ... 
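For orientation, a minimal usage sketch of the dataset surface typed above (Hive partitioning, dataset discovery, and a Scanner with a projection and a pushed-down filter). The directory layout and the "value" column are hypothetical; pyarrow built with dataset support is assumed.

import pyarrow as pa
import pyarrow.dataset as ds

# Hypothetical layout: data/year=2009/month=11/part-0.parquet
partitioning = ds.HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]))
dataset = ds.dataset("data/", format="parquet", partitioning=partitioning)

scanner = ds.Scanner.from_dataset(
    dataset,
    columns=["value", "year"],          # project only what is needed
    filter=ds.field("year") == 2009,    # pushed down to partition info where possible
    batch_size=64_000,
)
table = scanner.to_table()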
diff --git a/pyarrow-stubs/_dataset_orc.pyi b/pyarrow-stubs/_dataset_orc.pyi deleted file mode 100644 index 9c4ac04198f..00000000000 --- a/pyarrow-stubs/_dataset_orc.pyi +++ /dev/null @@ -1,6 +0,0 @@ -from ._dataset import FileFormat - -class OrcFileFormat(FileFormat): - def equals(self, other: OrcFileFormat) -> bool: ... - @property - def default_extname(self): ... diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi deleted file mode 100644 index cbcc17235f1..00000000000 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ /dev/null @@ -1,314 +0,0 @@ -from dataclasses import dataclass -from typing import IO, Any, Iterable, TypedDict - -from _typeshed import StrPath - -from ._compute import Expression -from ._dataset import ( - DatasetFactory, - FileFormat, - FileFragment, - FileWriteOptions, - Fragment, - FragmentScanOptions, - Partitioning, - PartitioningFactory, -) -from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import SupportedFileSystem -from ._parquet import FileDecryptionProperties, FileMetaData -from .lib import CacheOptions, Schema, _Weakrefable - -parquet_encryption_enabled: bool - -class ParquetFileFormat(FileFormat): - """ - FileFormat for Parquet - - Parameters - ---------- - read_options : ParquetReadOptions - Read options for the file. - default_fragment_scan_options : ParquetFragmentScanOptions - Scan Options for the file. - **kwargs : dict - Additional options for read option or scan option - """ - def __init__( - self, - read_options: ParquetReadOptions | None = None, - default_fragment_scan_options: ParquetFragmentScanOptions | None = None, - **kwargs, - ) -> None: ... - @property - def read_options(self) -> ParquetReadOptions: ... - def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] - def equals(self, other: ParquetFileFormat) -> bool: ... - @property - def default_extname(self) -> str: ... - def make_fragment( - self, - file: StrPath | IO, - filesystem: SupportedFileSystem | None = None, - partition_expression: Expression | None = None, - row_groups: Iterable[int] | None = None, - *, - file_size: int | None = None, - ) -> Fragment: - """ - Make a FileFragment from a given file. - - Parameters - ---------- - file : file-like object, path-like or str - The file or file path to make a fragment from. - filesystem : Filesystem, optional - If `filesystem` is given, `file` must be a string and specifies - the path of the file to read from the filesystem. - partition_expression : Expression, optional - An expression that is guaranteed true for all rows in the fragment. Allows - fragment to be potentially skipped while scanning with a filter. - row_groups : Iterable, optional - The indices of the row groups to include - file_size : int, optional - The size of the file in bytes. Can improve performance with high-latency filesystems - when file size needs to be known before reading. - - Returns - ------- - fragment : Fragment - The file fragment - """ - -class _NameStats(TypedDict): - min: Any - max: Any - -class RowGroupInfo: - """ - A wrapper class for RowGroup information - - Parameters - ---------- - id : integer - The group ID. - metadata : FileMetaData - The rowgroup metadata. - schema : Schema - Schema of the rows. - """ - - id: int - metadata: FileMetaData - schema: Schema - - def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... - @property - def num_rows(self) -> int: ... - @property - def total_byte_size(self) -> int: ... 
- @property
- def statistics(self) -> dict[str, _NameStats]: ...
-
-class ParquetFileFragment(FileFragment):
- """A Fragment representing a parquet file."""
-
- def ensure_complete_metadata(self) -> None: ...
- @property
- def row_groups(self) -> list[RowGroupInfo]: ...
- @property
- def metadata(self) -> FileMetaData: ...
- @property
- def num_row_groups(self) -> int:
- """
- Return the number of row groups viewed by this fragment (not the
- number of row groups in the origin file).
- """
- def split_by_row_group(
- self, filter: Expression | None = None, schema: Schema | None = None
- ) -> list[Fragment]:
- """
- Split the fragment into multiple fragments.
-
- Yield a Fragment wrapping each row group in this ParquetFileFragment.
- Row groups will be excluded whose metadata contradicts the optional
- filter.
-
- Parameters
- ----------
- filter : Expression, default None
- Only include the row groups which satisfy this predicate (using
- the Parquet RowGroup statistics).
- schema : Schema, default None
- Schema to use when filtering row groups. Defaults to the
- Fragment's physical schema
-
- Returns
- -------
- A list of Fragments
- """
- def subset(
- self,
- filter: Expression | None = None,
- schema: Schema | None = None,
- row_group_ids: list[int] | None = None,
- ) -> ParquetFileFragment:
- """
- Create a subset of the fragment (viewing a subset of the row groups).
-
- Subset can be specified by either a filter predicate (with optional
- schema) or by a list of row group IDs. Note that when using a filter,
- the resulting fragment can be empty (viewing no row groups).
-
- Parameters
- ----------
- filter : Expression, default None
- Only include the row groups which satisfy this predicate (using
- the Parquet RowGroup statistics).
- schema : Schema, default None
- Schema to use when filtering row groups. Defaults to the
- Fragment's physical schema
- row_group_ids : list of ints
- The row group IDs to include in the subset. Can only be specified
- if `filter` is None.
-
- Returns
- -------
- ParquetFileFragment
- """
-
-class ParquetReadOptions(_Weakrefable):
- """
- Parquet format specific options for reading.
-
- Parameters
- ----------
- dictionary_columns : list of string, default None
- Names of columns which should be dictionary encoded as
- they are read
- coerce_int96_timestamp_unit : str, default None
- Cast timestamps that are stored in INT96 format to a particular
- resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
- and therefore INT96 timestamps will be inferred as timestamps
- in nanoseconds
- """
- def __init__(
- self, dictionary_columns: list[str] | None = None, coerce_int96_timestamp_unit: str | None = None
- ) -> None: ...
- @property
- def coerce_int96_timestamp_unit(self) -> str: ...
- @coerce_int96_timestamp_unit.setter
- def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
- def equals(self, other: ParquetReadOptions) -> bool: ...
-
-class ParquetFileWriteOptions(FileWriteOptions):
- def update(self, **kwargs) -> None: ...
- def _set_properties(self) -> None: ...
- def _set_arrow_properties(self) -> None: ...
- def _set_encryption_config(self) -> None: ...
-
-@dataclass(kw_only=True)
-class ParquetFragmentScanOptions(FragmentScanOptions):
- """
- Scan-specific options for Parquet fragments.
-
- Parameters
- ----------
- use_buffered_stream : bool, default False
- Read files through buffered input streams rather than loading entire
- row groups at once. This may be enabled to reduce memory overhead.
- Disabled by default.
- buffer_size : int, default 8192 - Size of buffered stream, if enabled. Default is 8KB. - pre_buffer : bool, default True - If enabled, pre-buffer the raw Parquet data instead of issuing one - read per column chunk. This can improve performance on high-latency - filesystems (e.g. S3, GCS) by coalescing and issuing file reads in - parallel using a background I/O thread pool. - Set to False if you want to prioritize minimal memory usage - over maximum speed. - cache_options : pyarrow.CacheOptions, default None - Cache options used when pre_buffer is enabled. The default values should - be good for most use cases. You may want to adjust these for example if - you have exceptionally high latency to the file system. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None - If not None, use the provided ParquetDecryptionConfig to decrypt the - Parquet file. - decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None - If not None, use the provided FileDecryptionProperties to decrypt encrypted - Parquet file. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. - """ - - use_buffered_stream: bool = False - buffer_size: int = 8192 - pre_buffer: bool = True - cache_options: CacheOptions | None = None - thrift_string_size_limit: int | None = None - thrift_container_size_limit: int | None = None - decryption_config: ParquetDecryptionConfig | None = None - decryption_properties: FileDecryptionProperties | None = None - page_checksum_verification: bool = False - - def equals(self, other: ParquetFragmentScanOptions) -> bool: ... - -@dataclass -class ParquetFactoryOptions(_Weakrefable): - """ - Influences the discovery of parquet dataset. - - Parameters - ---------- - partition_base_dir : str, optional - For the purposes of applying the partitioning, paths will be - stripped of the partition_base_dir. Files not matching the - partition_base_dir prefix will be skipped for partitioning discovery. - The ignored files will still be part of the Dataset, but will not - have partition information. - partitioning : Partitioning, PartitioningFactory, optional - The partitioning scheme applied to fragments, see ``Partitioning``. - validate_column_chunk_paths : bool, default False - Assert that all ColumnChunk paths are consistent. The parquet spec - allows for ColumnChunk data to be stored in multiple files, but - ParquetDatasetFactory supports only a single file with all ColumnChunk - data. If this flag is set construction of a ParquetDatasetFactory will - raise an error if ColumnChunk data is not resident in a single file. - """ - - partition_base_dir: str | None = None - partitioning: Partitioning | PartitioningFactory | None = None - validate_column_chunk_paths: bool = False - -class ParquetDatasetFactory(DatasetFactory): - """ - Create a ParquetDatasetFactory from a Parquet `_metadata` file. - - Parameters - ---------- - metadata_path : str - Path to the `_metadata` parquet metadata-only file generated with - `pyarrow.parquet.write_metadata`. 
- filesystem : pyarrow.fs.FileSystem - Filesystem to read the metadata_path from, and subsequent parquet - files. - format : ParquetFileFormat - Parquet format options. - options : ParquetFactoryOptions, optional - Various flags influencing the discovery of filesystem paths. - """ - def __init__( - self, - metadata_path: str, - filesystem: SupportedFileSystem, - format: FileFormat, - options: ParquetFactoryOptions | None = None, - ) -> None: ... diff --git a/pyarrow-stubs/_dataset_parquet_encryption.pyi b/pyarrow-stubs/_dataset_parquet_encryption.pyi deleted file mode 100644 index 7623275b865..00000000000 --- a/pyarrow-stubs/_dataset_parquet_encryption.pyi +++ /dev/null @@ -1,85 +0,0 @@ -from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions -from ._parquet import FileDecryptionProperties -from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig -from .lib import _Weakrefable - -class ParquetEncryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level encryption - within the Parquet framework. - - The ParquetEncryptionConfig class serves as a bridge for passing encryption-related - parameters to the appropriate components within the Parquet library. It maintains references - to objects that define the encryption strategy, Key Management Service (KMS) configuration, - and specific encryption configurations for Parquet data. - - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for - creating cryptographic components, such as encryptors and decryptors. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration - parameters necessary for connecting to a Key Management Service (KMS). - encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration - Shared pointer to an `EncryptionConfiguration` object. This object defines specific - encryption settings for Parquet data, including the keys assigned to different columns. - - Raises - ------ - ValueError - Raised if `encryption_config` is None. - """ - def __init__( - self, - crypto_factory: CryptoFactory, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> None: ... - -class ParquetDecryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level decryption - within the Parquet framework. - - ParquetDecryptionConfig is designed to pass decryption-related parameters to - the appropriate decryption components within the Parquet library. It holds references to - objects that define the decryption strategy, Key Management Service (KMS) configuration, - and specific decryption configurations for reading encrypted Parquet data. - - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic - components for the decryption process. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary - for connecting to a Key Management Service (KMS) during decryption. 
- decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration - Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings - for reading encrypted Parquet data. - - Raises - ------ - ValueError - Raised if `decryption_config` is None. - """ - def __init__( - self, - crypto_factory: CryptoFactory, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> None: ... - -def set_encryption_config( - opts: ParquetFileWriteOptions, - config: ParquetEncryptionConfig, -) -> None: ... -def set_decryption_properties( - opts: ParquetFragmentScanOptions, - config: FileDecryptionProperties, -): ... -def set_decryption_config( - opts: ParquetFragmentScanOptions, - config: ParquetDecryptionConfig, -): ... diff --git a/pyarrow-stubs/_feather.pyi b/pyarrow-stubs/_feather.pyi deleted file mode 100644 index 8bb914ba45d..00000000000 --- a/pyarrow-stubs/_feather.pyi +++ /dev/null @@ -1,29 +0,0 @@ -from typing import IO - -from _typeshed import StrPath - -from .lib import Buffer, NativeFile, Table, _Weakrefable - -class FeatherError(Exception): ... - -def write_feather( - table: Table, - dest: StrPath | IO | NativeFile, - compression: str | None = None, - compression_level: int | None = None, - chunksize: int | None = None, - version: int = 2, -): ... - -class FeatherReader(_Weakrefable): - def __init__( - self, - source: StrPath | IO | NativeFile | Buffer, - use_memory_map: bool, - use_threads: bool, - ) -> None: ... - @property - def version(self) -> str: ... - def read(self) -> Table: ... - def read_indices(self, indices: list[int]) -> Table: ... - def read_names(self, names: list[str]) -> Table: ... diff --git a/pyarrow-stubs/_flight.pyi b/pyarrow-stubs/_flight.pyi deleted file mode 100644 index 4450c42df49..00000000000 --- a/pyarrow-stubs/_flight.pyi +++ /dev/null @@ -1,1380 +0,0 @@ -import asyncio -import enum -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar - -from typing_extensions import deprecated - -from .ipc import _ReadPandasMixin -from .lib import ( - ArrowCancelled, - ArrowException, - ArrowInvalid, - Buffer, - IpcReadOptions, - IpcWriteOptions, - RecordBatch, - RecordBatchReader, - Schema, - Table, - TimestampScalar, - _CRecordBatchWriter, - _Weakrefable, -) - -_T = TypeVar("_T") - -class FlightCallOptions(_Weakrefable): - """RPC-layer options for a Flight call.""" - - def __init__( - self, - timeout: float | None = None, - write_options: IpcWriteOptions | None = None, - headers: list[tuple[str, str]] | None = None, - read_options: IpcReadOptions | None = None, - ) -> None: - """Create call options. - - Parameters - ---------- - timeout : float, None - A timeout for the call, in seconds. None means that the - timeout defaults to an implementation-specific value. - write_options : pyarrow.ipc.IpcWriteOptions, optional - IPC write options. The default options can be controlled - by environment variables (see pyarrow.ipc). - headers : List[Tuple[str, str]], optional - A list of arbitrary headers as key, value tuples - read_options : pyarrow.ipc.IpcReadOptions, optional - Serialization options for reading IPC format. - """ - -class CertKeyPair(NamedTuple): - """A TLS certificate and key for use in Flight.""" - - cert: str - key: str - -class FlightError(Exception): - """ - The base class for Flight-specific errors. 
- - A server may raise this class or one of its subclasses to provide - a more detailed error to clients. - - Parameters - ---------- - message : str, optional - The error message. - extra_info : bytes, optional - Extra binary error details that were provided by the - server/will be sent to the client. - - Attributes - ---------- - extra_info : bytes - Extra binary error details that were provided by the - server/will be sent to the client. - """ - - extra_info: bytes - -class FlightInternalError(FlightError, ArrowException): - """An error internal to the Flight server occurred.""" - -class FlightTimedOutError(FlightError, ArrowException): - """The Flight RPC call timed out.""" - -class FlightCancelledError(FlightError, ArrowCancelled): - """The operation was cancelled.""" - -class FlightServerError(FlightError, ArrowException): - """A server error occurred.""" - -class FlightUnauthenticatedError(FlightError, ArrowException): - """The client is not authenticated.""" - -class FlightUnauthorizedError(FlightError, ArrowException): - """The client is not authorized to perform the given operation.""" - -class FlightUnavailableError(FlightError, ArrowException): - """The server is not reachable or available.""" - -class FlightWriteSizeExceededError(ArrowInvalid): - """A write operation exceeded the client-configured limit.""" - - limit: int - actual: int - -class Action(_Weakrefable): - """An action executable on a Flight service.""" - - def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: - """Create an action from a type and a buffer. - - Parameters - ---------- - action_type : bytes or str - buf : Buffer or bytes-like object - """ - @property - def type(self) -> str: - """The action type.""" - @property - def body(self) -> Buffer: - """The action body (arguments for the action).""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - -class ActionType(NamedTuple): - """A type of action that is executable on a Flight service.""" - - type: str - description: str - - def make_action(self, buf: Buffer | bytes) -> Action: - """Create an Action with this type. - - Parameters - ---------- - buf : obj - An Arrow buffer or Python bytes or bytes-like object. - """ - -class Result(_Weakrefable): - """A result from executing an Action.""" - def __init__(self, buf: Buffer | bytes) -> None: - """Create a new result. - - Parameters - ---------- - buf : Buffer or bytes-like object - """ - @property - def body(self) -> Buffer: - """Get the Buffer containing the result.""" - def serialize(self) -> bytes: - """Get the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. - - """ - @classmethod - def deserialize(cls, serialized: bytes) -> Self: - """Parse the wire-format representation of this type. - - Useful when interoperating with non-Flight systems (e.g. REST - services) that may want to return Flight types. 
- - """ - -class BasicAuth(_Weakrefable): - """A container for basic auth.""" - def __init__( - self, username: str | bytes | None = None, password: str | bytes | None = None - ) -> None: - """Create a new basic auth object. - - Parameters - ---------- - username : string - password : string - """ - @property - def username(self) -> bytes: ... - @property - def password(self) -> bytes: ... - def serialize(self) -> str: ... - @staticmethod - def deserialize(serialized: str | bytes) -> BasicAuth: ... - -class DescriptorType(enum.Enum): - """ - The type of a FlightDescriptor. - - Attributes - ---------- - - UNKNOWN - An unknown descriptor type. - - PATH - A Flight stream represented by a path. - - CMD - A Flight stream represented by an application-defined command. - - """ - - UNKNOWN = 0 - PATH = 1 - CMD = 2 - -class FlightMethod(enum.Enum): - """The implemented methods in Flight.""" - - INVALID = 0 - HANDSHAKE = 1 - LIST_FLIGHTS = 2 - GET_FLIGHT_INFO = 3 - GET_SCHEMA = 4 - DO_GET = 5 - DO_PUT = 6 - DO_ACTION = 7 - LIST_ACTIONS = 8 - DO_EXCHANGE = 9 - -class FlightDescriptor(_Weakrefable): - """A description of a data stream available from a Flight service.""" - @staticmethod - def for_path(*path: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for a resource path.""" - - @staticmethod - def for_command(command: str | bytes) -> FlightDescriptor: - """Create a FlightDescriptor for an opaque command.""" - @property - def descriptor_type(self) -> DescriptorType: - """Get the type of this descriptor.""" - @property - def path(self) -> list[bytes] | None: - """Get the path for this descriptor.""" - @property - def command(self) -> bytes | None: - """Get the command for this descriptor.""" - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class Ticket(_Weakrefable): - """A ticket for requesting a Flight stream.""" - def __init__(self, ticket: str | bytes) -> None: ... - @property - def ticket(self) -> bytes: ... - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class Location(_Weakrefable): - """The location of a Flight service.""" - def __init__(self, uri: str | bytes) -> None: ... - @property - def uri(self) -> bytes: ... - def equals(self, other: Location) -> bool: ... - @staticmethod - def for_grpc_tcp(host: str | bytes, port: int) -> Location: - """Create a Location for a TCP-based gRPC service.""" - @staticmethod - def for_grpc_tls(host: str | bytes, port: int) -> Location: - """Create a Location for a TLS-based gRPC service.""" - @staticmethod - def for_grpc_unix(path: str | bytes) -> Location: - """Create a Location for a domain socket-based gRPC service.""" - -class FlightEndpoint(_Weakrefable): - """A Flight stream, along with the ticket and locations to access it.""" - def __init__( - self, - ticket: Ticket | str | bytes, - locations: list[str | Location], - expiration_time: TimestampScalar | None = ..., - app_metadata: bytes | str = ..., - ): - """Create a FlightEndpoint from a ticket and list of locations. - - Parameters - ---------- - ticket : Ticket or bytes - the ticket needed to access this flight - locations : list of string URIs - locations where this flight is available - expiration_time : TimestampScalar, default None - Expiration time of this stream. If present, clients may assume - they can retry DoGet requests. Otherwise, clients should avoid - retrying DoGet requests. 
- app_metadata : bytes or str, default "" - Application-defined opaque metadata. - - Raises - ------ - ArrowException - If one of the location URIs is not a valid URI. - """ - @property - def ticket(self) -> Ticket: - """Get the ticket in this endpoint.""" - @property - def locations(self) -> list[Location]: - """Get locations where this flight is available.""" - def serialize(self) -> bytes: ... - @property - def expiration_time(self) -> TimestampScalar | None: - """Get the expiration time of this stream. - - If present, clients may assume they can retry DoGet requests. - Otherwise, clients should avoid retrying DoGet requests. - - """ - @property - def app_metadata(self) -> bytes | str: - """Get application-defined opaque metadata.""" - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class SchemaResult(_Weakrefable): - """The serialized schema returned from a GetSchema request.""" - def __init__(self, schema: Schema) -> None: - """Create a SchemaResult from a schema. - - Parameters - ---------- - schema: Schema - the schema of the data in this flight. - """ - @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... - -class FlightInfo(_Weakrefable): - """A description of a Flight stream.""" - def __init__( - self, - schema: Schema, - descriptor: FlightDescriptor, - endpoints: list[FlightEndpoint], - total_records: int = ..., - total_bytes: int = ..., - ordered: bool = ..., - app_metadata: bytes | str = ..., - ) -> None: - """Create a FlightInfo object from a schema, descriptor, and endpoints. - - Parameters - ---------- - schema : Schema - the schema of the data in this flight. - descriptor : FlightDescriptor - the descriptor for this flight. - endpoints : list of FlightEndpoint - a list of endpoints where this flight is available. - total_records : int, default None - the total records in this flight, -1 or None if unknown. - total_bytes : int, default None - the total bytes in this flight, -1 or None if unknown. - ordered : boolean, default False - Whether endpoints are in the same order as the data. - app_metadata : bytes or str, default "" - Application-defined opaque metadata. - """ - @property - def schema(self) -> Schema: - """The schema of the data in this flight.""" - @property - def descriptor(self) -> FlightDescriptor: - """The descriptor of the data in this flight.""" - @property - def endpoints(self) -> list[FlightEndpoint]: - """The endpoints where this flight is available.""" - @property - def total_records(self) -> int: - """The total record count of this flight, or -1 if unknown.""" - @property - def total_bytes(self) -> int: - """The size in bytes of the data in this flight, or -1 if unknown.""" - @property - def ordered(self) -> bool: - """Whether endpoints are in the same order as the data.""" - @property - def app_metadata(self) -> bytes | str: - """ - Application-defined opaque metadata. - - There is no inherent or required relationship between this and the - app_metadata fields in the FlightEndpoints or resulting FlightData - messages. Since this metadata is application-defined, a given - application could define there to be a relationship, but there is - none required by the spec. - - """ - def serialize(self) -> bytes: ... - @classmethod - def deserialize(cls, serialized: bytes) -> Self: ... 
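As a point of reference, a minimal client-side sketch (not part of the stubs) of the DoGet flow that FlightDescriptor, FlightInfo, and FlightEndpoint describe. The server location and the command bytes are hypothetical; a running Flight service is assumed.

import pyarrow.flight as flight

client = flight.FlightClient("grpc://localhost:8815")          # hypothetical server
descriptor = flight.FlightDescriptor.for_command(b"example")   # opaque, service-defined command
info = client.get_flight_info(descriptor)                      # FlightInfo: schema + endpoints

for endpoint in info.endpoints:
    reader = client.do_get(endpoint.ticket)                    # FlightStreamReader
    table = reader.read_all()                                  # materialize the stream as a Table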
- -class FlightStreamChunk(_Weakrefable): - """A RecordBatch with application metadata on the side.""" - @property - def data(self) -> RecordBatch | None: ... - @property - def app_metadata(self) -> Buffer | None: ... - def __iter__(self): ... - -class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): - """A reader for Flight streams.""" - - # Needs to be separate class so the "real" class can subclass the - # pure-Python mixin class - - def __iter__(self) -> Self: ... - def __next__(self) -> FlightStreamChunk: ... - @property - def schema(self) -> Schema: - """Get the schema for this reader.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" - def read_chunk(self) -> FlightStreamChunk: - """Read the next FlightStreamChunk along with any metadata. - - Returns - ------- - chunk : FlightStreamChunk - The next FlightStreamChunk in the stream. - - Raises - ------ - StopIteration - when the stream is finished - """ - def to_reader(self) -> RecordBatchReader: - """Convert this reader into a regular RecordBatchReader. - - This may fail if the schema cannot be read from the remote end. - - Returns - ------- - RecordBatchReader - """ - -class MetadataRecordBatchReader(_MetadataRecordBatchReader): - """The base class for readers for Flight streams. - - See Also - -------- - FlightStreamReader - """ - -class FlightStreamReader(MetadataRecordBatchReader): - """A reader that can also be canceled.""" - def cancel(self) -> None: - """Cancel the read operation.""" - def read_all(self) -> Table: - """Read the entire contents of the stream as a Table.""" - -class MetadataRecordBatchWriter(_CRecordBatchWriter): - """A RecordBatchWriter that also allows writing application metadata. - - This class is a context manager; on exit, close() will be called. - """ - - def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: - """Prepare to write data to this stream with the given schema.""" - def write_metadata(self, buf: Buffer) -> None: - """Write Flight metadata by itself.""" - def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] - """ - Write RecordBatch to stream. - - Parameters - ---------- - batch : RecordBatch - """ - def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: - """ - Write Table to stream in (contiguous) RecordBatch objects. - - Parameters - ---------- - table : Table - max_chunksize : int, default None - Maximum number of rows for RecordBatch chunks. Individual chunks may - be smaller depending on the chunk layout of individual columns. - """ - def close(self) -> None: - """ - Close stream and write end-of-stream 0 marker. - """ - def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: - """Write a RecordBatch along with Flight metadata. - - Parameters - ---------- - batch : RecordBatch - The next RecordBatch in the stream. - buf : Buffer - Application-specific metadata for the batch as defined by - Flight. 
- """ - -class FlightStreamWriter(MetadataRecordBatchWriter): - """A writer that also allows closing the write side of a stream.""" - def done_writing(self) -> None: - """Indicate that the client is done writing, but not done reading.""" - -class FlightMetadataReader(_Weakrefable): - """A reader for Flight metadata messages sent during a DoPut.""" - def read(self) -> Buffer | None: - """Read the next metadata message.""" - -class FlightMetadataWriter(_Weakrefable): - """A sender for Flight metadata messages during a DoPut.""" - def write(self, message: Buffer) -> None: - """Write the next metadata message. - - Parameters - ---------- - message : Buffer - """ - -class AsyncioCall(Generic[_T]): - """State for an async RPC using asyncio.""" - - _future: asyncio.Future[_T] - - def as_awaitable(self) -> asyncio.Future[_T]: ... - def wakeup(self, result_or_exception: BaseException | _T) -> None: ... - -class AsyncioFlightClient: - """ - A FlightClient with an asyncio-based async interface. - - This interface is EXPERIMENTAL. - """ - - def __init__(self, client: FlightClient) -> None: ... - async def get_flight_info( - self, - descriptor: FlightDescriptor, - *, - options: FlightCallOptions | None = None, - ): ... - -class FlightClient(_Weakrefable): - """A client to a Flight service. - - Connect to a Flight service on the given host and port. - - Parameters - ---------- - location : str, tuple or Location - Location to connect to. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. - tls_root_certs : bytes or None - PEM-encoded - cert_chain: bytes or None - Client certificate if using mutual TLS - private_key: bytes or None - Client private key for cert_chain is using mutual TLS - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. - middleware : list optional, default None - A list of ClientMiddlewareFactory instances. - write_size_limit_bytes : int optional, default None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean optional, default False - A flag that indicates that, if the client is connecting - with TLS, that it skips server verification. If this is - enabled, all other TLS settings are overridden. - generic_options : list optional, default None - A list of generic (string, int or string) option tuples passed - to the underlying transport. Effect is implementation - dependent. - """ - def __init__( - self, - location: str | tuple[str, int] | Location, - *, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - middleware: list[ClientMiddlewareFactory] | None = None, - write_size_limit_bytes: int | None = None, - disable_server_verification: bool = False, - generic_options: list[tuple[str, int | str]] | None = None, - ): ... - @property - def supports_async(self) -> bool: ... - def as_async(self) -> AsyncioFlightClient: ... - def wait_for_available(self, timeout: int = 5) -> None: - """Block until the server can be contacted. - - Parameters - ---------- - timeout : int, default 5 - The maximum seconds to wait. - """ - @deprecated( - "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." 
- ) - @classmethod - def connect( - cls, - location: str | tuple[str, int] | Location, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - disable_server_verification: bool = False, - ) -> FlightClient: - """Connect to a Flight server. - - .. deprecated:: 0.15.0 - Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. - """ - def authenticate( - self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None - ) -> None: - """Authenticate to the server. - - Parameters - ---------- - auth_handler : ClientAuthHandler - The authentication mechanism to use. - options : FlightCallOptions - Options for this call. - """ - def authenticate_basic_token( - self, username: str, password: str, options: FlightCallOptions | None = None - ) -> tuple[str, str]: - """Authenticate to the server with HTTP basic authentication. - - Parameters - ---------- - username : string - Username to authenticate with - password : string - Password to authenticate with - options : FlightCallOptions - Options for this call - - Returns - ------- - tuple : Tuple[str, str] - A tuple representing the FlightCallOptions authorization - header entry of a bearer token. - """ - def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: - """List the actions available on a service.""" - def do_action( - self, action: Action, options: FlightCallOptions | None = None - ) -> Iterator[Result]: - """ - Execute an action on a service. - - Parameters - ---------- - action : str, tuple, or Action - Can be action type name (no body), type and body, or any Action - object - options : FlightCallOptions - RPC options - - Returns - ------- - results : iterator of Result values - """ - def list_flights( - self, criteria: str | None = None, options: FlightCallOptions | None = None - ) -> Generator[FlightInfo, None, None]: - """List the flights available on a service.""" - def get_flight_info( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> FlightInfo: - """Request information about an available flight.""" - def get_schema( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> Schema: - """Request schema for an available flight.""" - def do_get( - self, ticket: Ticket, options: FlightCallOptions | None = None - ) -> FlightStreamReader: - """Request the data for a flight. - - Returns - ------- - reader : FlightStreamReader - """ - def do_put( - self, - descriptor: FlightDescriptor, - schema: Schema, - options: FlightCallOptions | None = None, - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Upload data to a flight. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightMetadataReader - """ - def do_exchange( - self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None - ) -> tuple[FlightStreamWriter, FlightStreamReader]: - """Start a bidirectional data exchange with a server. - - Parameters - ---------- - descriptor : FlightDescriptor - A descriptor for the flight. - options : FlightCallOptions - RPC options. - - Returns - ------- - writer : FlightStreamWriter - reader : FlightStreamReader - """ - def close(self) -> None: - """Close the client and disconnect.""" - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, traceback) -> None: ... - -class FlightDataStream(_Weakrefable): - """ - Abstract base class for Flight data streams. 
- - See Also - -------- - RecordBatchStream - GeneratorStream - """ - -class RecordBatchStream(FlightDataStream): - """A Flight data stream backed by RecordBatches. - - The remainder of this DoGet request will be handled in C++, - without having to acquire the GIL. - - """ - def __init__( - self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None - ) -> None: - """Create a RecordBatchStream from a data source. - - Parameters - ---------- - data_source : RecordBatchReader or Table - The data to stream to the client. - options : pyarrow.ipc.IpcWriteOptions, optional - Optional IPC options to control how to write the data. - """ - -class GeneratorStream(FlightDataStream): - """A Flight data stream backed by a Python generator.""" - def __init__( - self, - schema: Schema, - generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], - options: IpcWriteOptions | None = None, - ) -> None: - """Create a GeneratorStream from a Python generator. - - Parameters - ---------- - schema : Schema - The schema for the data to be returned. - - generator : iterator or iterable - The generator should yield other FlightDataStream objects, - Tables, RecordBatches, or RecordBatchReaders. - - options : pyarrow.ipc.IpcWriteOptions, optional - """ - -class ServerCallContext(_Weakrefable): - """Per-call state/context.""" - def peer_identity(self) -> bytes: - """Get the identity of the authenticated peer. - - May be the empty string. - """ - def peer(self) -> str: - """Get the address of the peer.""" - # Set safe=True as gRPC on Windows sometimes gives garbage bytes - def is_cancelled(self) -> bool: - """Check if the current RPC call has been canceled by the client.""" - def add_header(self, key: str, value: str) -> None: - """Add a response header.""" - def add_trailer(self, key: str, value: str) -> None: - """Add a response trailer.""" - def get_middleware(self, key: str) -> ServerMiddleware | None: - """ - Get a middleware instance by key. - - Returns None if the middleware was not found. - """ - -class ServerAuthReader(_Weakrefable): - """A reader for messages from the client during an auth handshake.""" - def read(self) -> str: ... - -class ServerAuthSender(_Weakrefable): - """A writer for messages to the client during an auth handshake.""" - def write(self, message: str) -> None: ... - -class ClientAuthReader(_Weakrefable): - """A reader for messages from the server during an auth handshake.""" - def read(self) -> str: ... - -class ClientAuthSender(_Weakrefable): - """A writer for messages to the server during an auth handshake.""" - def write(self, message: str) -> None: ... - -class ServerAuthHandler(_Weakrefable): - """Authentication middleware for a server. - - To implement an authentication mechanism, subclass this class and - override its methods. - - """ - def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): - """Conduct the handshake with the client. - - May raise an error if the client cannot authenticate. - - Parameters - ---------- - outgoing : ServerAuthSender - A channel to send messages to the client. - incoming : ServerAuthReader - A channel to read messages from the client. - """ - def is_valid(self, token: str) -> bool: - """Validate a client token, returning their identity. - - May return an empty string (if the auth mechanism does not - name the peer) or raise an exception (if the token is - invalid). - - Parameters - ---------- - token : bytes - The authentication token from the client. 
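# Illustrative sketch (not part of the original stubs): a minimal
# ServerAuthHandler that accepts every client, following the handshake
# methods documented above. A real handler would validate credentials and
# return a meaningful identity; pass an instance via
# FlightServerBase(auth_handler=...).
import pyarrow.flight as flight

class NoOpAuthHandler(flight.ServerAuthHandler):
    def authenticate(self, outgoing, incoming):
        # Nothing to exchange: accept the connection as-is.
        pass

    def is_valid(self, token):
        # Return the peer identity; an empty string means "anonymous".
        return ""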
- - """ - -class ClientAuthHandler(_Weakrefable): - """Authentication plugin for a client.""" - def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): - """Conduct the handshake with the server. - - Parameters - ---------- - outgoing : ClientAuthSender - A channel to send messages to the server. - incoming : ClientAuthReader - A channel to read messages from the server. - """ - def get_token(self) -> str: - """Get the auth token for a call.""" - -class CallInfo(NamedTuple): - """Information about a particular RPC for Flight middleware.""" - - method: FlightMethod - -class ClientMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - client are accessible from the middleware itself. - - """ - def start_call(self, info: CallInfo) -> ClientMiddleware | None: - """Called at the start of an RPC. - - This must be thread-safe and must not raise exceptions. - - Parameters - ---------- - info : CallInfo - Information about the call. - - Returns - ------- - instance : ClientMiddleware - An instance of ClientMiddleware (the instance to use for - the call), or None if this call is not intercepted. - - """ - -class ClientMiddleware(_Weakrefable): - """Client-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - - """ - - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. - - Returns - ------- - headers : dict - A dictionary of header values to add to the request, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. - - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - - def received_headers(self, headers: dict[str, list[str] | list[bytes]]): - """A callback when headers are received. - - The default implementation does nothing. - - Parameters - ---------- - headers : dict - A dictionary of headers from the server. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). - - """ - - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - - The default implementation does nothing. - - Parameters - ---------- - exception : ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ - -class ServerMiddlewareFactory(_Weakrefable): - """A factory for new middleware instances. - - All middleware methods will be called from the same thread as the - RPC method implementation. That is, thread-locals set in the - middleware are accessible from the method itself. - - """ - - def start_call( - self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> ServerMiddleware | None: - """Called at the start of an RPC. - - This must be thread-safe. - - Parameters - ---------- - info : CallInfo - Information about the call. - headers : dict - A dictionary of headers from the client. Keys are strings - and values are lists of strings (for text headers) or - bytes (for binary headers). 
- - Returns - ------- - instance : ServerMiddleware - An instance of ServerMiddleware (the instance to use for - the call), or None if this call is not intercepted. - - Raises - ------ - exception : pyarrow.ArrowException - If an exception is raised, the call will be rejected with - the given error. - - """ - -class TracingServerMiddlewareFactory(ServerMiddlewareFactory): - """A factory for tracing middleware instances. - - This enables OpenTelemetry support in Arrow (if Arrow was compiled - with OpenTelemetry support enabled). A new span will be started on - each RPC call. The TracingServerMiddleware instance can then be - retrieved within an RPC handler to get the propagated context, - which can be used to start a new span on the Python side. - - Because the Python/C++ OpenTelemetry libraries do not - interoperate, spans on the C++ side are not directly visible to - the Python side and vice versa. - - """ - -class ServerMiddleware(_Weakrefable): - """Server-side middleware for a call, instantiated per RPC. - - Methods here should be fast and must be infallible: they should - not raise exceptions or stall indefinitely. - - """ - - def sending_headers(self) -> dict[str, list[str] | list[bytes]]: - """A callback before headers are sent. - - Returns - ------- - headers : dict - A dictionary of header values to add to the response, or - None if no headers are to be added. The dictionary should - have string keys and string or list-of-string values. - - Bytes values are allowed, but the underlying transport may - not support them or may restrict them. For gRPC, binary - values are only allowed on headers ending in "-bin". - - Header names must be lowercase ASCII. - - """ - def call_completed(self, exception: ArrowException): - """A callback when the call finishes. - - Parameters - ---------- - exception : pyarrow.ArrowException - If the call errored, this is the equivalent - exception. Will be None if the call succeeded. - - """ - -class TracingServerMiddleware(ServerMiddleware): - trace_context: dict - def __init__(self, trace_context: dict) -> None: ... - -class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): - """Wrapper to bundle server middleware into a single C++ one.""" - - def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... - def start_call( # type: ignore[override] - self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] - ) -> _ServerMiddlewareFactoryWrapper | None: ... - -class _ServerMiddlewareWrapper(ServerMiddleware): - def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... - def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... - def call_completed(self, exception: ArrowException) -> None: ... - -class _FlightServerFinalizer(_Weakrefable): - """ - A finalizer that shuts down the server on destruction. - - See ARROW-16597. If the server is still active at interpreter - exit, the process may segfault. - """ - - def finalize(self) -> None: ... - -class FlightServerBase(_Weakrefable): - """A Flight service definition. - - To start the server, create an instance of this class with an - appropriate location. The server will be running as soon as the - instance is created; it is not required to call :meth:`serve`. - - Override methods to define your Flight service. - - Parameters - ---------- - location : str, tuple or Location optional, default None - Location to serve on. Either a gRPC URI like `grpc://localhost:port`, - a tuple of (host, port) pair, or a Location instance. 
- If None is passed then the server will be started on localhost with a - system provided random port. - auth_handler : ServerAuthHandler optional, default None - An authentication mechanism to use. May be None. - tls_certificates : list optional, default None - A list of (certificate, key) pairs. - verify_client : boolean optional, default False - If True, then enable mutual TLS: require the client to present - a client certificate, and validate the certificate. - root_certificates : bytes optional, default None - If enabling mutual TLS, this specifies the PEM-encoded root - certificate used to validate client certificates. - middleware : dict optional, default None - A dictionary of :class:`ServerMiddlewareFactory` instances. The - string keys can be used to retrieve the middleware instance within - RPC handlers (see :meth:`ServerCallContext.get_middleware`). - - """ - def __init__( - self, - location: str | tuple[str, int] | Location | None = None, - auth_handler: ServerAuthHandler | None = None, - tls_certificates: list[tuple[str, str]] | None = None, - verify_client: bool = False, - root_certificates: str | None = None, - middleware: dict[str, ServerMiddlewareFactory] | None = None, - ): ... - @property - def port(self) -> int: - """ - Get the port that this server is listening on. - - Returns a non-positive value if the operation is invalid - (e.g. init() was not called or server is listening on a domain - socket). - """ - def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: - """List flights available on this service. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - criteria : bytes - Filter criteria provided by the client. - - Returns - ------- - iterator of FlightInfo - - """ - def get_flight_info( - self, context: ServerCallContext, descriptor: FlightDescriptor - ) -> FlightInfo: - """Get information about a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - - Returns - ------- - FlightInfo - - """ - def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: - """Get the schema of a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - - Returns - ------- - Schema - - """ - def do_put( - self, - context: ServerCallContext, - descriptor: FlightDescriptor, - reader: MetadataRecordBatchReader, - writer: FlightMetadataWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : FlightMetadataWriter - A writer to send responses to the client. 
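# Illustrative sketch (not part of the original stubs): a minimal
# FlightServerBase subclass that stores tables uploaded via do_put and serves
# them back via do_get as a RecordBatchStream. Error handling, non-path
# descriptors, TLS, and authentication are deliberately omitted; the location
# string is an assumption for the example.
import pyarrow.flight as flight

class InMemoryFlightServer(flight.FlightServerBase):
    def __init__(self, location="grpc://0.0.0.0:8815"):
        super().__init__(location)
        self._tables = {}

    def do_put(self, context, descriptor, reader, writer):
        # Read the whole upload into memory, keyed by the descriptor path.
        self._tables[descriptor.path[0]] = reader.read_all()

    def do_get(self, context, ticket):
        # The ticket body is assumed to carry the same key used in do_put.
        return flight.RecordBatchStream(self._tables[ticket.ticket])

# InMemoryFlightServer().serve()  # blocks until shutdown() is called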
- - """ - def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - ticket : Ticket - The ticket for the flight. - - Returns - ------- - FlightDataStream - A stream of data to send back to the client. - - """ - def do_exchange( - self, - context: ServerCallContext, - descriptor: FlightDescriptor, - reader: MetadataRecordBatchReader, - writer: MetadataRecordBatchWriter, - ) -> None: - """Write data to a flight. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - descriptor : FlightDescriptor - The descriptor for the flight provided by the client. - reader : MetadataRecordBatchReader - A reader for data uploaded by the client. - writer : MetadataRecordBatchWriter - A writer to send responses to the client. - - """ - def list_actions(self, context: ServerCallContext) -> Iterable[Action]: - """List custom actions available on this server. - - Applications should override this method to implement their - own behavior. The default method raises a NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - - Returns - ------- - iterator of ActionType or tuple - - """ - def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: - """Execute a custom action. - - This method should return an iterator, or it should be a - generator. Applications should override this method to - implement their own behavior. The default method raises a - NotImplementedError. - - Parameters - ---------- - context : ServerCallContext - Common contextual information. - action : Action - The action to execute. - - Returns - ------- - iterator of bytes - - """ - def serve(self) -> None: - """Block until the server shuts down. - - This method only returns if shutdown() is called or a signal is - received. - """ - def run(self) -> None: - """Block until the server shuts down. - - .. deprecated:: 0.15.0 - Use the ``FlightServer.serve`` method instead - """ - def shutdown(self) -> None: - """Shut down the server, blocking until current requests finish. - - Do not call this directly from the implementation of a Flight - method, as then the server will block forever waiting for that - request to finish. Instead, call this method from a background - thread. - - This method should only be called once. - """ - def wait(self) -> None: - """Block until server is terminated with shutdown.""" - def __enter__(self) -> Self: ... - def __exit__(self, exc_type, exc_value, traceback): ... - -def connect( - location: str | tuple[str, int] | Location, - *, - tls_root_certs: str | None = None, - cert_chain: str | None = None, - private_key: str | None = None, - override_hostname: str | None = None, - middleware: list[ClientMiddlewareFactory] | None = None, - write_size_limit_bytes: int | None = None, - disable_server_verification: bool = False, - generic_options: list[tuple[str, int | str]] | None = None, -) -> FlightClient: - """ - Connect to a Flight server. - - Parameters - ---------- - location : str, tuple, or Location - Location to connect to. 
Either a URI like "grpc://localhost:port", - a tuple of (host, port), or a Location instance. - tls_root_certs : bytes or None - PEM-encoded. - cert_chain: str or None - If provided, enables TLS mutual authentication. - private_key: str or None - If provided, enables TLS mutual authentication. - override_hostname : str or None - Override the hostname checked by TLS. Insecure, use with caution. - middleware : list or None - A list of ClientMiddlewareFactory instances to apply. - write_size_limit_bytes : int or None - A soft limit on the size of a data payload sent to the - server. Enabled if positive. If enabled, writing a record - batch that (when serialized) exceeds this limit will raise an - exception; the client can retry the write with a smaller - batch. - disable_server_verification : boolean or None - Disable verifying the server when using TLS. - Insecure, use with caution. - generic_options : list or None - A list of generic (string, int or string) options to pass to - the underlying transport. - - Returns - ------- - client : FlightClient - """ diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi deleted file mode 100644 index 7670ef5230d..00000000000 --- a/pyarrow-stubs/_fs.pyi +++ /dev/null @@ -1,1005 +0,0 @@ -import datetime as dt -import enum -import sys - -from abc import ABC, abstractmethod - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from typing import Union, overload - -from fsspec import AbstractFileSystem # type: ignore[import-untyped] - -from .lib import NativeFile, _Weakrefable - -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - -class FileType(enum.IntFlag): - NotFound = enum.auto() - Unknown = enum.auto() - File = enum.auto() - Directory = enum.auto() - -class FileInfo(_Weakrefable): - """ - FileSystem entry info. - - Parameters - ---------- - path : str - The full path to the filesystem entry. - type : FileType - The type of the filesystem entry. - mtime : datetime or float, default None - If given, the modification time of the filesystem entry. - If a float is given, it is the number of seconds since the - Unix epoch. - mtime_ns : int, default None - If given, the modification time of the filesystem entry, - in nanoseconds since the Unix epoch. - `mtime` and `mtime_ns` are mutually exclusive. - size : int, default None - If given, the filesystem entry size in bytes. This should only - be given if `type` is `FileType.File`. - - Examples - -------- - Generate a file: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> path_fs = local_path + "/pyarrow-fs-example.dat" - >>> with local.open_output_stream(path_fs) as stream: - ... 
stream.write(b"data") - 4 - - Get FileInfo object using ``get_file_info()``: - - >>> file_info = local.get_file_info(path_fs) - >>> file_info - - - Inspect FileInfo attributes: - - >>> file_info.type - - - >>> file_info.is_file - True - - >>> file_info.path - '/.../pyarrow-fs-example.dat' - - >>> file_info.base_name - 'pyarrow-fs-example.dat' - - >>> file_info.size - 4 - - >>> file_info.extension - 'dat' - - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ - - def __init__( - self, - path: str, - type: FileType = FileType.Unknown, - *, - mtime: dt.datetime | float | None = None, - mtime_ns: int | None = None, - size: int | None = None, - ): ... - @property - def type(self) -> FileType: - """ - Type of the file. - - The returned enum values can be the following: - - - FileType.NotFound: target does not exist - - FileType.Unknown: target exists but its type is unknown (could be a - special file such as a Unix socket or character device, or - Windows NUL / CON / ...) - - FileType.File: target is a regular file - - FileType.Directory: target is a regular directory - - Returns - ------- - type : FileType - """ - @property - def is_file(self) -> bool: ... - @property - def path(self) -> str: - """ - The full file path in the filesystem. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.path - '/.../pyarrow-fs-example.dat' - """ - @property - def base_name(self) -> str: - """ - The file base name. - - Component after the last directory separator. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.base_name - 'pyarrow-fs-example.dat' - """ - @property - def size(self) -> int: - """ - The size in bytes, if available. - - Only regular files are guaranteed to have a size. - - Returns - ------- - size : int or None - """ - @property - def extension(self) -> str: - """ - The file extension. - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.extension - 'dat' - """ - @property - def mtime(self) -> dt.datetime | None: - """ - The time of last modification, if available. - - Returns - ------- - mtime : datetime.datetime or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime # doctest: +SKIP - datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) - """ - @property - def mtime_ns(self) -> int | None: - """ - The time of last modification, if available, expressed in nanoseconds - since the Unix epoch. - - Returns - ------- - mtime_ns : int or None - - Examples - -------- - >>> file_info = local.get_file_info(path) - >>> file_info.mtime_ns # doctest: +SKIP - 1656489370873922073 - """ - -class FileSelector(_Weakrefable): - """ - File and directory selector. - - It contains a set of options that describes how to search for files and - directories. - - Parameters - ---------- - base_dir : str - The directory in which to select files. Relative paths also work, use - '.' for the current directory and '..' for the parent. - allow_not_found : bool, default False - The behavior if `base_dir` doesn't exist in the filesystem. - If false, an error is returned. - If true, an empty selection is returned. - recursive : bool, default False - Whether to recurse into subdirectories. 
- - Examples - -------- - List the contents of a directory and subdirectories: - - >>> selector_1 = fs.FileSelector(local_path, recursive=True) - >>> local.get_file_info(selector_1) # doctest: +SKIP - [, - , - ] - - List only the contents of the base directory: - - >>> selector_2 = fs.FileSelector(local_path) - >>> local.get_file_info(selector_2) # doctest: +SKIP - [, - ] - - Return empty selection if the directory doesn't exist: - - >>> selector_not_found = fs.FileSelector( - ... local_path + "/missing", recursive=True, allow_not_found=True - ... ) - >>> local.get_file_info(selector_not_found) - [] - """ - - base_dir: str - allow_not_found: bool - recursive: bool - def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... - -class FileSystem(_Weakrefable): - """ - Abstract file system API. - """ - - @classmethod - def from_uri(cls, uri: str) -> tuple[Self, str]: - """ - Create a new FileSystem from URI or Path. - - Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". - In addition, the argument can be a pathlib.Path object, or a string - describing an absolute local path. - - Parameters - ---------- - uri : string - URI-based path, for example: file:///some/local/path. - - Returns - ------- - tuple of (FileSystem, str path) - With (filesystem, path) tuple where path is the abstract path - inside the FileSystem instance. - - Examples - -------- - Create a new FileSystem subclass from a URI: - - >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) - >>> local_new, path_new = fs.FileSystem.from_uri(uri) - >>> local_new - >> path_new - '/.../pyarrow-fs-example.dat' - - Or from a s3 bucket: - - >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") - (, 'usgs-landsat/collection02') - """ - def equals(self, other: FileSystem) -> bool: - """ - Parameters - ---------- - other : pyarrow.fs.FileSystem - - Returns - ------- - bool - """ - @property - def type_name(self) -> str: - """ - The filesystem's type name. - """ - @overload - def get_file_info(self, paths_or_selector: str) -> FileInfo: ... - @overload - def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... - def get_file_info(self, paths_or_selector): - """ - Get info for the given files. - - Any symlink is automatically dereferenced, recursively. A non-existing - or unreachable file returns a FileStat object and has a FileType of - value NotFound. An exception indicates a truly exceptional condition - (low-level I/O error, etc.). - - Parameters - ---------- - paths_or_selector : FileSelector, path-like or list of path-likes - Either a selector object, a path-like object or a list of - path-like objects. The selector's base directory will not be - part of the results, even if it exists. If it doesn't exist, - use `allow_not_found`. - - Returns - ------- - FileInfo or list of FileInfo - Single FileInfo object is returned for a single path, otherwise - a list of FileInfo objects is returned. - - Examples - -------- - >>> local - - >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) - - """ - def create_dir(self, path: str, *, recursive: bool = True) -> None: - """ - Create a directory and subdirectories. - - This function succeeds if the directory already exists. - - Parameters - ---------- - path : str - The path of the new directory. - recursive : bool, default True - Create nested directories as well. 
- """ - def delete_dir(self, path: str) -> None: - """ - Delete a directory and its contents, recursively. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - """ - def delete_dir_contents( - self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False - ) -> None: - """ - Delete a directory's contents, recursively. - - Like delete_dir, but doesn't delete the directory itself. - - Parameters - ---------- - path : str - The path of the directory to be deleted. - accept_root_dir : boolean, default False - Allow deleting the root directory's contents - (if path is empty or "/") - missing_dir_ok : boolean, default False - If False then an error is raised if path does - not exist - """ - def move(self, src: str, dest: str) -> None: - """ - Move / rename a file or directory. - - If the destination exists: - - if it is a non-empty directory, an error is returned - - otherwise, if it has the same type as the source, it is replaced - - otherwise, behavior is unspecified (implementation-dependent). - - Parameters - ---------- - src : str - The path of the file or the directory to be moved. - dest : str - The destination path where the file or directory is moved to. - - Examples - -------- - Create a new folder with a file: - - >>> local.create_dir("/tmp/other_dir") - >>> local.copy_file(path, "/tmp/move_example.dat") - - Move the file: - - >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") - - Inspect the file info: - - >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") - - >>> local.get_file_info("/tmp/move_example.dat") - - - Delete the folder: - >>> local.delete_dir("/tmp/other_dir") - """ - def copy_file(self, src: str, dest: str) -> None: - """ - Copy a file. - - If the destination exists and is a directory, an error is returned. - Otherwise, it is replaced. - - Parameters - ---------- - src : str - The path of the file to be copied from. - dest : str - The destination path where the file is copied to. - - Examples - -------- - >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") - - Inspect the file info: - - >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") - - >>> local.get_file_info(path) - - """ - def delete_file(self, path: str) -> None: - """ - Delete a file. - - Parameters - ---------- - path : str - The path of the file to be deleted. - """ - def open_input_file(self, path: str) -> NativeFile: - """ - Open an input file for random access reading. - - Parameters - ---------- - path : str - The source to open for reading. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_file()`: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data' - """ - def open_input_stream( - self, path: str, compression: str | None = "detect", buffer_size: int | None = None - ) -> NativeFile: - """ - Open an input stream for sequential reading. - - Parameters - ---------- - path : str - The source to open for reading. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly decompression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary read buffer. 
- - Returns - ------- - stream : NativeFile - - Examples - -------- - Print the data from the file with `open_input_stream()`: - - >>> with local.open_input_stream(path) as f: - ... print(f.readall()) - b'data' - """ - def open_output_stream( - self, - path: str, - compression: str | None = "detect", - buffer_size: int | None = None, - metadata: dict[str, str] | None = None, - ) -> NativeFile: - """ - Open an output stream for sequential writing. - - If the target already exists, existing data is truncated. - - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream(path) as stream: - ... stream.write(b"data") - 4 - """ - def open_append_stream( - self, - path: str, - compression: str | None = "detect", - buffer_size: int | None = None, - metadata: dict[str, str] | None = None, - ): - """ - Open an output stream for appending. - - If the target doesn't exist, a new empty file is created. - - .. note:: - Some filesystem implementations do not support efficient - appending to an existing file, in which case this method will - raise NotImplementedError. - Consider writing to multiple files (using e.g. the dataset layer) - instead. - - Parameters - ---------- - path : str - The source to open for writing. - compression : str optional, default 'detect' - The compression algorithm to use for on-the-fly compression. - If "detect" and source is a file path, then compression will be - chosen based on the file extension. - If None, no compression will be applied. Otherwise, a well-known - algorithm name must be supplied (e.g. "gzip"). - buffer_size : int optional, default None - If None or 0, no buffering will happen. Otherwise the size of the - temporary write buffer. - metadata : dict optional, default None - If not None, a mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - Unsupported metadata keys will be ignored. - - Returns - ------- - stream : NativeFile - - Examples - -------- - Append new data to a FileSystem subclass with nonempty file: - - >>> with local.open_append_stream(path) as f: - ... f.write(b"+newly added") - 12 - - Print out the content to the file: - - >>> with local.open_input_file(path) as f: - ... print(f.readall()) - b'data+newly added' - """ - def normalize_path(self, path: str) -> str: - """ - Normalize filesystem path. - - Parameters - ---------- - path : str - The path to normalize - - Returns - ------- - normalized_path : str - The normalized path - """ - -class LocalFileSystem(FileSystem): - """ - A FileSystem implementation accessing files on the local machine. 
- - Details such as symlinks are abstracted away (symlinks are always followed, - except when deleting an entry). - - Parameters - ---------- - use_mmap : bool, default False - Whether open_input_stream and open_input_file should return - a mmap'ed file or a regular file. - - Examples - -------- - Create a FileSystem object with LocalFileSystem constructor: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> local - - - and write data on to the file: - - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: - ... print(stream.readall()) - b'data' - - Create a FileSystem object inferred from a URI of the saved file: - - >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") - >>> local_new - >> path - '/tmp/local_fs.dat' - - Check if FileSystems `local` and `local_new` are equal: - - >>> local.equals(local_new) - True - - Compare two different FileSystems: - - >>> local2 = fs.LocalFileSystem(use_mmap=True) - >>> local.equals(local2) - False - - Copy a file and print out the data: - - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: - ... print(stream.readall()) - b'data' - - Open an output stream for appending, add text and print the new data: - - >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: - ... f.write(b"+newly added") - 12 - - >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: - ... print(f.readall()) - b'data+newly added' - - Create a directory, copy a file into it and then delete the whole directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.delete_dir("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - - Create a directory, copy a file into it and then delete - the content of the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.delete_dir_contents("/tmp/new_folder") - >>> local.get_file_info("/tmp/new_folder") - - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - - Create a directory, copy a file into it and then delete - the file from the directory: - - >>> local.create_dir("/tmp/new_folder") - >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") - >>> local.delete_file("/tmp/new_folder/local_fs.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs.dat") - - >>> local.get_file_info("/tmp/new_folder") - - - Move the file: - - >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") - >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") - - >>> local.get_file_info("/tmp/local_fs-copy.dat") - - - To finish delete the file left: - >>> local.delete_file("/tmp/local_fs.dat") - """ - - def __init__(self, *, use_mmap: bool = False) -> None: ... - -class SubTreeFileSystem(FileSystem): - """ - Delegates to another implementation after prepending a fixed base path. - - This is useful to expose a logical view of a subtree of a filesystem, - for example a directory in a LocalFileSystem. - - Note, that this makes no security guarantee. For example, symlinks may - allow to "escape" the subtree and access other parts of the underlying - filesystem. 
- - Parameters - ---------- - base_path : str - The root of the subtree. - base_fs : FileSystem - FileSystem object the operations delegated to. - - Examples - -------- - Create a LocalFileSystem instance: - - >>> from pyarrow import fs - >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: - ... stream.write(b"data") - 4 - - Create a directory and a SubTreeFileSystem instance: - - >>> local.create_dir("/tmp/sub_tree") - >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) - - Write data into the existing file: - - >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: - ... f.write(b"+newly added") - 12 - - Print out the attributes: - - >>> subtree.base_fs - - >>> subtree.base_path - '/tmp/sub_tree/' - - Get info for the given directory or given file: - - >>> subtree.get_file_info("") - - >>> subtree.get_file_info("sub_tree_fs.dat") - - - Delete the file and directory: - - >>> subtree.delete_file("sub_tree_fs.dat") - >>> local.delete_dir("/tmp/sub_tree") - >>> local.delete_file("/tmp/local_fs.dat") - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - def __init__(self, base_path: str, base_fs: FileSystem): ... - @property - def base_path(self) -> str: ... - @property - def base_fs(self) -> FileSystem: ... - -class _MockFileSystem(FileSystem): - def __init__(self, current_time: dt.datetime | None = None) -> None: ... - -class PyFileSystem(FileSystem): - """ - A FileSystem with behavior implemented in Python. - - Parameters - ---------- - handler : FileSystemHandler - The handler object implementing custom filesystem behavior. - - Examples - -------- - Create an fsspec-based filesystem object for GitHub: - - >>> from fsspec.implementations import github - >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP - - Get a PyArrow FileSystem object: - - >>> from pyarrow.fs import PyFileSystem, FSSpecHandler - >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP - - Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: - - >>> pa_fs.get_file_info("README.md") # doctest: +SKIP - - """ - def __init__(self, handler: FileSystemHandler) -> None: ... - @property - def handler(self) -> FileSystemHandler: - """ - The filesystem's underlying handler. - - Returns - ------- - handler : FileSystemHandler - """ - -class FileSystemHandler(ABC): - """ - An abstract class exposing methods to implement PyFileSystem's behavior. - """ - @abstractmethod - def get_type_name(self) -> str: - """ - Implement PyFileSystem.type_name. - """ - @abstractmethod - def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(paths). - - Parameters - ---------- - paths : list of str - paths for which we want to retrieve the info. - """ - @abstractmethod - def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: - """ - Implement PyFileSystem.get_file_info(selector). - - Parameters - ---------- - selector : FileSelector - selector for which we want to retrieve the info. - """ - - @abstractmethod - def create_dir(self, path: str, recursive: bool) -> None: - """ - Implement PyFileSystem.create_dir(...). - - Parameters - ---------- - path : str - path of the directory. - recursive : bool - if the parent directories should be created too. - """ - @abstractmethod - def delete_dir(self, path: str) -> None: - """ - Implement PyFileSystem.delete_dir(...). - - Parameters - ---------- - path : str - path of the directory. 
- """ - @abstractmethod - def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: - """ - Implement PyFileSystem.delete_dir_contents(...). - - Parameters - ---------- - path : str - path of the directory. - missing_dir_ok : bool - if False an error should be raised if path does not exist - """ - @abstractmethod - def delete_root_dir_contents(self) -> None: - """ - Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). - """ - @abstractmethod - def delete_file(self, path: str) -> None: - """ - Implement PyFileSystem.delete_file(...). - - Parameters - ---------- - path : str - path of the file. - """ - @abstractmethod - def move(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.move(...). - - Parameters - ---------- - src : str - path of what should be moved. - dest : str - path of where it should be moved to. - """ - - @abstractmethod - def copy_file(self, src: str, dest: str) -> None: - """ - Implement PyFileSystem.copy_file(...). - - Parameters - ---------- - src : str - path of what should be copied. - dest : str - path of where it should be copied to. - """ - @abstractmethod - def open_input_stream(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - """ - @abstractmethod - def open_input_file(self, path: str) -> NativeFile: - """ - Implement PyFileSystem.open_input_file(...). - - Parameters - ---------- - path : str - path of what should be opened. - """ - @abstractmethod - def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_output_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ - - @abstractmethod - def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: - """ - Implement PyFileSystem.open_append_stream(...). - - Parameters - ---------- - path : str - path of what should be opened. - metadata : mapping - Mapping of string keys to string values. - Some filesystems support storing metadata along the file - (such as "Content-Type"). - """ - @abstractmethod - def normalize_path(self, path: str) -> str: - """ - Implement PyFileSystem.normalize_path(...). - - Parameters - ---------- - path : str - path of what should be normalized. - """ diff --git a/pyarrow-stubs/_gcsfs.pyi b/pyarrow-stubs/_gcsfs.pyi deleted file mode 100644 index 4fc7ea68e48..00000000000 --- a/pyarrow-stubs/_gcsfs.pyi +++ /dev/null @@ -1,83 +0,0 @@ -import datetime as dt - -from ._fs import FileSystem -from .lib import KeyValueMetadata - -class GcsFileSystem(FileSystem): - """ - Google Cloud Storage (GCS) backed FileSystem implementation - - By default uses the process described in https://google.aip.dev/auth/4110 - to resolve credentials. If not running on Google Cloud Platform (GCP), - this generally requires the environment variable - GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file - containing credentials. - - Note: GCS buckets are special and the operations available on them may be - limited or more expensive than expected compared to local file systems. - - Note: When pickling a GcsFileSystem that uses default credentials, resolution - credentials are not stored in the serialized data. 
Therefore, when unpickling - it is assumed that the necessary credentials are in place for the target - process. - - Parameters - ---------- - anonymous : boolean, default False - Whether to connect anonymously. - If true, will not attempt to look up credentials using standard GCP - configuration methods. - access_token : str, default None - GCP access token. If provided, temporary credentials will be fetched by - assuming this role; also, a `credential_token_expiration` must be - specified as well. - target_service_account : str, default None - An optional service account to try to impersonate when accessing GCS. This - requires the specified credential user or service account to have the necessary - permissions. - credential_token_expiration : datetime, default None - Expiration for credential generated with an access token. Must be specified - if `access_token` is specified. - default_bucket_location : str, default 'US' - GCP region to create buckets in. - scheme : str, default 'https' - GCS connection transport scheme. - endpoint_override : str, default None - Override endpoint with a connect string such as "localhost:9000" - default_metadata : mapping or pyarrow.KeyValueMetadata, default None - Default metadata for `open_output_stream`. This will be ignored if - non-empty metadata is passed to `open_output_stream`. - retry_time_limit : timedelta, default None - Set the maximum amount of time the GCS client will attempt to retry - transient errors. Subsecond granularity is ignored. - project_id : str, default None - The GCP project identifier to use for creating buckets. - If not set, the library uses the GOOGLE_CLOUD_PROJECT environment - variable. Most I/O operations do not need a project id, only applications - that create new buckets need a project id. - """ - - def __init__( - self, - *, - anonymous: bool = False, - access_token: str | None = None, - target_service_account: str | None = None, - credential_token_expiration: dt.datetime | None = None, - default_bucket_location: str = "US", - scheme: str = "https", - endpoint_override: str | None = None, - default_metadata: dict | KeyValueMetadata | None = None, - retry_time_limit: dt.timedelta | None = None, - project_id: str | None = None, - ): ... - @property - def default_bucket_location(self) -> str: - """ - The GCP location this filesystem will write to. - """ - @property - def project_id(self) -> str: - """ - The GCP project id this filesystem will use. - """ diff --git a/pyarrow-stubs/_hdfs.pyi b/pyarrow-stubs/_hdfs.pyi deleted file mode 100644 index 200f669379b..00000000000 --- a/pyarrow-stubs/_hdfs.pyi +++ /dev/null @@ -1,75 +0,0 @@ -from _typeshed import StrPath - -from ._fs import FileSystem - -class HadoopFileSystem(FileSystem): - """ - HDFS backed FileSystem implementation - - Parameters - ---------- - host : str - HDFS host to connect to. Set to "default" for fs.defaultFS from - core-site.xml. - port : int, default 8020 - HDFS port to connect to. Set to 0 for default or logical (HA) nodes. - user : str, default None - Username when connecting to HDFS; None implies login user. - replication : int, default 3 - Number of copies each block will have. - buffer_size : int, default 0 - If 0, no buffering will happen otherwise the size of the temporary read - and write buffer. - default_block_size : int, default None - None means the default configuration for HDFS, a typical block size is - 128 MB. - kerb_ticket : string or path, default None - If not None, the path to the Kerberos ticket cache. 
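# Illustrative sketch (not part of the original stubs): constructing a
# GcsFileSystem for anonymous access to a public bucket and listing it with a
# FileSelector. The bucket name "gcp-public-data-landsat" is an assumption
# used only for the example.
from pyarrow import fs

gcs = fs.GcsFileSystem(anonymous=True)
infos = gcs.get_file_info(fs.FileSelector("gcp-public-data-landsat", recursive=False))
for info in infos[:3]:
    print(info.path, info.type)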
- extra_conf : dict, default None - Extra key/value pairs for configuration; will override any - hdfs-site.xml properties. - - Examples - -------- - >>> from pyarrow import fs - >>> hdfs = fs.HadoopFileSystem( - ... host, port, user=user, kerb_ticket=ticket_cache_path - ... ) # doctest: +SKIP - - For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. - """ - def __init__( - self, - host: str, - port: int = 8020, - *, - user: str | None = None, - replication: int = 3, - buffer_size: int = 0, - default_block_size: int | None = None, - kerb_ticket: StrPath | None = None, - extra_conf: dict | None = None, - ): ... - @staticmethod - def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] - """ - Instantiate HadoopFileSystem object from an URI string. - - The following two calls are equivalent - - * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ -&replication=1')`` - * ``HadoopFileSystem('localhost', port=8020, user='test', \ -replication=1)`` - - Parameters - ---------- - uri : str - A string URI describing the connection to HDFS. - In order to change the user, replication, buffer_size or - default_block_size pass the values as query parts. - - Returns - ------- - HadoopFileSystem - """ diff --git a/pyarrow-stubs/_json.pyi b/pyarrow-stubs/_json.pyi deleted file mode 100644 index 43d2ae83cd8..00000000000 --- a/pyarrow-stubs/_json.pyi +++ /dev/null @@ -1,169 +0,0 @@ -from typing import IO, Any, Literal - -from _typeshed import StrPath - -from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable - -class ReadOptions(_Weakrefable): - """ - Options for reading JSON files. - - Parameters - ---------- - use_threads : bool, optional (default True) - Whether to use multiple threads to accelerate reading - block_size : int, optional - How much bytes to process at a time from the input stream. - This will determine multi-threading granularity as well as - the size of individual chunks in the Table. - """ - - use_threads: bool - """ - Whether to use multiple threads to accelerate reading. - """ - block_size: int - """ - How much bytes to process at a time from the input stream. - - This will determine multi-threading granularity as well as the size of - individual chunks in the Table. - """ - def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... - def equals(self, other: ReadOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ReadOptions - - Returns - ------- - bool - """ - -class ParseOptions(_Weakrefable): - """ - Options for parsing JSON files. - - Parameters - ---------- - explicit_schema : Schema, optional (default None) - Optional explicit schema (no type inference, ignores other fields). - newlines_in_values : bool, optional (default False) - Whether objects may be printed across multiple lines (for example - pretty printed). If false, input must end with an empty line. - unexpected_field_behavior : str, default "infer" - How JSON fields outside of explicit_schema (if given) are treated. - - Possible behaviors: - - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output - """ - - explicit_schema: Schema - """ - Optional explicit schema (no type inference, ignores other fields) - """ - newlines_in_values: bool - """ - Whether newline characters are allowed in JSON values. - Setting this to True reduces the performance of multi-threaded - JSON reading. 
- """ - unexpected_field_behavior: Literal["ignore", "error", "infer"] - """ - How JSON fields outside of explicit_schema (if given) are treated. - - Possible behaviors: - - - "ignore": unexpected JSON fields are ignored - - "error": error out on unexpected JSON fields - - "infer": unexpected JSON fields are type-inferred and included in - the output - - Set to "infer" by default. - """ - def __init__( - self, - explicit_schema: Schema | None = None, - newlines_in_values: bool | None = None, - unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", - ): ... - def equals(self, other: ParseOptions) -> bool: - """ - Parameters - ---------- - other : pyarrow.json.ParseOptions - - Returns - ------- - bool - """ - -class JSONStreamingReader(RecordBatchReader): - """An object that reads record batches incrementally from a JSON file. - - Should not be instantiated directly by user code. - """ - -def read_json( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - memory_pool: MemoryPool | None = None, -) -> Table: - """ - Read a Table from a stream of JSON data. - - Parameters - ---------- - input_file : str, path or file-like object - The location of JSON data. Currently only the line-delimited JSON - format is supported. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see ReadOptions constructor for defaults). - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see ParseOptions constructor for defaults). - memory_pool : MemoryPool, optional - Pool to allocate Table memory from. - - Returns - ------- - :class:`pyarrow.Table` - Contents of the JSON file as a in-memory table. - """ - -def open_json( - input_file: StrPath | IO[Any], - read_options: ReadOptions | None = None, - parse_options: ParseOptions | None = None, - memory_pool: MemoryPool | None = None, -) -> JSONStreamingReader: - """ - Open a streaming reader of JSON data. - - Reading using this function is always single-threaded. - - Parameters - ---------- - input_file : string, path or file-like object - The location of JSON data. If a string or path, and if it ends - with a recognized compressed file extension (e.g. ".gz" or ".bz2"), - the data is automatically decompressed when reading. - read_options : pyarrow.json.ReadOptions, optional - Options for the JSON reader (see pyarrow.json.ReadOptions constructor - for defaults) - parse_options : pyarrow.json.ParseOptions, optional - Options for the JSON parser - (see pyarrow.json.ParseOptions constructor for defaults) - memory_pool : MemoryPool, optional - Pool to allocate RecordBatch memory from - - Returns - ------- - :class:`pyarrow.json.JSONStreamingReader` - """ diff --git a/pyarrow-stubs/_orc.pyi b/pyarrow-stubs/_orc.pyi deleted file mode 100644 index 71bf0dde9ba..00000000000 --- a/pyarrow-stubs/_orc.pyi +++ /dev/null @@ -1,56 +0,0 @@ -from typing import IO, Literal - -from .lib import ( - Buffer, - KeyValueMetadata, - MemoryPool, - NativeFile, - RecordBatch, - Schema, - Table, - _Weakrefable, -) - -class ORCReader(_Weakrefable): - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... - def metadata(self) -> KeyValueMetadata: ... - def schema(self) -> Schema: ... - def nrows(self) -> int: ... - def nstripes(self) -> int: ... - def file_version(self) -> str: ... - def software_version(self) -> str: ... 
- def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... - def compression_size(self) -> int: ... - def row_index_stride(self) -> int: ... - def writer(self) -> str: ... - def writer_version(self) -> str: ... - def nstripe_statistics(self) -> int: ... - def content_length(self) -> int: ... - def stripe_statistics_length(self) -> int: ... - def file_footer_length(self) -> int: ... - def file_postscript_length(self) -> int: ... - def file_length(self) -> int: ... - def serialized_file_tail(self) -> int: ... - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... - def read(self, columns: list[str] | None = None) -> Table: ... - -class ORCWriter(_Weakrefable): - def open( - self, - where: str | NativeFile | IO, - *, - file_version: str | None = None, - batch_size: int | None = None, - stripe_size: int | None = None, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, - compression_block_size: int | None = None, - compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, - row_index_stride: int | None = None, - padding_tolerance: float | None = None, - dictionary_key_size_threshold: float | None = None, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float | None = None, - ) -> None: ... - def write(self, table: Table) -> None: ... - def close(self) -> None: ... diff --git a/pyarrow-stubs/_parquet.pyi b/pyarrow-stubs/_parquet.pyi deleted file mode 100644 index a9187df0428..00000000000 --- a/pyarrow-stubs/_parquet.pyi +++ /dev/null @@ -1,445 +0,0 @@ -from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict - -from _typeshed import StrPath - -from ._stubs_typing import Order -from .lib import ( - Buffer, - ChunkedArray, - KeyValueMetadata, - MemoryPool, - NativeFile, - RecordBatch, - Schema, - Table, - _Weakrefable, -) - -_PhysicalType: TypeAlias = Literal[ - "BOOLEAN", - "INT32", - "INT64", - "INT96", - "FLOAT", - "DOUBLE", - "BYTE_ARRAY", - "FIXED_LEN_BYTE_ARRAY", - "UNKNOWN", -] -_LogicTypeName: TypeAlias = Literal[ - "UNDEFINED", - "STRING", - "MAP", - "LIST", - "ENUM", - "DECIMAL", - "DATE", - "TIME", - "TIMESTAMP", - "INT", - "FLOAT16", - "JSON", - "BSON", - "UUID", - "NONE", - "UNKNOWN", -] -_ConvertedType: TypeAlias = Literal[ - "NONE", - "UTF8", - "MAP", - "MAP_KEY_VALUE", - "LIST", - "ENUM", - "DECIMAL", - "DATE", - "TIME_MILLIS", - "TIME_MICROS", - "TIMESTAMP_MILLIS", - "TIMESTAMP_MICROS", - "UINT_8", - "UINT_16", - "UINT_32", - "UINT_64", - "INT_8", - "INT_16", - "INT_32", - "INT_64", - "JSON", - "BSON", - "INTERVAL", - "UNKNOWN", -] -_Encoding: TypeAlias = Literal[ - "PLAIN", - "PLAIN_DICTIONARY", - "RLE", - "BIT_PACKED", - "DELTA_BINARY_PACKED", - "DELTA_LENGTH_BYTE_ARRAY", - "DELTA_BYTE_ARRAY", - "RLE_DICTIONARY", - "BYTE_STREAM_SPLIT", - "UNKNOWN", -] -_Compression: TypeAlias = Literal[ - "UNCOMPRESSED", - "SNAPPY", - "GZIP", - "LZO", - "BROTLI", - "LZ4", - "ZSTD", - "UNKNOWN", -] - -class _Statistics(TypedDict): - has_min_max: bool - min: Any | None - max: Any | None - null_count: int | None - distinct_count: int | None - num_values: int - physical_type: _PhysicalType - -class Statistics(_Weakrefable): - def to_dict(self) -> _Statistics: ... - def equals(self, other: Statistics) -> bool: ... - @property - def has_min_max(self) -> bool: ... - @property - def hash_null_count(self) -> bool: ... - @property - def has_distinct_count(self) -> bool: ... - @property - def min_raw(self) -> Any | None: ... 
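# Illustrative sketch (not part of the original stubs): the public pyarrow.orc
# module builds on the internal ORCReader/ORCWriter stubs above. "data.orc"
# and the table contents are assumptions for the example.
import pyarrow as pa
import pyarrow.orc as orc

table = pa.table({"x": [1, 2, 3]})
orc.write_table(table, "data.orc")

orc_file = orc.ORCFile("data.orc")
print(orc_file.nrows, orc_file.nstripes)
print(orc_file.read(columns=["x"]))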
- @property - def max_raw(self) -> Any | None: ... - @property - def min(self) -> Any | None: ... - @property - def max(self) -> Any | None: ... - @property - def null_count(self) -> int | None: ... - @property - def distinct_count(self) -> int | None: ... - @property - def num_values(self) -> int: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def logical_type(self) -> ParquetLogicalType: ... - @property - def converted_type(self) -> _ConvertedType | None: ... - -class ParquetLogicalType(_Weakrefable): - def to_json(self) -> str: ... - @property - def type(self) -> _LogicTypeName: ... - -class _ColumnChunkMetaData(TypedDict): - file_offset: int - file_path: str | None - physical_type: _PhysicalType - num_values: int - path_in_schema: str - is_stats_set: bool - statistics: Statistics | None - compression: _Compression - encodings: tuple[_Encoding, ...] - has_dictionary_page: bool - dictionary_page_offset: int | None - data_page_offset: int - total_compressed_size: int - total_uncompressed_size: int - -class ColumnChunkMetaData(_Weakrefable): - def to_dict(self) -> _ColumnChunkMetaData: ... - def equals(self, other: ColumnChunkMetaData) -> bool: ... - @property - def file_offset(self) -> int: ... - @property - def file_path(self) -> str | None: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def num_values(self) -> int: ... - @property - def path_in_schema(self) -> str: ... - @property - def is_stats_set(self) -> bool: ... - @property - def statistics(self) -> Statistics | None: ... - @property - def compression(self) -> _Compression: ... - @property - def encodings(self) -> tuple[_Encoding, ...]: ... - @property - def has_dictionary_page(self) -> bool: ... - @property - def dictionary_page_offset(self) -> int | None: ... - @property - def data_page_offset(self) -> int: ... - @property - def has_index_page(self) -> bool: ... - @property - def index_page_offset(self) -> int: ... - @property - def total_compressed_size(self) -> int: ... - @property - def total_uncompressed_size(self) -> int: ... - @property - def has_offset_index(self) -> bool: ... - @property - def has_column_index(self) -> bool: ... - @property - def metadata(self) -> dict[bytes, bytes] | None: ... - -class _SortingColumn(TypedDict): - column_index: int - descending: bool - nulls_first: bool - -class SortingColumn: - def __init__( - self, column_index: int, descending: bool = False, nulls_first: bool = False - ) -> None: ... - @classmethod - def from_ordering( - cls, - schema: Schema, - sort_keys: Sequence[tuple[str, Order]], - null_placement: Literal["at_start", "at_end"] = "at_end", - ) -> tuple[SortingColumn, ...]: ... - @staticmethod - def to_ordering( - schema: Schema, sorting_columns: tuple[SortingColumn, ...] - ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... - def __hash__(self) -> int: ... - @property - def column_index(self) -> int: ... - @property - def descending(self) -> bool: ... - @property - def nulls_first(self) -> bool: ... - def to_dict(self) -> _SortingColumn: ... - -class _RowGroupMetaData(TypedDict): - num_columns: int - num_rows: int - total_byte_size: int - columns: list[ColumnChunkMetaData] - sorting_columns: list[SortingColumn] - -class RowGroupMetaData(_Weakrefable): - def __init__(self, parent: FileMetaData, index: int) -> None: ... - def equals(self, other: RowGroupMetaData) -> bool: ... - def column(self, i: int) -> ColumnChunkMetaData: ... - def to_dict(self) -> _RowGroupMetaData: ... 
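SortingColumn above converts between Parquet sort metadata and Arrow-style orderings; an assumed illustration (the schema is hypothetical, and the class is only available in recent pyarrow releases):

    import pyarrow as pa
    from pyarrow.parquet import SortingColumn

    schema = pa.schema([("ts", pa.timestamp("ms")), ("value", pa.float64())])
    cols = SortingColumn.from_ordering(schema, sort_keys=[("ts", "ascending")])
    print(cols)                                      # column_index 0 refers to field "ts"
    print(SortingColumn.to_ordering(schema, cols))   # ([('ts', 'ascending')], 'at_end')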
- @property - def num_columns(self) -> int: ... - @property - def num_rows(self) -> int: ... - @property - def total_byte_size(self) -> int: ... - @property - def sorting_columns(self) -> list[SortingColumn]: ... - -class _FileMetaData(TypedDict): - created_by: str - num_columns: int - num_rows: int - num_row_groups: int - format_version: str - serialized_size: int - -class FileMetaData(_Weakrefable): - def __hash__(self) -> int: ... - def to_dict(self) -> _FileMetaData: ... - def equals(self, other: FileMetaData) -> bool: ... - @property - def schema(self) -> ParquetSchema: ... - @property - def serialized_size(self) -> int: ... - @property - def num_columns(self) -> int: ... - @property - def num_rows(self) -> int: ... - @property - def num_row_groups(self) -> int: ... - @property - def format_version(self) -> str: ... - @property - def created_by(self) -> str: ... - @property - def metadata(self) -> dict[bytes, bytes] | None: ... - def row_group(self, i: int) -> RowGroupMetaData: ... - def set_file_path(self, path: str) -> None: ... - def append_row_groups(self, other: FileMetaData) -> None: ... - def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... - -class ParquetSchema(_Weakrefable): - def __init__(self, container: FileMetaData) -> None: ... - def __getitem__(self, i: int) -> ColumnChunkMetaData: ... - def __hash__(self) -> int: ... - def __len__(self) -> int: ... - @property - def names(self) -> list[str]: ... - def to_arrow_schema(self) -> Schema: ... - def equals(self, other: ParquetSchema) -> bool: ... - def column(self, i: int) -> ColumnSchema: ... - -class ColumnSchema(_Weakrefable): - def __init__(self, schema: ParquetSchema, index: int) -> None: ... - def equals(self, other: ColumnSchema) -> bool: ... - @property - def name(self) -> str: ... - @property - def path(self) -> str: ... - @property - def max_definition_level(self) -> int: ... - @property - def max_repetition_level(self) -> int: ... - @property - def physical_type(self) -> _PhysicalType: ... - @property - def logical_type(self) -> ParquetLogicalType: ... - @property - def converted_type(self) -> _ConvertedType | None: ... - @property - def length(self) -> int | None: ... - @property - def precision(self) -> int | None: ... - @property - def scale(self) -> int | None: ... - -class ParquetReader(_Weakrefable): - def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... - def open( - self, - source: StrPath | NativeFile | IO, - *, - use_memory_map: bool = False, - read_dictionary: Iterable[int] | Iterable[str] | None = None, - metadata: FileMetaData | None = None, - buffer_size: int = 0, - pre_buffer: bool = False, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, - ): ... - @property - def column_paths(self) -> list[str]: ... - @property - def metadata(self) -> FileMetaData: ... - @property - def schema_arrow(self) -> Schema: ... - @property - def num_row_groups(self) -> int: ... - def set_use_threads(self, use_threads: bool) -> None: ... - def set_batch_size(self, batch_size: int) -> None: ... - def iter_batches( - self, - batch_size: int, - row_groups: list[int], - column_indices: list[int] | None = None, - use_threads: bool = True, - ) -> Iterator[RecordBatch]: ... 
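How the metadata classes above (FileMetaData, RowGroupMetaData, ColumnChunkMetaData, Statistics) typically surface through the public pyarrow.parquet API; the file name and data are placeholders:

    import pyarrow as pa
    import pyarrow.parquet as pq

    pq.write_table(pa.table({"x": [1, 2, None]}), "stats.parquet")
    meta = pq.ParquetFile("stats.parquet").metadata       # FileMetaData
    col = meta.row_group(0).column(0)                     # ColumnChunkMetaData
    print(meta.num_rows, col.physical_type)               # 3 INT64
    print(col.statistics.null_count, col.statistics.min)  # 1 1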
- def read_row_group( - self, i: int, column_indices: list[int] | None = None, use_threads: bool = True - ) -> Table: ... - def read_row_groups( - self, - row_groups: list[int], - column_indices: list[int] | None = None, - use_threads: bool = True, - ) -> Table: ... - def read_all( - self, column_indices: list[int] | None = None, use_threads: bool = True - ) -> Table: ... - def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... - def column_name_idx(self, column_name: str) -> int: ... - def read_column(self, column_index: int) -> ChunkedArray: ... - def close(self) -> None: ... - @property - def closed(self) -> bool: ... - -class ParquetWriter(_Weakrefable): - def __init__( - self, - where: StrPath | NativeFile | IO, - schema: Schema, - use_dictionary: bool | list[str] | None = None, - compression: _Compression | dict[str, _Compression] | None = None, - version: str | None = None, - write_statistics: bool | list[str] | None = None, - memory_pool: MemoryPool | None = None, - use_deprecated_int96_timestamps: bool = False, - coerce_timestamps: Literal["ms", "us"] | None = None, - data_page_size: int | None = None, - allow_truncated_timestamps: bool = False, - compression_level: int | dict[str, int] | None = None, - use_byte_stream_split: bool | list[str] = False, - column_encoding: _Encoding | dict[str, _Encoding] | None = None, - writer_engine_version: str | None = None, - data_page_version: str | None = None, - use_compliant_nested_type: bool = True, - encryption_properties: FileDecryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: tuple[SortingColumn, ...] | None = None, - store_decimal_as_integer: bool = False, - ): ... - def close(self) -> None: ... - def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... - def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... - @property - def metadata(self) -> FileMetaData: ... - @property - def use_dictionary(self) -> bool | list[str] | None: ... - @property - def use_deprecated_int96_timestamps(self) -> bool: ... - @property - def use_byte_stream_split(self) -> bool | list[str]: ... - @property - def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... - @property - def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... - @property - def allow_truncated_timestamps(self) -> bool: ... - @property - def compression(self) -> _Compression | dict[str, _Compression] | None: ... - @property - def compression_level(self) -> int | dict[str, int] | None: ... - @property - def data_page_version(self) -> str | None: ... - @property - def use_compliant_nested_type(self) -> bool: ... - @property - def version(self) -> str | None: ... - @property - def write_statistics(self) -> bool | list[str] | None: ... - @property - def writer_engine_version(self) -> str: ... - @property - def row_group_size(self) -> int: ... - @property - def data_page_size(self) -> int: ... - @property - def encryption_properties(self) -> FileDecryptionProperties: ... - @property - def write_batch_size(self) -> int: ... - @property - def dictionary_pagesize_limit(self) -> int: ... - @property - def store_schema(self) -> bool: ... - @property - def store_decimal_as_integer(self) -> bool: ... - -class FileEncryptionProperties: ... -class FileDecryptionProperties: ... 
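A short sketch against the ParquetWriter surface typed above: append two batches of rows to one file with an explicit codec (file and column names are illustrative):

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    with pq.ParquetWriter("out.parquet", schema, compression="zstd") as writer:
        writer.write_table(pa.table({"id": [1], "name": ["a"]}, schema=schema))
        writer.write_table(pa.table({"id": [2], "name": ["b"]}, schema=schema))
    assert pq.read_table("out.parquet").num_rows == 2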
diff --git a/pyarrow-stubs/_parquet_encryption.pyi b/pyarrow-stubs/_parquet_encryption.pyi deleted file mode 100644 index c707edb844a..00000000000 --- a/pyarrow-stubs/_parquet_encryption.pyi +++ /dev/null @@ -1,67 +0,0 @@ -import datetime as dt - -from typing import Callable - -from ._parquet import FileDecryptionProperties, FileEncryptionProperties -from .lib import _Weakrefable - -class EncryptionConfiguration(_Weakrefable): - footer_key: str - column_keys: dict[str, list[str]] - encryption_algorithm: str - plaintext_footer: bool - double_wrapping: bool - cache_lifetime: dt.timedelta - internal_key_material: bool - data_key_length_bits: int - - def __init__( - self, - footer_key: str, - *, - column_keys: dict[str, str | list[str]] | None = None, - encryption_algorithm: str | None = None, - plaintext_footer: bool | None = None, - double_wrapping: bool | None = None, - cache_lifetime: dt.timedelta | None = None, - internal_key_material: bool | None = None, - data_key_length_bits: int | None = None, - ) -> None: ... - -class DecryptionConfiguration(_Weakrefable): - cache_lifetime: dt.timedelta - def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... - -class KmsConnectionConfig(_Weakrefable): - kms_instance_id: str - kms_instance_url: str - key_access_token: str - custom_kms_conf: dict[str, str] - def __init__( - self, - *, - kms_instance_id: str | None = None, - kms_instance_url: str | None = None, - key_access_token: str | None = None, - custom_kms_conf: dict[str, str] | None = None, - ) -> None: ... - def refresh_key_access_token(self, value: str) -> None: ... - -class KmsClient(_Weakrefable): - def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... - def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... - -class CryptoFactory(_Weakrefable): - def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... - def file_encryption_properties( - self, - kms_connection_config: KmsConnectionConfig, - encryption_config: EncryptionConfiguration, - ) -> FileEncryptionProperties: ... - def file_decryption_properties( - self, - kms_connection_config: KmsConnectionConfig, - decryption_config: DecryptionConfiguration | None = None, - ) -> FileDecryptionProperties: ... - def remove_cache_entries_for_token(self, access_token: str) -> None: ... - def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/pyarrow-stubs/_s3fs.pyi b/pyarrow-stubs/_s3fs.pyi deleted file mode 100644 index fc13c498bd9..00000000000 --- a/pyarrow-stubs/_s3fs.pyi +++ /dev/null @@ -1,74 +0,0 @@ -import enum - -from typing import Literal, NotRequired, Required, TypedDict - -from ._fs import FileSystem -from .lib import KeyValueMetadata - -class _ProxyOptions(TypedDict): - schema: Required[Literal["http", "https"]] - host: Required[str] - port: Required[int] - username: NotRequired[str] - password: NotRequired[str] - -class S3LogLevel(enum.IntEnum): - Off = enum.auto() - Fatal = enum.auto() - Error = enum.auto() - Warn = enum.auto() - Info = enum.auto() - Debug = enum.auto() - Trace = enum.auto() - -Off = S3LogLevel.Off -Fatal = S3LogLevel.Fatal -Error = S3LogLevel.Error -Warn = S3LogLevel.Warn -Info = S3LogLevel.Info -Debug = S3LogLevel.Debug -Trace = S3LogLevel.Trace - -def initialize_s3( - log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 -) -> None: ... -def ensure_s3_initialized() -> None: ... -def finalize_s3() -> None: ... -def ensure_s3_finalized() -> None: ... 
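The encryption stubs above mirror pyarrow.parquet.encryption; a hypothetical in-memory KMS sketch (the key names and the base64 "wrapping" are illustrative placeholders, not a real KMS integration):

    import base64
    import pyarrow.parquet.encryption as pe

    class InMemoryKmsClient(pe.KmsClient):
        """Toy KMS client: 'wraps' keys with base64 and ignores the master key id."""

        def __init__(self, kms_connection_config):
            super().__init__()

        def wrap_key(self, key_bytes, master_key_identifier):
            return base64.b64encode(key_bytes).decode()

        def unwrap_key(self, wrapped_key, master_key_identifier):
            return base64.b64decode(wrapped_key)

    factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
    kms_config = pe.KmsConnectionConfig()
    enc_config = pe.EncryptionConfiguration(
        footer_key="footer_key", column_keys={"col_key": ["secret_column"]}
    )
    # Returns FileEncryptionProperties, suitable for the parquet writer options.
    props = factory.file_encryption_properties(kms_config, enc_config)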
-def resolve_s3_region(bucket: str) -> str: ... - -class S3RetryStrategy: - max_attempts: int - def __init__(self, max_attempts=3) -> None: ... - -class AwsStandardS3RetryStrategy(S3RetryStrategy): ... -class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... - -class S3FileSystem(FileSystem): - def __init__( - self, - *, - access_key: str | None = None, - secret_key: str | None = None, - session_token: str | None = None, - anonymous: bool = False, - region: str | None = None, - request_timeout: float | None = None, - connect_timeout: float | None = None, - scheme: Literal["http", "https"] = "https", - endpoint_override: str | None = None, - background_writes: bool = True, - default_metadata: dict | KeyValueMetadata | None = None, - role_arn: str | None = None, - session_name: str | None = None, - external_id: str | None = None, - load_frequency: int = 900, - proxy_options: _ProxyOptions | str | None = None, - allow_bucket_creation: bool = False, - allow_bucket_deletion: bool = False, - check_directory_existence_before_creation: bool = False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), - force_virtual_addressing: bool = False, - ): ... - @property - def region(self) -> str: ... diff --git a/pyarrow-stubs/_stubs_typing.pyi b/pyarrow-stubs/_stubs_typing.pyi deleted file mode 100644 index c259513f1ea..00000000000 --- a/pyarrow-stubs/_stubs_typing.pyi +++ /dev/null @@ -1,80 +0,0 @@ -import datetime as dt - -from collections.abc import Sequence -from decimal import Decimal -from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar - -import numpy as np - -from numpy.typing import NDArray - -from .compute import BooleanArray, IntegerArray - -ArrayLike: TypeAlias = Any -ScalarLike: TypeAlias = Any -Order: TypeAlias = Literal["ascending", "descending"] -JoinType: TypeAlias = Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", -] -Compression: TypeAlias = Literal[ - "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" -] -NullEncoding: TypeAlias = Literal["mask", "encode"] -NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] -Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray -Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray -PyScalar: TypeAlias = ( - bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta -) - -_T = TypeVar("_T") -SingleOrList: TypeAlias = list[_T] | _T - -class SupportEq(Protocol): - def __eq__(self, other) -> bool: ... - -class SupportLt(Protocol): - def __lt__(self, other) -> bool: ... - -class SupportGt(Protocol): - def __gt__(self, other) -> bool: ... - -class SupportLe(Protocol): - def __le__(self, other) -> bool: ... - -class SupportGe(Protocol): - def __ge__(self, other) -> bool: ... - -FilterTuple: TypeAlias = ( - tuple[str, Literal["=", "==", "!="], SupportEq] - | tuple[str, Literal["<"], SupportLt] - | tuple[str, Literal[">"], SupportGt] - | tuple[str, Literal["<="], SupportLe] - | tuple[str, Literal[">="], SupportGe] - | tuple[str, Literal["in", "not in"], Collection] -) - -class Buffer(Protocol): - def __buffer__(self, flags: int, /) -> memoryview: ... - -class SupportPyBuffer(Protocol): - def __buffer__(self, flags: int, /) -> memoryview: ... - -class SupportArrowStream(Protocol): - def __arrow_c_stream__(self, requested_schema=None) -> Any: ... 
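A minimal, assumed example of constructing the S3 filesystem typed above; the region and anonymous access are chosen purely for illustration, and no request is issued here:

    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(region="us-east-1", anonymous=True, request_timeout=10.0)
    # Listing, e.g. fs.get_file_info(pyarrow.fs.FileSelector("bucket-name")),
    # would issue real requests, so it is left out of this sketch.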
- -class SupportArrowArray(Protocol): - def __arrow_c_array__(self, requested_schema=None) -> Any: ... - -class SupportArrowDeviceArray(Protocol): - def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... - -class SupportArrowSchema(Protocol): - def __arrow_c_schema(self) -> Any: ... diff --git a/pyarrow-stubs/_substrait.pyi b/pyarrow-stubs/_substrait.pyi deleted file mode 100644 index ff226e9521b..00000000000 --- a/pyarrow-stubs/_substrait.pyi +++ /dev/null @@ -1,39 +0,0 @@ -from typing import Any, Callable - -from ._compute import Expression -from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable - -def run_query( - plan: Buffer | int, - *, - table_provider: Callable[[list[str], Schema], Table] | None = None, - use_threads: bool = True, -) -> RecordBatchReader: ... -def _parse_json_plan(plan: bytes) -> Buffer: ... - -class SubstraitSchema: - schema: Schema - expression: Expression - def __init__(self, schema: Schema, expression: Expression) -> None: ... - def to_pysubstrait(self) -> Any: ... - -def serialize_schema(schema: Schema) -> SubstraitSchema: ... -def deserialize_schema(buf: Buffer | bytes) -> Schema: ... -def serialize_expressions( - exprs: list[Expression], - names: list[str], - schema: Schema, - *, - allow_arrow_extensions: bool = False, -) -> Buffer: ... - -class BoundExpressions(_Weakrefable): - @property - def schema(self) -> Schema: ... - @property - def expressions(self) -> dict[str, Expression]: ... - @classmethod - def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... - -def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... -def get_supported_functions() -> list[str]: ... diff --git a/pyarrow-stubs/acero.pyi b/pyarrow-stubs/acero.pyi deleted file mode 100644 index 8a520bdc24a..00000000000 --- a/pyarrow-stubs/acero.pyi +++ /dev/null @@ -1,85 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias -from typing import Literal - -from . import lib -from .compute import Expression, FunctionOptions - -_StrOrExpr: TypeAlias = str | Expression - -class Declaration(lib._Weakrefable): - def __init__( - self, - factory_name: str, - options: ExecNodeOptions, - inputs: list[Declaration] | None = None, - ) -> None: ... - @classmethod - def from_sequence(cls, decls: list[Declaration]) -> Self: ... - def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... - def to_table(self, use_threads: bool = True) -> lib.Table: ... - -class ExecNodeOptions(lib._Weakrefable): ... - -class TableSourceNodeOptions(ExecNodeOptions): - def __init__(self, table: lib.Table) -> None: ... - -class FilterNodeOptions(ExecNodeOptions): - def __init__(self, filter_expression: Expression) -> None: ... - -class ProjectNodeOptions(ExecNodeOptions): - def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... - -class AggregateNodeOptions(ExecNodeOptions): - def __init__( - self, - aggregates: list[tuple[list[str], str, FunctionOptions, str]], - keys: list[_StrOrExpr] | None = None, - ) -> None: ... - -class OrderByNodeOptions(ExecNodeOptions): - def __init__( - self, - sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), - *, - null_placement: Literal["at_start", "at_end"] = "at_end", - ) -> None: ... 
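The acero option classes above are consumed by Declaration; a rough pipeline sketch with arbitrary table contents:

    import pyarrow as pa
    import pyarrow.compute as pc
    from pyarrow import acero

    table = pa.table({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
    decl = acero.Declaration.from_sequence([
        acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
        acero.Declaration("filter", acero.FilterNodeOptions(pc.field("a") > 2)),
        acero.Declaration("project", acero.ProjectNodeOptions([pc.field("b")], ["b"])),
    ])
    print(decl.to_table())  # rows where a > 2, keeping only column "b"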
- -class HashJoinNodeOptions(ExecNodeOptions): - def __init__( - self, - join_type: Literal[ - "left semi", - "right semi", - "left anti", - "right anti", - "inner", - "left outer", - "right outer", - "full outer", - ], - left_keys: _StrOrExpr | list[_StrOrExpr], - right_keys: _StrOrExpr | list[_StrOrExpr], - left_output: list[_StrOrExpr] | None = None, - right_output: list[_StrOrExpr] | None = None, - output_suffix_for_left: str = "", - output_suffix_for_right: str = "", - ) -> None: ... - -class AsofJoinNodeOptions(ExecNodeOptions): - def __init__( - self, - left_on: _StrOrExpr, - left_by: _StrOrExpr | list[_StrOrExpr], - right_on: _StrOrExpr, - right_by: _StrOrExpr | list[_StrOrExpr], - tolerance: int, - ) -> None: ... diff --git a/pyarrow-stubs/benchmark.pyi b/pyarrow-stubs/benchmark.pyi deleted file mode 100644 index 048973301dc..00000000000 --- a/pyarrow-stubs/benchmark.pyi +++ /dev/null @@ -1,3 +0,0 @@ -from pyarrow.lib import benchmark_PandasObjectIsNull - -__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/pyarrow-stubs/cffi.pyi b/pyarrow-stubs/cffi.pyi deleted file mode 100644 index 2ae945c5974..00000000000 --- a/pyarrow-stubs/cffi.pyi +++ /dev/null @@ -1,4 +0,0 @@ -import cffi - -c_source: str -ffi: cffi.FFI diff --git a/pyarrow-stubs/compute.pyi b/pyarrow-stubs/compute.pyi deleted file mode 100644 index 8d8fc35b134..00000000000 --- a/pyarrow-stubs/compute.pyi +++ /dev/null @@ -1,7779 +0,0 @@ -# ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence -from collections.abc import Callable - -# Option classes -from pyarrow._compute import ArraySortOptions as ArraySortOptions -from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions -from pyarrow._compute import CastOptions as CastOptions -from pyarrow._compute import CountOptions as CountOptions -from pyarrow._compute import CumulativeOptions as CumulativeOptions -from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions -from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions -from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions -from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions - -# Expressions -from pyarrow._compute import Expression as Expression -from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions -from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions -from pyarrow._compute import FilterOptions as FilterOptions -from pyarrow._compute import Function as Function -from pyarrow._compute import FunctionOptions as FunctionOptions -from pyarrow._compute import FunctionRegistry as FunctionRegistry -from pyarrow._compute import HashAggregateFunction as HashAggregateFunction -from pyarrow._compute import HashAggregateKernel as HashAggregateKernel -from pyarrow._compute import IndexOptions as IndexOptions -from pyarrow._compute import JoinOptions as JoinOptions -from pyarrow._compute import Kernel as Kernel -from pyarrow._compute import ListFlattenOptions as ListFlattenOptions -from pyarrow._compute import ListSliceOptions as ListSliceOptions -from pyarrow._compute import MakeStructOptions as MakeStructOptions -from pyarrow._compute import MapLookupOptions as MapLookupOptions -from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions -from pyarrow._compute import ModeOptions as ModeOptions -from pyarrow._compute import NullOptions as NullOptions -from pyarrow._compute import PadOptions as PadOptions -from 
pyarrow._compute import PairwiseOptions as PairwiseOptions -from pyarrow._compute import PartitionNthOptions as PartitionNthOptions -from pyarrow._compute import PivotWiderOptions as PivotWiderOptions -from pyarrow._compute import QuantileOptions as QuantileOptions -from pyarrow._compute import RandomOptions as RandomOptions -from pyarrow._compute import RankOptions as RankOptions -from pyarrow._compute import RankQuantileOptions as RankQuantileOptions -from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions -from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions -from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions -from pyarrow._compute import RoundOptions as RoundOptions -from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions -from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions -from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions -from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction -from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel -from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions -from pyarrow._compute import ScalarFunction as ScalarFunction -from pyarrow._compute import ScalarKernel as ScalarKernel -from pyarrow._compute import SelectKOptions as SelectKOptions -from pyarrow._compute import SetLookupOptions as SetLookupOptions -from pyarrow._compute import SkewOptions as SkewOptions -from pyarrow._compute import SliceOptions as SliceOptions -from pyarrow._compute import SortOptions as SortOptions -from pyarrow._compute import SplitOptions as SplitOptions -from pyarrow._compute import SplitPatternOptions as SplitPatternOptions -from pyarrow._compute import StrftimeOptions as StrftimeOptions -from pyarrow._compute import StrptimeOptions as StrptimeOptions -from pyarrow._compute import StructFieldOptions as StructFieldOptions -from pyarrow._compute import TakeOptions as TakeOptions -from pyarrow._compute import TDigestOptions as TDigestOptions -from pyarrow._compute import TrimOptions as TrimOptions -from pyarrow._compute import UdfContext as UdfContext -from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions -from pyarrow._compute import VarianceOptions as VarianceOptions -from pyarrow._compute import VectorFunction as VectorFunction -from pyarrow._compute import VectorKernel as VectorKernel -from pyarrow._compute import WeekOptions as WeekOptions -from pyarrow._compute import WinsorizeOptions as WinsorizeOptions - -# Functions -from pyarrow._compute import call_function as call_function - -# Udf -from pyarrow._compute import call_tabular_function as call_tabular_function -from pyarrow._compute import function_registry as function_registry -from pyarrow._compute import get_function as get_function -from pyarrow._compute import list_functions as list_functions -from pyarrow._compute import register_aggregate_function as register_aggregate_function -from pyarrow._compute import register_scalar_function as register_scalar_function -from pyarrow._compute import register_tabular_function as register_tabular_function -from pyarrow._compute import register_vector_function as register_vector_function - -from pyarrow._compute import _Order, _Placement -from pyarrow._stubs_typing import ArrayLike, ScalarLike -from . import lib - -_P = ParamSpec("_P") -_R = TypeVar("_R") - -def field(*name_or_index: str | tuple[str, ...] 
| int) -> Expression: - """Reference a column of the dataset. - - Stores only the field's name. Type and other information is known only when - the expression is bound to a dataset having an explicit scheme. - - Nested references are allowed by passing multiple names or a tuple of - names. For example ``('foo', 'bar')`` references the field named "bar" - inside the field named "foo". - - Parameters - ---------- - *name_or_index : string, multiple strings, tuple or int - The name or index of the (possibly nested) field the expression - references to. - - Returns - ------- - field_expr : Expression - Reference to the given field - - Examples - -------- - >>> import pyarrow.compute as pc - >>> pc.field("a") - - >>> pc.field(1) - - >>> pc.field(("a", "b")) - >> pc.field("a", "b") - Expression: - """Expression representing a scalar value. - - Creates an Expression object representing a scalar value that can be used - in compute expressions and predicates. - - Parameters - ---------- - value : bool, int, float or string - Python value of the scalar. This function accepts any value that can be - converted to a ``pyarrow.Scalar`` using ``pa.scalar()``. - - Notes - ----- - This function differs from ``pyarrow.scalar()`` in the following way: - - * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents - a single value in Arrow's memory model. - * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing - a scalar value that can be used in compute expressions, predicates, and - dataset filtering operations. - - Returns - ------- - scalar_expr : Expression - An Expression representing the scalar value - """ - -def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... - -# ============= compute functions ============= -_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) -_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) -_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) -_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) -_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) -ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] -ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT - -SignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.Int8Type] - | lib.Scalar[lib.Int16Type] - | lib.Scalar[lib.Int32Type] - | lib.Scalar[lib.Int64Type] -) -UnsignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.UInt8Type] - | lib.Scalar[lib.UInt16Type] - | lib.Scalar[lib.Uint32Type] - | lib.Scalar[lib.UInt64Type] -) -IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar -FloatScalar: TypeAlias = ( - lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] -) -DecimalScalar: TypeAlias = ( - lib.Scalar[lib.Decimal32Type] - | lib.Scalar[lib.Decimal64Type] - | lib.Scalar[lib.Decimal128Type] - | lib.Scalar[lib.Decimal256Type] -) -NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar -NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar -BinaryScalar: TypeAlias = ( - lib.Scalar[lib.BinaryType] - | lib.Scalar[lib.LargeBinaryType] - | lib.Scalar[lib.FixedSizeBinaryType] -) -StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] -StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar -_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] -_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] 
| lib.LargeListViewScalar[_DataTypeT] -ListScalar: TypeAlias = ( - lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] -) -TemporalScalar: TypeAlias = ( - lib.Date32Scalar - | lib.Date64Scalar - | lib.Time32Scalar[Any] - | lib.Time64Scalar[Any] - | lib.TimestampScalar[Any] - | lib.DurationScalar[Any] - | lib.MonthDayNanoIntervalScalar -) -NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar -NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar - -_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) -NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] -_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) -_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) -NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] -_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) -NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] -_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) -BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] -_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) -IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] -_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) -FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] -_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) -_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) -StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] -_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) -_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) -BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] -_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) -_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) -StringOrBinaryArray: TypeAlias = StringArray | BinaryArray -_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) -_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) -TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] -_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) -_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] -_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] -ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] -# =============================== 1. Aggregation =============================== - -# ========================= 1.1 functions ========================= - -def all( - array: lib.BooleanScalar | BooleanArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: - """ - Test whether all elements in a boolean array evaluate to true. - - Null values are ignored by default. - If the `skip_nulls` option is set to false, then Kleene logic is used. - See "kleene_and" for more details on Kleene logic. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. 
- min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -any = _clone_signature(all) -""" -Test whether any element in a boolean array evaluates to true. - -Null values are ignored by default. -If the `skip_nulls` option is set to false, then Kleene logic is used. -See "kleene_or" for more details on Kleene logic. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def approximate_median( - array: NumericScalar | NumericArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Approximate median of a numeric array with T-Digest algorithm. - - Nulls and NaNs are ignored. - A null scalar is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def count( - array: lib.Array | lib.ChunkedArray, - /, - mode: Literal["only_valid", "only_null", "all"] = "only_valid", - *, - options: CountOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of null / non-null values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def count_distinct( - array: lib.Array | lib.ChunkedArray, - /, - mode: Literal["only_valid", "only_null", "all"] = "only_valid", - *, - options: CountOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Count the number of unique values. - - By default, only non-null values are counted. - This can be changed through CountOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. 
- mode : str, default "only_valid" - Which values to count in the input. - Accepted values are "only_valid", "only_null", "all". - options : pyarrow.compute.CountOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def first( - array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the first value in each group. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def first_last( - array: lib.Array[Any] | lib.ChunkedArray[Any], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: - """ - Compute the first and last values of an array. - - Null values are ignored by default. - If skip_nulls = false, then this will return the first and last values - regardless if it is null - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def index( - data: lib.Array[Any] | lib.ChunkedArray[Any], - value, - start: int | None = None, - end: int | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: - """ - Find the index of the first occurrence of a given value. - - Parameters - ---------- - data : Array-like - value : Scalar-like object - The value to search for. - start : int, optional - end : int, optional - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - index : int - the index, or -1 if not found - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"]) - >>> pc.index(arr, "ipsum") - - >>> pc.index(arr, "ipsum", start=2) - - >>> pc.index(arr, "amet") - - """ - -last = _clone_signature(first) -""" -Compute the first and last values of an array. - -Null values are ignored by default. 
-If skip_nulls = false, then this will return the first and last values -regardless if it is null - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True -In [15]: print(pc.last.__doc__) -Compute the first value in each group. - -Null values are ignored by default. -If skip_nulls = false, then this will return the first and last values -regardless if it is null - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -max = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -min = _clone_signature(first) -""" -Compute the minimum or maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -min_max = _clone_signature(first_last) -""" -Compute the minimum and maximum values of a numeric array. - -Null values are ignored by default. -This can be changed through ScalarAggregateOptions. - -Parameters ----------- -array : Array-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. -options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" - -@overload -def mean( - array: FloatScalar | FloatArray, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: ... -@overload -def mean( - array: lib.NumericArray[lib.Decimal128Scalar] - | lib.ChunkedArray[lib.Decimal128Scalar] - | lib.Decimal128Scalar, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Decimal128Scalar: ... -@overload -def mean( - array: lib.NumericArray[lib.Decimal256Scalar] - | lib.ChunkedArray[lib.Decimal256Scalar] - | lib.Decimal256Scalar, - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Decimal256Scalar: ... -def mean(*args, **kwargs): - """ - Compute the mean of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - The result is a double for integer and floating point arguments, - and a decimal with the same bit-width/precision/scale for decimal arguments. - For integers and floats, NaN is returned if min_count = 0 and - there are no values. For decimals, null is returned instead. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def mode( - array: NumericScalar | NumericArray, - /, - n: int = 1, - *, - skip_nulls: bool = True, - min_count: int = 0, - options: ModeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: - """ - Compute the modal (most common) values of a numeric array. - - Compute the n most common values and their respective occurrence counts. - The output has type `struct`, where T is the - input type. - The results are ordered by descending `count` first, and ascending `mode` - when breaking ties. - Nulls are ignored. If there are no non-null values in the array, - an empty array is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - n : int, default 1 - Number of distinct most-common values to return. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ModeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) - >>> modes = pc.mode(arr, 2) - >>> modes[0] - - >>> modes[1] - - """ - -def product( - array: _ScalarT | lib.NumericArray[_ScalarT], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarT: - """ - Compute the product of values in a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def quantile( - array: NumericScalar | NumericArray, - /, - q: float = 0.5, - *, - interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", - skip_nulls: bool = True, - min_count: int = 0, - options: QuantileOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Compute an array of quantiles of a numeric array or chunked array. - - By default, 0.5 quantile (median) is returned. - If quantile lies between two data points, an interpolated value is - returned based on selected interpolation method. - Nulls and NaNs are ignored. - An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to compute. All values must be in - [0, 1]. - interpolation : str, default "linear" - How to break ties between competing data points for a given quantile. - Accepted values are: - - - "linear": compute an interpolation - - "lower": always use the smallest of the two data points - - "higher": always use the largest of the two data points - - "nearest": select the data point that is closest to the quantile - - "midpoint": compute the (unweighted) mean of the two data points - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.QuantileOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def stddev( - array: NumericScalar | NumericArray, - /, - *, - ddof: float = 0, - skip_nulls: bool = True, - min_count: int = 0, - options: VarianceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the standard deviation of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. 
- By default (`ddof` = 0), the population standard deviation is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def sum( - array: _NumericScalarT | NumericArray[_NumericScalarT], - /, - *, - skip_nulls: bool = True, - min_count: int = 1, - options: ScalarAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: - """ - Compute the sum of a numeric array. - - Null values are ignored by default. Minimum count of non-null - values can be set and null is returned if too few are present. - This can be changed through ScalarAggregateOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 1 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.ScalarAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def tdigest( - array: NumericScalar | NumericArray, - /, - q: float = 0.5, - *, - delta: int = 100, - buffer_size: int = 500, - skip_nulls: bool = True, - min_count: int = 0, - options: TDigestOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Approximate quantiles of a numeric array with T-Digest algorithm. - - By default, 0.5 quantile (median) is returned. - Nulls and NaNs are ignored. - An array of nulls is returned if there is no valid data point. - - Parameters - ---------- - array : Array-like - Argument to compute function. - q : double or sequence of double, default 0.5 - Probability levels of the quantiles to approximate. All values must be - in [0, 1]. - delta : int, default 100 - Compression parameter for the T-digest algorithm. - buffer_size : int, default 500 - Buffer size for the T-digest algorithm. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.TDigestOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- - """ - -def variance( - array: NumericScalar | NumericArray, - /, - *, - ddof: int = 0, - skip_nulls: bool = True, - min_count: int = 0, - options: VarianceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleScalar: - """ - Calculate the variance of a numeric array. - - The number of degrees of freedom can be controlled using VarianceOptions. - By default (`ddof` = 0), the population variance is calculated. - Nulls are ignored. If there are not enough non-null values in the array - to satisfy `ddof`, null is returned. - - Parameters - ---------- - array : Array-like - Argument to compute function. - ddof : int, default 0 - Number of degrees of freedom. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - min_count : int, default 0 - Minimum number of non-null values in the input. If the number - of non-null values is below `min_count`, the output is null. - options : pyarrow.compute.VarianceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def top_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - k: int, - sort_keys: list | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the top-k ordered elements from array- or table-like - data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get top indices from. - k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array - Indices of the top-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.top_k_unstable(arr, k=3) - - [ - 5, - 4, - 2 - ] - """ - -def bottom_k_unstable( - values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - k: int, - sort_keys: list | None = None, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array: - """ - Select the indices of the bottom-k ordered elements from - array- or table-like data. - - This is a specialization for :func:`select_k_unstable`. Output is not - guaranteed to be stable. - - Parameters - ---------- - values : Array, ChunkedArray, RecordBatch, or Table - Data to sort and get bottom indices from. - k : int - The number of `k` elements to keep. - sort_keys : List-like - Column key names to order by when input is table-like data. - memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - Returns - ------- - result : Array of indices - Indices of the bottom-k ordered elements - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) - >>> pc.bottom_k_unstable(arr, k=3) - - [ - 0, - 1, - 2 - ] - """ - -# ========================= 2. 
Element-wise (“scalar”) functions ========================= - -# ========================= 2.1 Arithmetic ========================= -@overload -def abs( - x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT: ... -@overload -def abs( - x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationArrayT: ... -@overload -def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def abs(*args, **kwargs): - """ - Calculate the absolute value of the argument element-wise. - - Results will wrap around on integer overflow. - Use function "abs_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -abs_checked = _clone_signature(abs) -""" -Calculate the absolute value of the argument element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "abs". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def add( - x: _NumericOrTemporalScalarT, - y: _NumericOrTemporalScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT: ... -@overload -def add( - x: _NumericOrTemporalArrayT, - y: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def add( - x: NumericOrTemporalScalar, - y: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: _NumericOrTemporalArrayT, - y: NumericOrTemporalScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def add( - x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def add( - x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def add(*args, **kwargs): - """ - Add the arguments element-wise. - - Results will wrap around on integer overflow. - Use function "add_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -add_checked = _clone_signature(add) -""" -Add the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "add". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
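To make the wrap-around versus checked behaviour concrete, here is a small sketch (expected results in comments; the example values are illustrative assumptions):

import pyarrow as pa
import pyarrow.compute as pc

i8 = pa.array([126, 127], type=pa.int8())
pc.abs(pa.array([-3, 5]))                      # [3, 5]
pc.add(i8, pa.scalar(1, type=pa.int8()))       # wraps on overflow: [127, -128]
try:
    pc.add_checked(i8, pa.scalar(1, type=pa.int8()))
except pa.ArrowInvalid as exc:
    print(exc)                                 # the checked variant raises instead of wrapping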
- -""" - -@overload -def divide( - dividend: _NumericOrTemporalScalarT, - divisor: _NumericOrTemporalScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalScalarT: ... -@overload -def divide( - dividend: _NumericOrTemporalArrayT, - divisor: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: Expression, - divisor: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def divide( - dividend: NumericOrTemporalScalar, - divisor: _NumericOrTemporalArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: _NumericOrTemporalArrayT, - divisor: NumericOrTemporalScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def divide( - dividend: NumericOrTemporalScalar, - divisor: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def divide( - dividend: Expression, - divisor: NumericOrTemporalScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def divide(*args, **kwargs): - """ - Divide the arguments element-wise. - - Integer division by zero returns an error. However, integer overflow - wraps around, and floating-point division by zero returns an infinite. - Use function "divide_checked" if you want to get an error - in all the aforementioned cases. - - Parameters - ---------- - dividend : Array-like or scalar-like - Argument to compute function. - divisor : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -divide_checked = _clone_signature(divide) -""" -Divide the arguments element-wise. - -An error is returned when trying to divide by zero, or when -integer overflow is encountered. - -Parameters ----------- -dividend : Array-like or scalar-like - Argument to compute function. -divisor : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def exp( - exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _FloatArrayT: ... -@overload -def exp( - exponent: ArrayOrChunkedArray[NonFloatNumericScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: ... -@overload -def exp( - exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _FloatScalarT: ... -@overload -def exp( - exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.DoubleScalar: ... -@overload -def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def exp(*args, **kwargs): - """ - Compute Euler's number raised to the power of specified exponent, element-wise. - - If exponent is null the result will be null. - - Parameters - ---------- - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -multiply = _clone_signature(add) -""" -Multiply the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "multiply_checked" if you want overflow -to return an error. 
- -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -multiply_checked = _clone_signature(add) -""" -Multiply the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "multiply". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def negate( - x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationT: ... -@overload -def negate( - x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericOrDurationArrayT: ... -@overload -def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def negate(*args, **kwargs): - """ - Negate the argument element-wise. - - Results will wrap around on integer overflow. - Use function "negate_checked" if you want overflow - to return an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -negate_checked = _clone_signature(negate) -""" -Negate the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "negate". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def power( - base: _NumericScalarT, - exponent: _NumericScalarT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def power( - base: _NumericArrayT, - exponent: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: Expression, - exponent: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def power( - base: _NumericArrayT, - exponent: NumericScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: NumericScalar, - exponent: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def power( - base: NumericScalar, - exponent: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def power( - base: Expression, - exponent: NumericScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def power(*args, **kwargs): - """ - Raise arguments to power element-wise. - - Integer to negative integer power returns an error. However, integer overflow - wraps around. If either base or exponent is null the result will be null. - - Parameters - ---------- - base : Array-like or scalar-like - Argument to compute function. - exponent : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
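For completeness, a minimal sketch of the remaining arithmetic kernels in this group (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

pc.multiply(pa.array([2, 3]), 4)   # [8, 12]
pc.negate(pa.array([1, -2]))       # [-1, 2]
pc.power(pa.array([2, 3]), 3)      # [8, 27]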
- """ - -power_checked = _clone_signature(power) -""" -Raise arguments to power element-wise. - -An error is returned when integer to negative integer power is encountered, -or integer overflow is encountered. - -Parameters ----------- -base : Array-like or scalar-like - Argument to compute function. -exponent : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def sign( - x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> ( - lib.NumericArray[lib.Int8Scalar] - | lib.NumericArray[lib.FloatScalar] - | lib.NumericArray[lib.DoubleScalar] -): ... -@overload -def sign( - x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... -@overload -def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def sign(*args, **kwargs): - """ - Get the signedness of the arguments element-wise. - - Output is any of (-1,1) for nonzero inputs and 0 for zero input. - NaN values return NaN. Integral values return signedness as Int8 and - floating-point values return it with the same type as the input values. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -@overload -def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... -@overload -def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... -@overload -def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def sqrt(*args, **kwargs): - """ - Takes the square root of arguments element-wise. - - A negative argument returns a NaN. For a variant that returns an - error, use function "sqrt_checked". - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - - """ - -sqrt_checked = _clone_signature(sqrt) -""" -Takes the square root of arguments element-wise. - -A negative argument returns an error. For a variant that returns a -NaN, use function "sqrt". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -subtract = _clone_signature(add) -""" -Subtract the arguments element-wise. - -Results will wrap around on integer overflow. -Use function "subtract_checked" if you want overflow -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -subtract_checked = _clone_signature(add) -""" -Subtract the arguments element-wise. - -This function returns an error on overflow. For a variant that -doesn't fail on overflow, use function "subtract". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.1 Bit-wise functions ========================= -@overload -def bit_wise_and( - x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT: ... -@overload -def bit_wise_and( - x: _NumericArrayT, - y: _NumericArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_and( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def bit_wise_and( - x: Expression, - y: NumericScalar | ArrayOrChunkedArray[NumericScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def bit_wise_and( - x: NumericScalar | ArrayOrChunkedArray[NumericScalar], - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def bit_wise_and(*args, **kwargs): - """ - Bit-wise AND the arguments element-wise. - - Null values return null. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def bit_wise_not( - x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericScalarT: ... -@overload -def bit_wise_not( - x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _NumericArrayT: ... -@overload -def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def bit_wise_not(*args, **kwargs): - """ - Bit-wise negate the arguments element-wise. - - Null values return null. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -bit_wise_or = _clone_signature(bit_wise_and) -""" -Bit-wise OR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -bit_wise_xor = _clone_signature(bit_wise_and) -""" -Bit-wise XOR the arguments element-wise. - -Null values return null. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_left = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. -In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -`x` is returned if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. 
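A small sketch of the bit-wise kernels (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

pc.bit_wise_and(pa.array([12, 5]), 10)           # [8, 0]
pc.bit_wise_not(pa.array([0], type=pa.int8()))   # [-1], two's complement
pc.shift_left(pa.array([1, 2]), 3)               # [8, 16]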
-Use function "shift_left_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_left_checked = _clone_signature(bit_wise_and) -""" -Left shift `x` by `y`. - -The shift operates as if on the two's complement representation of the number. -In other words, this is equivalent to multiplying `x` by 2 to the power `y`, -even if overflow occurs. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_left" for a variant that doesn't fail for an invalid shift amount. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_right = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -`x` is returned if `y` (the amount to shift by) is: (1) negative or -(2) greater than or equal to the precision of `x`. -Use function "shift_right_checked" if you want an invalid shift amount -to return an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -shift_right_checked = _clone_signature(bit_wise_and) -""" -Right shift `x` by `y`. - -This is equivalent to dividing `x` by 2 to the power `y`. -An error is raised if `y` (the amount to shift by) is (1) negative or -(2) greater than or equal to the precision of `x`. -See "shift_right" for a variant that doesn't fail for an invalid shift amount - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.2 Rounding functions ========================= -@overload -def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... -@overload -def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... -@overload -def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def ceil(*args, **kwargs): - """ - Round up to the nearest integer. - - Compute the smallest integer value not less in magnitude than `x`. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -floor = _clone_signature(ceil) -""" -Round down to the nearest integer. - -Compute the largest integer value not greater in magnitude than `x`. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
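For example (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1.2, -1.2, 2.5])
pc.ceil(x)    # [2.0, -1.0, 3.0]
pc.floor(x)   # [1.0, -2.0, 2.0]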
-""" - -@overload -def round( - x: _NumericScalarT, - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round( - x: _NumericArrayT, - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def round( - x: Expression, - /, - ndigits: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round(*args, **kwargs): - """ - Round to a given precision. - - Options are used to control the number of digits and rounding mode. - Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - ndigits : int, default 0 - Number of fractional digits to round to. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def round_to_multiple( - x: _NumericScalarT, - /, - multiple: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round_to_multiple( - x: _NumericArrayT, - /, - multiple: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def round_to_multiple( - x: Expression, - /, - multiple: int = 0, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundToMultipleOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round_to_multiple(*args, **kwargs): - """ - Round to a given multiple. - - Options are used to control the rounding multiple and rounding mode. 
- Default behavior is to round to the nearest integer and - use half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - multiple : numeric scalar, default 1.0 - Multiple to round to. Should be a scalar of a type compatible - with the argument to be rounded. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundToMultipleOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def round_binary( - x: _NumericScalarT, - s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericScalarT: ... -@overload -def round_binary( - x: _NumericScalarT, - s: Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[_NumericScalarT]: ... -@overload -def round_binary( - x: _NumericArrayT, - s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def round_binary( - x: Expression, - s: Iterable, - /, - round_mode: Literal[ - "down", - "up", - "towards_zero", - "towards_infinity", - "half_down", - "half_up", - "half_towards_zero", - "half_towards_infinity", - "half_to_even", - "half_to_odd", - ] = "half_to_even", - *, - options: RoundBinaryOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def round_binary(*args, **kwargs): - """ - Round to the given precision. - - Options are used to control the rounding mode. - Default behavior is to use the half-to-even rule to break ties. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - s : Array-like or scalar-like - Argument to compute function. - round_mode : str, default "half_to_even" - Rounding and tie-breaking mode. - Accepted values are "down", "up", "towards_zero", "towards_infinity", - "half_down", "half_up", "half_towards_zero", "half_towards_infinity", - "half_to_even", "half_to_odd". - options : pyarrow.compute.RoundBinaryOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -trunc = _clone_signature(ceil) -""" -Compute the integral part. - -Compute the nearest integer not greater in magnitude than `x`. 
- -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.3 Logarithmic functions ========================= -@overload -def ln( - x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def ln( - x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def ln(*args, **kwargs): - """ - Compute natural logarithm. - - Non-positive values return -inf or NaN. Null values return null. - Use function "ln_checked" if you want non-positive values to raise an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ln_checked = _clone_signature(ln) -""" -Compute natural logarithm. - -Non-positive values raise an error. Null values return null. -Use function "ln" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log10 = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log10_checked" if you want non-positive values -to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log10_checked = _clone_signature(ln) -""" -Compute base 10 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log10" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log1p = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p_checked" if you want invalid values to raise an error. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log1p_checked = _clone_signature(ln) -""" -Compute natural log of (1+x). - -Values <= -1 return -inf or NaN. Null values return null. -This function may be more precise than log(1 + x) for x close to zero. -Use function "log1p" if you want invalid values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log2 = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values return -inf or NaN. Null values return null. -Use function "log2_checked" if you want non-positive values -to raise an error. 
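A minimal sketch of the logarithmic kernels (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

pc.log2(pa.array([1.0, 8.0]))   # [0.0, 3.0]
pc.ln(pa.array([0.0]))          # [-inf]; ln_checked would raise instead
pc.log1p(pa.array([1e-10]))     # ~[1e-10], more precise than ln(1 + x) for x near zero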
- -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -log2_checked = _clone_signature(ln) -""" -Compute base 2 logarithm. - -Non-positive values raise an error. Null values return null. -Use function "log2" if you want non-positive values -to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def logb( - x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def logb( - x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: FloatScalar, - b: FloatArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: FloatArray, - b: FloatScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def logb( - x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression | Any: ... -def logb(*args, **kwargs): - """ - Compute base `b` logarithm. - - Values <= 0 return -inf or NaN. Null values return null. - Use function "logb_checked" if you want non-positive values to raise an error. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - b : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -logb_checked = _clone_signature(logb) -""" -Compute base `b` logarithm. - -Values <= 0 return -inf or NaN. Null values return null. -Use function "logb" if you want non-positive values to return -inf or NaN. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -b : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.4 Trigonometric functions ========================= -acos = _clone_signature(ln) -""" -Compute the inverse cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "acos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -acos_checked = _clone_signature(ln) -""" -Compute the inverse cosine. - -Invalid input values raise an error; -to return NaN instead, see "acos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -asin = _clone_signature(ln) -""" -Compute the inverse sine. - -NaN is returned for invalid input values; -to raise an error instead, see "asin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -asin_checked = _clone_signature(ln) -""" -Compute the inverse sine. - -Invalid input values raise an error; -to return NaN instead, see "asin". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -atan = _clone_signature(ln) -""" -Compute the inverse tangent of x. - -The return value is in the range [-pi/2, pi/2]; -for a full return range [-pi, pi], see "atan2". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cos = _clone_signature(ln) -""" -Compute the cosine. - -NaN is returned for invalid input values; -to raise an error instead, see "cos_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cos_checked = _clone_signature(ln) -""" -Compute the cosine. - -Infinite values raise an error; -to return NaN instead, see "cos". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -sin = _clone_signature(ln) -""" -Compute the sine. - -NaN is returned for invalid input values; -to raise an error instead, see "sin_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -sin_checked = _clone_signature(ln) -""" -Compute the sine. - -Invalid input values raise an error; -to return NaN instead, see "sin". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -tan = _clone_signature(ln) -""" -Compute the tangent. - -NaN is returned for invalid input values; -to raise an error instead, see "tan_checked". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -tan_checked = _clone_signature(ln) -""" -Compute the tangent. - -Infinite values raise an error; -to return NaN instead, see "tan". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def atan2( - y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.FloatScalar | lib.DoubleScalar: ... -@overload -def atan2( - y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def atan2( - y: FloatArray, - x: FloatScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... 
-@overload -def atan2( - y: FloatScalar, - x: FloatArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... -@overload -def atan2( - y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def atan2( - y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def atan2(*args, **kwargs): - """ - Compute the inverse tangent of y/x. - - The return value is in the range [-pi, pi]. - - Parameters - ---------- - y : Array-like or scalar-like - Argument to compute function. - x : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.5 Comparisons functions ========================= -@overload -def equal( - x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def equal( - x: lib.Scalar, - y: lib.Array | lib.ChunkedArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: lib.Array | lib.ChunkedArray, - y: lib.Scalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: lib.Array | lib.ChunkedArray, - y: lib.Array | lib.ChunkedArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def equal( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def equal( - x: lib.Scalar, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def equal( - x: Expression, - y: lib.Scalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def equal(*args, **kwargs): - """ - Compare values for equality (x == y). - - A null on either side emits a null comparison result. - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -greater = _clone_signature(equal) -""" -Compare values for ordered inequality (x > y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -greater_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x >= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -less = _clone_signature(equal) -""" -Compare values for ordered inequality (x < y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -less_equal = _clone_signature(equal) -""" -Compare values for ordered inequality (x <= y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -not_equal = _clone_signature(equal) -""" -Compare values for inequality (x != y). - -A null on either side emits a null comparison result. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def max_element_wise( - *args: ScalarOrArray[_Scalar_CoT], - skip_nulls: bool = True, - options: ElementWiseAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _Scalar_CoT: ... -@overload -def max_element_wise( - *args: Expression, - skip_nulls: bool = True, - options: ElementWiseAggregateOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def max_element_wise(*args, **kwargs): - """ - Find the element-wise maximum value. - - Nulls are ignored (by default) or propagated. - NaN is preferred over null, but not over any valid value. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. - options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -min_element_wise = _clone_signature(max_element_wise) -""" -Find the element-wise minimum value. - -Nulls are ignored (by default) or propagated. -NaN is preferred over null, but not over any valid value. - -Parameters ----------- -*args : Array-like or scalar-like - Argument to compute function. -skip_nulls : bool, default True - Whether to skip (ignore) nulls in the input. - If False, any null in the input forces the output to null. -options : pyarrow.compute.ElementWiseAggregateOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.6 Logical functions ========================= -@overload -def and_( - x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def and_( - x: BooleanArray, - y: BooleanArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def and_( - x: Expression, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: lib.BooleanScalar, - y: BooleanArray, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def and_( - x: BooleanArray, - y: lib.BooleanScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... 
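A short sketch of the comparison and element-wise min/max kernels defined above (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1, 2, None])
pc.equal(x, 2)                                 # [false, true, null]: nulls propagate
pc.less(x, 2)                                  # [true, false, null]
pc.max_element_wise(x, pa.array([3, 0, 5]))    # [3, 2, 5]: nulls ignored by default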
-@overload -def and_( - x: lib.BooleanScalar, - y: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: Expression, - y: lib.BooleanScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def and_( - x: ScalarOrArray[lib.BooleanScalar], - y: ScalarOrArray[lib.BooleanScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> ScalarOrArray[lib.BooleanScalar]: ... -def and_(*args, **kwargs): - """ - Logical 'and' boolean values. - - When a null is encountered in either input, a null is output. - For a different null behavior, see function "and_kleene". - - Parameters - ---------- - x : Array-like or scalar-like - Argument to compute function. - y : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -and_kleene = _clone_signature(and_) -""" -Logical 'and' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and null = null -- null and true = null -- false and null = false -- null and false = false -- null and null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and' false is always false. -For a different null behavior, see function "and". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -and_not = _clone_signature(and_) -""" -Logical 'and not' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "and_not_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -and_not_kleene = _clone_signature(and_) -""" -Logical 'and not' boolean values (Kleene logic). - -This function behaves as follows with nulls: - -- true and not null = null -- null and not false = null -- false and not null = false -- null and not true = false -- null and not null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'and not' true is always false, as is false -'and not' an unknown value. -For a different null behavior, see function "and_not". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -or_ = _clone_signature(and_) -""" -Logical 'or' boolean values. - -When a null is encountered in either input, a null is output. -For a different null behavior, see function "or_kleene". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -or_kleene = _clone_signature(and_) -""" -Logical 'or' boolean values (Kleene logic). 
- -This function behaves as follows with nulls: - -- true or null = true -- null or true = true -- false or null = null -- null or false = null -- null or null = null - -In other words, in this context a null value really means "unknown", -and an unknown value 'or' true is always true. -For a different null behavior, see function "or". - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -xor = _clone_signature(and_) -""" -Logical 'xor' boolean values. - -When a null is encountered in either input, a null is output. - -Parameters ----------- -x : Array-like or scalar-like - Argument to compute function. -y : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def invert( - x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def invert( - x: _BooleanArrayT, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _BooleanArrayT: ... -@overload -def invert( - x: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def invert(*args, **kwargs): - """ - Invert boolean values. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.10 String predicates ========================= -@overload -def ascii_is_alnum( - strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def ascii_is_alnum( - strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... -@overload -def ascii_is_alnum( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def ascii_is_alnum(*args, **kwargs): - """ - Classify strings as ASCII alphanumeric. - - For each string in `strings`, emit true iff the string is non-empty - and consists only of alphanumeric ASCII characters. Null strings emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII decimal. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
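A sketch contrasting the plain and Kleene logical kernels, plus one of the string predicates (expected results in comments):

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([True, False, None])
b = pa.array([None, None, None], type=pa.bool_())
pc.and_(a, b)          # [null, null, null]: any null input yields null
pc.and_kleene(a, b)    # [null, false, null]: false "and" unknown is false
pc.invert(a)           # [false, true, null]
pc.ascii_is_alnum(pa.array(["abc123", "abc!", ""]))   # [true, false, false]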
-""" -ascii_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_alnum = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphanumeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphanumeric Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_alpha = _clone_signature(ascii_is_alnum) -""" -Classify strings as alphabetic. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of alphabetic Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_decimal = _clone_signature(ascii_is_alnum) -""" -Classify strings as decimal. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of decimal Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_digit = _clone_signature(ascii_is_alnum) -""" -Classify strings as digits. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of Unicode digits. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_lower = _clone_signature(ascii_is_alnum) -""" -Classify strings as lowercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of lowercase Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_numeric = _clone_signature(ascii_is_alnum) -""" -Classify strings as numeric. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of numeric Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_printable = _clone_signature(ascii_is_alnum) -""" -Classify strings as printable. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of printable Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_space = _clone_signature(ascii_is_alnum) -""" -Classify strings as whitespace. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of whitespace Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_upper = _clone_signature(ascii_is_alnum) -""" -Classify strings as uppercase. - -For each string in `strings`, emit true iff the string is non-empty -and consists only of uppercase Unicode characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_is_title = _clone_signature(ascii_is_alnum) -""" -Classify strings as titlecase. - -For each string in `strings`, emit true iff the string is title-cased, -i.e. it has at least one cased character, each uppercase character -follows an uncased character, and each lowercase character follows -an uppercase character. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -string_is_ascii = _clone_signature(ascii_is_alnum) -""" -Classify strings as ASCII. 
- -For each string in `strings`, emit true iff the string consists only -of ASCII characters. Null strings emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.11 String transforms ========================= -@overload -def ascii_capitalize( - strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT: ... -@overload -def ascii_capitalize( - strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringArrayT: ... -@overload -def ascii_capitalize( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def ascii_capitalize(*args, **kwargs): - """ - Capitalize the first character of ASCII input. - - For each string in `strings`, return a capitalized version. - - This function assumes the input is fully ASCII. If it may contain - non-ASCII characters, use "utf8_capitalize" instead. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_lower = _clone_signature(ascii_capitalize) -""" -Transform ASCII input to lowercase. - -For each string in `strings`, return a lowercase version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_lower" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_reverse = _clone_signature(ascii_capitalize) -""" -Reverse ASCII input. - -For each ASCII string in `strings`, return a reversed version. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_reverse" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_swapcase = _clone_signature(ascii_capitalize) -""" -Transform ASCII input by inverting casing. - -For each string in `strings`, return a string with opposite casing. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_swapcase" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_title = _clone_signature(ascii_capitalize) -""" -Titlecase each word of ASCII input. - -For each string in `strings`, return a titlecased version. -Each word in the output will start with an uppercase character and its -remaining characters will be lowercase. - -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_title" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_upper = _clone_signature(ascii_capitalize) -""" -Transform ASCII input to uppercase. - -For each string in `strings`, return an uppercase version. 
- -This function assumes the input is fully ASCII. If it may contain -non-ASCII characters, use "utf8_upper" instead. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def binary_length( - strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int32Scalar: ... -@overload -def binary_length( - strings: lib.LargeBinaryScalar | lib.LargeStringScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def binary_length( - strings: lib.BinaryArray - | lib.StringArray - | lib.ChunkedArray[lib.BinaryScalar] - | lib.ChunkedArray[lib.StringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def binary_length( - strings: lib.LargeBinaryArray - | lib.LargeStringArray - | lib.ChunkedArray[lib.LargeBinaryScalar] - | lib.ChunkedArray[lib.LargeStringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def binary_length( - strings: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_length(*args, **kwargs): - """ - Compute string lengths. - - For each string in `strings`, emit its length of bytes. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_repeat( - strings: _StringOrBinaryScalarT, - num_repeats: int, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT: ... -@overload -def binary_repeat( - strings: _StringOrBinaryScalarT, - num_repeats: list[int] | list[int | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[_StringOrBinaryScalarT]: ... -@overload -def binary_repeat( - strings: _StringOrBinaryArrayT, - num_repeats: int | list[int] | list[int | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ... -@overload -def binary_repeat( - strings: Expression, - num_repeats: int | list[int] | list[int | None], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_repeat(*args, **kwargs): - """ - Repeat a binary string. - - For each binary string in `strings`, return a replicated version. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - num_repeats : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_replace_slice( - strings: _StringOrBinaryScalarT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT: ... -@overload -def binary_replace_slice( - strings: _StringOrBinaryArrayT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ...
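# A minimal usage sketch, assuming a recent pyarrow at runtime, of the ASCII
# transform and binary kernels annotated in this section. It illustrates how
# the overloads above are meant to resolve: scalar inputs yield scalar results,
# arrays yield arrays, and large_* inputs map to 64-bit length results.
import pyarrow as pa
import pyarrow.compute as pc

s = pa.scalar("hello", type=pa.string())
arr = pa.array(["hello", None, "WORLD"], type=pa.large_string())

pc.ascii_capitalize(s)                                        # StringScalar: "Hello"
pc.binary_length(arr)                                         # Int64Array: [5, null, 5]
pc.binary_repeat(s, 3)                                        # StringScalar: "hellohellohello"
pc.binary_replace_slice(s, start=0, stop=1, replacement="J")  # StringScalar: "Jello"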
-@overload -def binary_replace_slice( - strings: Expression, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_replace_slice(*args, **kwargs): - """ - Replace a slice of a binary string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in bytes. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_reverse( - strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _BinaryScalarT: ... -@overload -def binary_reverse( - strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _BinaryArrayT: ... -@overload -def binary_reverse( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def binary_reverse(*args, **kwargs): - """ - Reverse binary input. - - For each binary string in `strings`, return a reversed version. - - This function reverses the binary data at a byte-level. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def replace_substring( - strings: _StringScalarT, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def replace_substring( - strings: _StringArrayT, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def replace_substring( - strings: Expression, - /, - pattern: str | bytes, - replacement: str | bytes, - *, - max_replacements: int | None = None, - options: ReplaceSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def replace_substring(*args, **kwargs): - """ - Replace matching non-overlapping substrings with replacement. - - For each string in `strings`, replace non-overlapping substrings that match - the given literal `pattern` with the given `replacement`. - If `max_replacements` is given and not equal to -1, it limits the - maximum amount replacements per input, counted from the left. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - replacement : str - What to replace the pattern with. - max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). - options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -replace_substring_regex = _clone_signature(replace_substring) -""" -Replace matching non-overlapping substrings with replacement. - -For each string in `strings`, replace non-overlapping substrings that match -the given regular expression `pattern` with the given `replacement`. -If `max_replacements` is given and not equal to -1, it limits the -maximum amount replacements per input, counted from the left. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -replacement : str - What to replace the pattern with. -max_replacements : int or None, default None - The maximum number of strings to replace in each - input value (unlimited if None). -options : pyarrow.compute.ReplaceSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def utf8_capitalize( - strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringScalarT: ... -@overload -def utf8_capitalize( - strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None -) -> _StringArrayT: ... -@overload -def utf8_capitalize( - strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def utf8_capitalize(*args, **kwargs): - """ - Capitalize the first character of input. - - For each string in `strings`, return a capitalized version, - with the first character uppercased and the others lowercased. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def utf8_length( - strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int32Scalar: ... -@overload -def utf8_length( - strings: lib.LargeStringScalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def utf8_length( - strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def utf8_length( - strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def utf8_length( - strings: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_length(*args, **kwargs): - """ - Compute UTF8 string lengths. - - For each string in `strings`, emit its length in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -utf8_lower = _clone_signature(utf8_capitalize) -""" -Transform input to lowercase. - -For each string in `strings`, return a lowercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" - -@overload -def utf8_replace_slice( - strings: _StringScalarT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def utf8_replace_slice( - strings: _StringArrayT, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def utf8_replace_slice( - strings: Expression, - /, - start: int, - stop: int, - replacement: str | bytes, - *, - options: ReplaceSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_replace_slice(*args, **kwargs): - """ - Replace a slice of a string. - - For each string in `strings`, replace a slice of the string defined by `start` - and `stop` indices with the given `replacement`. `start` is inclusive - and `stop` is exclusive, and both are measured in UTF8 characters. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int - Index to stop slicing at (exclusive). - replacement : str - What to replace the slice with. - options : pyarrow.compute.ReplaceSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -utf8_reverse = _clone_signature(utf8_capitalize) -""" -Reverse input. - -For each string in `strings`, return a reversed version. - -This function operates on Unicode codepoints, not grapheme -clusters. Hence, it will not correctly reverse grapheme clusters -composed of multiple codepoints. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_swapcase = _clone_signature(utf8_capitalize) -""" -Transform input lowercase characters to uppercase and uppercase characters to lowercase. - -For each string in `strings`, return an opposite case version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_title = _clone_signature(utf8_capitalize) -""" -Titlecase each word of input. - -For each string in `strings`, return a titlecased version. -Each word in the output will start with an uppercase character and its -remaining characters will be lowercase. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_upper = _clone_signature(utf8_capitalize) -""" -Transform input to uppercase. - -For each string in `strings`, return an uppercase version. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.12 String padding ========================= -@overload -def ascii_center( - strings: _StringScalarT, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_center( - strings: _StringArrayT, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_center( - strings: Expression, - /, - width: int, - padding: str = " ", - lean_left_on_odd_padding: bool = True, - *, - options: PadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_center(*args, **kwargs): - """ - Center strings by padding with a given character. - - For each string in `strings`, emit a centered string by padding both sides - with the given ASCII character. - Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - width : int - Desired string length. - padding : str, default " " - What to pad the string with. Should be one byte or codepoint. - lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). - options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given ASCII character. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options.
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_center = _clone_signature(ascii_center) -""" -Center strings by padding with a given character. - -For each string in `strings`, emit a centered string by padding both sides -with the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_lpad = _clone_signature(ascii_center) -""" -Right-align strings by padding with a given character. - -For each string in `strings`, emit a right-aligned string by prepending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rpad = _clone_signature(ascii_center) -""" -Left-align strings by padding with a given character. - -For each string in `strings`, emit a left-aligned string by appending -the given UTF8 codeunit. -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -width : int - Desired string length. -padding : str, default " " - What to pad the string with. Should be one byte or codepoint. -lean_left_on_odd_padding : bool, default True - What to do if there is an odd number of padding characters (in case - of centered padding). Defaults to aligning on the left (i.e. adding - the extra padding character on the right). -options : pyarrow.compute.PadOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.13 String trimming ========================= -@overload -def ascii_ltrim( - strings: _StringScalarT, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_ltrim( - strings: _StringArrayT, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_ltrim( - strings: Expression, - /, - characters: str, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_ltrim(*args, **kwargs): - """ - Trim leading characters. 
- - For each string in `strings`, remove any leading characters - from the `characters` option (as given in TrimOptions). - Null values emit null. - Both the `strings` and the `characters` are interpreted as - ASCII; to trim non-ASCII characters, use `utf8_ltrim`. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - characters : str - Individual characters to be trimmed from the string. - options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_rtrim`. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. -Both the `strings` and the `characters` are interpreted as -ASCII; to trim non-ASCII characters, use `utf8_trim`. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_ltrim = _clone_signature(ascii_ltrim) -""" -Trim leading characters. - -For each string in `strings`, remove any leading characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rtrim = _clone_signature(ascii_ltrim) -""" -Trim trailing characters. - -For each string in `strings`, remove any trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_trim = _clone_signature(ascii_ltrim) -""" -Trim leading and trailing characters. - -For each string in `strings`, remove any leading or trailing characters -from the `characters` option (as given in TrimOptions). -Null values emit null. 
- -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -characters : str - Individual characters to be trimmed from the string. -options : pyarrow.compute.TrimOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def ascii_ltrim_whitespace( - strings: _StringScalarT, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def ascii_ltrim_whitespace( - strings: _StringArrayT, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def ascii_ltrim_whitespace( - strings: Expression, - /, - *, - options: TrimOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_ltrim_whitespace(*args, **kwargs): - """ - Trim leading ASCII whitespace characters. - - For each string in `strings`, emit a string with leading ASCII whitespace - characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode - whitespace characters. Null values emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with trailing ASCII whitespace -characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing ASCII whitespace characters. - -For each string in `strings`, emit a string with leading and trailing ASCII -whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode -whitespace characters. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading whitespace characters. - -For each string in `strings`, emit a string with leading whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim trailing whitespace characters. - -For each string in `strings`, emit a string with trailing whitespace -characters removed, where whitespace characters are defined by the Unicode -standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" -utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) -""" -Trim leading and trailing whitespace characters. - -For each string in `strings`, emit a string with leading and trailing -whitespace characters removed, where whitespace characters are defined -by the Unicode standard. Null values emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.14 String splitting ========================= -@overload -def ascii_split_whitespace( - strings: _StringScalarT, - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringScalarT]: ... -@overload -def ascii_split_whitespace( - strings: lib.Array[lib.Scalar[_DataTypeT]], - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... -@overload -def ascii_split_whitespace( - strings: Expression, - /, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ascii_split_whitespace(*args, **kwargs): - """ - Split string according to any ASCII whitespace. - - Split each string according any non-zero length sequence of ASCII - whitespace characters. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def split_pattern( - strings: _StringOrBinaryScalarT, - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[_StringOrBinaryScalarT]: ... -@overload -def split_pattern( - strings: lib.Array[lib.Scalar[_DataTypeT]], - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitPatternOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... -@overload -def split_pattern( - strings: Expression, - /, - pattern: str, - *, - max_splits: int | None = None, - reverse: bool = False, - options: SplitPatternOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def split_pattern(*args, **kwargs): - """ - Split string according to separator. - - Split each string according to the exact `pattern` defined in - SplitPatternOptions. The output for each string input is a list - of strings. - - The maximum number of splits and direction of splitting - (forward, reverse) can optionally be defined in SplitPatternOptions. 
- - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - String pattern to split on. - max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). - reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. - options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -split_pattern_regex = _clone_signature(split_pattern) -""" -Split string according to regex pattern. - -Split each string according to the regex `pattern` defined in -SplitPatternOptions. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitPatternOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - String pattern to split on. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). -reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitPatternOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -utf8_split_whitespace = _clone_signature(ascii_split_whitespace) -""" -Split string according to any Unicode whitespace. - -Split each string according any non-zero length sequence of Unicode -whitespace characters. The output for each string input is a list -of strings. - -The maximum number of splits and direction of splitting -(forward, reverse) can optionally be defined in SplitOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -max_splits : int or None, default None - Maximum number of splits for each input value (unlimited if None). -reverse : bool, default False - Whether to start splitting from the end of each input value. - This only has an effect if `max_splits` is not None. -options : pyarrow.compute.SplitOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.15 String component extraction ========================= -@overload -def extract_regex( - strings: StringOrBinaryScalar, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: ... -@overload -def extract_regex( - strings: StringOrBinaryArray, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... -@overload -def extract_regex( - strings: Expression, - /, - pattern: str, - *, - options: ExtractRegexOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def extract_regex(*args, **kwargs): - """ - Extract substrings captured by a regex pattern. - - For each string in `strings`, match the regular expression and, if - successful, emit a struct with field names and values coming from the - regular expression's named capture groups. 
If the input is null or the - regular expression fails matching, a null output value is emitted. - - Regular expression matching is done using the Google RE2 library. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Regular expression with named capture fields. - options : pyarrow.compute.ExtractRegexOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.16 String join ========================= -def binary_join( - strings, separator, /, *, memory_pool: lib.MemoryPool | None = None -) -> StringScalar | StringArray: - """ - Join a list of strings together with a separator. - - Concatenate the strings in `list`. The `separator` is inserted - between each given string. - Any null input and any null `list` element emits a null output. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - separator : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def binary_join_element_wise( - *strings: _StringOrBinaryScalarT, - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryScalarT: ... -@overload -def binary_join_element_wise( - *strings: _StringOrBinaryArrayT, - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringOrBinaryArrayT: ... -@overload -def binary_join_element_wise( - *strings: Expression, - null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", - null_replacement: str = "", - options: JoinOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_join_element_wise(*args, **kwargs): - """ - Join string arguments together, with the last argument as separator. - - Concatenate the `strings` except for the last one. The last argument - in `strings` is inserted between each given string. - Any null separator element emits a null output. Null elements either - emit a null (the default), are skipped, or replaced with a given string. - - Parameters - ---------- - *strings : Array-like or scalar-like - Argument to compute function. - null_handling : str, default "emit_null" - How to handle null values in the inputs. - Accepted values are "emit_null", "skip", "replace". - null_replacement : str, default "" - Replacement string to emit for null inputs if `null_handling` - is "replace". - options : pyarrow.compute.JoinOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.17 String Slicing ========================= -@overload -def binary_slice( - strings: _BinaryScalarT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _BinaryScalarT: ... 
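# A hedged sketch of the join kernels (binary_join, binary_join_element_wise)
# defined just above, assuming pyarrow is available at runtime.
import pyarrow as pa
import pyarrow.compute as pc

lists = pa.array([["a", "b"], ["c", None], None])
pc.binary_join(lists, "-")  # StringArray: ["a-b", null, null]

# The last positional argument is the separator; null handling is configurable.
pc.binary_join_element_wise(
    pa.array(["x", "y"]), pa.array(["1", None]), "_", null_handling="skip"
)  # StringArray: ["x_1", "y"]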
-@overload -def binary_slice( - strings: _BinaryArrayT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _BinaryArrayT: ... -@overload -def binary_slice( - strings: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def binary_slice(*args, **kwargs): - """ - Slice binary string. - - For each binary string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - bytes. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def utf8_slice_codeunits( - strings: _StringScalarT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringScalarT: ... -@overload -def utf8_slice_codeunits( - strings: _StringArrayT, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _StringArrayT: ... -@overload -def utf8_slice_codeunits( - strings: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - *, - options: SliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def utf8_slice_codeunits(*args, **kwargs): - """ - Slice string. - - For each string in `strings`, emit the substring defined by - (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is - inclusive and `stop` is exclusive. All three values are measured in - UTF8 codeunits. - If `step` is negative, the string will be advanced in reversed order. - An error is raised if `step` is zero. - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing at (inclusive). - stop : int or None, default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. - step : int, default 1 - Slice step. - options : pyarrow.compute.SliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.18 Containment tests ========================= -@overload -def count_substring( - strings: lib.StringScalar | lib.BinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar: ... 
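# An illustrative sketch of the slicing and containment kernels in sections
# 2.17 and 2.18, assuming a pyarrow version that provides them: binary_slice
# counts bytes, utf8_slice_codeunits counts UTF8 units, and count_substring
# returns a 32- or 64-bit integer depending on the input's offset width.
import pyarrow as pa
import pyarrow.compute as pc

data = pa.array(["hello", "hollow", None])
pc.utf8_slice_codeunits(data, start=1, stop=4)           # StringArray: ["ell", "oll", null]
pc.count_substring(data, "ll")                           # Int32Array: [1, 1, null]
pc.binary_slice(pa.array([b"abcdef"]), start=2, stop=5)  # BinaryArray: [b"cde"]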
-@overload -def count_substring( - strings: lib.LargeStringScalar | lib.LargeBinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def count_substring( - strings: lib.StringArray - | lib.BinaryArray - | lib.ChunkedArray[lib.StringScalar] - | lib.ChunkedArray[lib.BinaryScalar], - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def count_substring( - strings: lib.LargeStringArray - | lib.LargeBinaryArray - | lib.ChunkedArray[lib.LargeStringScalar] - | lib.ChunkedArray[lib.LargeBinaryScalar], - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def count_substring( - strings: Expression, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def count_substring(*args, **kwargs): - """ - Count occurrences of substring. - - For each string in `strings`, emit the number of occurrences of the given - literal pattern. - Null inputs emit null. The pattern must be given in MatchSubstringOptions. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -count_substring_regex = _clone_signature(count_substring) -""" -Count occurrences of substring. - -For each string in `strings`, emit the number of occurrences of the given -regular expression pattern. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def ends_with( - strings: StringScalar | BinaryScalar, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def ends_with( - strings: StringArray | BinaryArray, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def ends_with( - strings: Expression, - /, - pattern: str, - *, - ignore_case: bool = False, - options: MatchSubstringOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ends_with(*args, **kwargs): - """ - Check if strings end with a literal pattern. - - For each string in `strings`, emit true iff it ends with a given pattern. 
- The pattern must be given in MatchSubstringOptions. - If ignore_case is set, only simple case folding is performed. - - Null inputs emit null. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - pattern : str - Substring pattern to look for inside input values. - ignore_case : bool, default False - Whether to perform a case-insensitive match. - options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -find_substring = _clone_signature(count_substring) -""" -Find first occurrence of substring. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -find_substring_regex = _clone_signature(count_substring) -""" -Find location of first match of regex pattern. - -For each string in `strings`, emit the index in bytes of the first occurrence -of the given literal pattern, or -1 if not found. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def index_in( - values: lib.Scalar, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Scalar: ... -@overload -def index_in( - values: lib.Array | lib.ChunkedArray, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def index_in( - values: Expression, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def index_in(*args, **kwargs): - """ - Return index of each element in a set of values. - - For each element in `values`, return its index in a given set of - values, or null if it is not found there. - The set of values to look for must be given in SetLookupOptions. - By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. 
- If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_in( - values: lib.Scalar, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def is_in( - values: lib.Array | lib.ChunkedArray, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_in( - values: Expression, - /, - value_set: lib.Array | lib.ChunkedArray, - *, - skip_nulls: bool = False, - options: SetLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_in(*args, **kwargs): - """ - Find each element in a set of values. - - For each element in `values`, return true if it is found in a given - set of values, false otherwise. - The set of values to look for must be given in SetLookupOptions. - By default, nulls are matched against the value set, this can be - changed in SetLookupOptions. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - value_set : Array - Set of values to look for in the input. - skip_nulls : bool, default False - If False, nulls in the input are matched in the value_set just - like regular values. - If True, nulls in the input always fail matching. - options : pyarrow.compute.SetLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -match_like = _clone_signature(ends_with) -""" -Match strings against SQL-style LIKE pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. '%' will match any number of characters, '_' will -match exactly one character, and any other character matches itself. -To match a literal '%', '_', or '\', precede the character with a backslash. -Null inputs emit null. The pattern must be given in MatchSubstringOptions. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -match_substring = _clone_signature(ends_with) -""" -Match strings against literal pattern. - -For each string in `strings`, emit true iff it contains a given pattern. -Null inputs emit null. -The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -match_substring_regex = _clone_signature(ends_with) -""" -Match strings against regex pattern. - -For each string in `strings`, emit true iff it matches a given pattern -at any position. The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -starts_with = _clone_signature(ends_with) -""" -Check if strings start with a literal pattern. - -For each string in `strings`, emit true iff it starts with a given pattern. -The pattern must be given in MatchSubstringOptions. -If ignore_case is set, only simple case folding is performed. - -Null inputs emit null. - -Parameters ----------- -strings : Array-like or scalar-like - Argument to compute function. -pattern : str - Substring pattern to look for inside input values. -ignore_case : bool, default False - Whether to perform a case-insensitive match. -options : pyarrow.compute.MatchSubstringOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.19 Categorizations ========================= -@overload -def is_finite( - values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_finite( - values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... -@overload -def is_finite( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def is_finite(*args, **kwargs): - """ - Return true if value is finite. - - For each input value, emit true iff the value is finite - (i.e. neither NaN, inf, nor -inf). - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -is_inf = _clone_signature(is_finite) -""" -Return true if infinity. - -For each input value, emit true iff the value is infinite (inf or -inf). - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -is_nan = _clone_signature(is_finite) -""" -Return true if NaN. - -For each input value, emit true iff the value is NaN. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def is_null( - values: lib.Scalar, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... 
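# A small hedged example of the categorization predicates typed above,
# assuming pyarrow at runtime; nulls propagate except where noted.
import pyarrow as pa
import pyarrow.compute as pc

vals = pa.array([1.0, float("inf"), float("nan"), None])
pc.is_finite(vals)                  # BooleanArray: [true, false, false, null]
pc.is_nan(vals)                     # BooleanArray: [false, false, true, null]
pc.is_null(vals, nan_is_null=True)  # BooleanArray: [false, false, true, true]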
-@overload -def is_null( - values: lib.Array | lib.ChunkedArray, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_null( - values: Expression, - /, - *, - nan_is_null: bool = False, - options: NullOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_null(*args, **kwargs): - """ - Return true if null (and optionally NaN). - - For each input value, emit true iff the value is null. - True may also be emitted for NaN values by setting the `nan_is_null` flag. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - nan_is_null : bool, default False - Whether floating-point NaN values are considered null. - options : pyarrow.compute.NullOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def is_valid( - values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_valid( - values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanArray: ... -@overload -def is_valid( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def is_valid(*args, **kwargs): - """ - Return true if non-null. - - For each input value, emit true iff the value is valid (i.e. non-null). - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -true_unless_null = _clone_signature(is_valid) -""" -Return true if non-null, else return null. - -For each input value, emit true iff the value -is valid (non-null), otherwise emit null. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -# ========================= 2.20 Selecting / multiplexing ========================= -def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): - """ - Choose values based on multiple conditions. - - `cond` must be a struct of Boolean values. `cases` can be a mix - of scalar and array arguments (of any type, but all must be the - same type or castable to a common type), with either exactly one - datum per child of `cond`, or one more `cases` than children of - `cond` (in which case we have an "else" value). - - Each row of the output will be the corresponding value of the - first datum in `cases` for which the corresponding child of `cond` - is true, or otherwise the "else" value (if given), or null. - - Essentially, this implements a switch-case or if-else, if-else... statement. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - *cases : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): - """ - Choose values from several arrays. - - For each row, the value of the first argument is used as a 0-based index - into the list of `values` arrays (i.e. index 0 selects the first of the - `values` arrays). 
The output value is the corresponding value of the - selected argument. - - If an index is null, the output will be null. - - Parameters - ---------- - indices : Array-like or scalar-like - Argument to compute function. - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def coalesce( - *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None -) -> _ScalarOrArrayT: - """ - Select the first non-null value. - - Each row of the output will be the value from the first corresponding input - for which the value is not null. If all inputs are null in a row, the output - will be null. - - Parameters - ---------- - *values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -fill_null = coalesce -"""Replace each null element in values with a corresponding -element from fill_value. - -If fill_value is scalar-like, then every null element in values -will be replaced with fill_value. If fill_value is array-like, -then the i-th element in values will be replaced with the i-th -element in fill_value. - -The fill_value's type must be the same as that of values, or it -must be able to be implicitly casted to the array's type. - -This is an alias for :func:`coalesce`. - -Parameters ----------- -values : Array, ChunkedArray, or Scalar-like object - Each null element is replaced with the corresponding value - from fill_value. -fill_value : Array, ChunkedArray, or Scalar-like object - If not same type as values, will attempt to cast. - -Returns -------- -result : depends on inputs - Values with all null elements replaced - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array([1, 2, None, 3], type=pa.int8()) ->>> fill_value = pa.scalar(5, type=pa.int8()) ->>> arr.fill_null(fill_value) - -[ - 1, - 2, - 5, - 3 -] ->>> arr = pa.array([1, 2, None, 4, None]) ->>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) - -[ - 1, - 2, - 30, - 4, - 50 -] -""" - -def if_else( - cond: ArrayLike | ScalarLike, - left: ArrayLike | ScalarLike, - right: ArrayLike | ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> ArrayLike | ScalarLike: - """ - Choose values based on a condition. - - `cond` must be a Boolean scalar/ array. - `left` or `right` must be of the same type scalar/ array. - `null` values in `cond` will be promoted to the output. - - Parameters - ---------- - cond : Array-like or scalar-like - Argument to compute function. - left : Array-like or scalar-like - Argument to compute function. - right : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.21 Structural transforms ========================= - -@overload -def list_value_length( - lists: _ListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array: ... -@overload -def list_value_length( - lists: _LargeListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def list_value_length( - lists: ListArray[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int32Array | lib.Int64Array: ... 
-@overload -def list_value_length( - lists: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def list_value_length(*args, **kwargs): - """ - Compute list lengths. - - `lists` must have a list-like type. - For each non-null value in `lists`, its length is emitted. - Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def make_struct( - *args: lib.Scalar, - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructScalar: ... -@overload -def make_struct( - *args: lib.Array | lib.ChunkedArray, - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StructArray: ... -@overload -def make_struct( - *args: Expression, - field_names: list[str] | tuple[str, ...] = (), - field_nullability: bool | None = None, - field_metadata: list[lib.KeyValueMetadata] | None = None, - options: MakeStructOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def make_struct(*args, **kwargs): - """ - Wrap Arrays into a StructArray. - - Names of the StructArray's fields are - specified through MakeStructOptions. - - Parameters - ---------- - *args : Array-like or scalar-like - Argument to compute function. - field_names : sequence of str - Names of the struct fields to create. - field_nullability : sequence of bool, optional - Nullability information for each struct field. - If omitted, all fields are nullable. - field_metadata : sequence of KeyValueMetadata, optional - Metadata for each struct field. - options : pyarrow.compute.MakeStructOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.22 Conversions ========================= -@overload -def ceil_temporal( - timestamps: _TemporalScalarT, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _TemporalScalarT: ... -@overload -def ceil_temporal( - timestamps: _TemporalArrayT, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _TemporalArrayT: ... 
-@overload -def ceil_temporal( - timestamps: Expression, - /, - multiple: int = 1, - unit: Literal[ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - "nanosecond", - ] = "day", - *, - week_starts_monday: bool = True, - ceil_is_strictly_greater: bool = False, - calendar_based_origin: bool = False, - options: RoundTemporalOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def ceil_temporal(*args, **kwargs): - """ - Round temporal values up to nearest multiple of specified time unit. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - multiple : int, default 1 - Number of units to round to. - unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. - calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. - options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -floor_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values down to nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. -multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". 
-week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -round_temporal = _clone_signature(ceil_temporal) -""" -Round temporal values to the nearest multiple of specified time unit. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -timestamps : Array-like or scalar-like - Argument to compute function. -multiple : int, default 1 - Number of units to round to. -unit : str, default "day" - The unit in which `multiple` is expressed. - Accepted values are "year", "quarter", "month", "week", "day", - "hour", "minute", "second", "millisecond", "microsecond", - "nanosecond". -week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. -ceil_is_strictly_greater : bool, default False - If True, ceil returns a rounded value that is strictly greater than the - input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would - yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 - if set to False. - This applies to the ceil_temporal function only. -calendar_based_origin : bool, default False - By default, the origin is 1970-01-01T00:00:00. By setting this to True, - rounding origin will be beginning of one less precise calendar unit. - E.g.: rounding to hours will use beginning of day as origin. - - By default time is rounded to a multiple of units since - 1970-01-01T00:00:00. By setting calendar_based_origin to true, - time will be rounded to number of units since the last greater - calendar unit. - For example: rounding to multiple of days since the beginning of the - month or to hours since the beginning of the day. - Exceptions: week and quarter are not used as greater units, - therefore days will be rounded to the beginning of the month not - week. 
Greater unit of week is a year. - Note that ceiling and rounding might change sorting order of an array - near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to - 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to - YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will - ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the - order of an already ordered array. -options : pyarrow.compute.RoundTemporalOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def cast( - arr: lib.Scalar, - target_type: _DataTypeT, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Scalar[_DataTypeT]: ... -@overload -def cast( - arr: lib.Array, - target_type: _DataTypeT, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[lib.Scalar[_DataTypeT]]: ... -@overload -def cast( - arr: lib.ChunkedArray, - target_type: _DataTypeT, - safe: bool | None = None, - options: CastOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... -def cast(*args, **kwargs): - """ - Cast array values to another data type. Can also be invoked as an array - instance method. - - Parameters - ---------- - arr : Array-like - target_type : DataType or str - Type to cast to - safe : bool, default True - Check for overflows or other unsafe conversions - options : CastOptions, default None - Additional checks pass by CastOptions - memory_pool : MemoryPool, optional - memory pool to use for allocations during function execution. - - Examples - -------- - >>> from datetime import datetime - >>> import pyarrow as pa - >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) - >>> arr.type - TimestampType(timestamp[us]) - - You can use ``pyarrow.DataType`` objects to specify the target type: - - >>> cast(arr, pa.timestamp("ms")) - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - - >>> cast(arr, pa.timestamp("ms")).type - TimestampType(timestamp[ms]) - - Alternatively, it is also supported to use the string aliases for these - types: - - >>> arr.cast("timestamp[ms]") - - [ - 2010-01-01 00:00:00.000, - 2015-01-01 00:00:00.000 - ] - >>> arr.cast("timestamp[ms]").type - TimestampType(timestamp[ms]) - - Returns - ------- - casted : Array - The cast result as a new Array - """ - -@overload -def strftime( - timestamps: TemporalScalar, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StringScalar: ... -@overload -def strftime( - timestamps: TemporalArray, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.StringArray: ... -@overload -def strftime( - timestamps: Expression, - /, - format: str = "%Y-%m-%dT%H:%M:%S", - locale: str = "C", - *, - options: StrftimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def strftime(*args, **kwargs): - """ - Format temporal values according to a format string. - - For each input value, emit a formatted string. - The time format string and locale can be set using StrftimeOptions. 
- The output precision of the "%S" (seconds) format code depends on - the input time precision: it is an integer for timestamps with - second precision, a real number with the required number of fractional - digits for higher precisions. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database, or if the specified locale - does not exist on this system. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - format : str, default "%Y-%m-%dT%H:%M:%S" - Pattern for formatting input values. - locale : str, default "C" - Locale to use for locale-specific format specifiers. - options : pyarrow.compute.StrftimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def strptime( - strings: StringScalar, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar: ... -@overload -def strptime( - strings: StringArray, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... -@overload -def strptime( - strings: Expression, - /, - format: str, - unit: Literal["s", "ms", "us", "ns"], - error_is_null: bool = False, - *, - options: StrptimeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def strptime(*args, **kwargs): - """ - Parse timestamps. - - For each string in `strings`, parse it as a timestamp. - The timestamp unit and the expected string pattern must be given - in StrptimeOptions. Null inputs emit null. If a non-null string - fails parsing, an error is returned by default. - - Parameters - ---------- - strings : Array-like or scalar-like - Argument to compute function. - format : str - Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". - Note that the semantics of the format follow the C/C++ strptime, not the Python one. - There are differences in behavior, for example how the "%y" placeholder - handles years with less than four digits. - unit : str - Timestamp unit of the output. - Accepted values are "s", "ms", "us", "ns". - error_is_null : boolean, default False - Return null on parsing errors if true or raise if false. - options : pyarrow.compute.StrptimeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.23 Temporal component extraction ========================= -@overload -def day( - values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar: ... -@overload -def day( - values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Array: ... -@overload -def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def day(*args, **kwargs): - """ - Extract day number. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. 
- memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def day_of_week( - values: TemporalScalar, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def day_of_week( - values: TemporalArray, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def day_of_week( - values: Expression, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def day_of_week(*args, **kwargs): - """ - Extract day of the week number. - - By default, the week starts on Monday represented by 0 and ends on Sunday - represented by 6. - `DayOfWeekOptions.week_start` can be used to set another starting day using - the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). - Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -day_of_year = _clone_signature(day) -""" -Extract day of year number. - -January 1st maps to day number 1, February 1st to 32, etc. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def hour( - values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def hour( - values: lib.TimestampArray[Any] - | lib.Time32Array[Any] - | lib.Time64Array[Any] - | lib.ChunkedArray[lib.TimestampScalar[Any]] - | lib.ChunkedArray[lib.Time32Scalar[Any]] - | lib.ChunkedArray[lib.Time64Scalar[Any]], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def hour( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def hour(*args, **kwargs): - """ - Extract hour value. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
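- - Examples - -------- - A minimal illustrative sketch (assumes pyarrow is importable as ``pa`` and this module as ``pc``; exact reprs may vary across pyarrow versions): - - >>> from datetime import datetime - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> pc.hour(pa.array([datetime(2023, 1, 1, 14, 30)]))[0].as_py() - 14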
- """ - -@overload -def is_dst( - values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.BooleanScalar: ... -@overload -def is_dst( - values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -def is_dst(*args, **kwargs): - """ - Extracts if currently observing daylight savings. - - IsDaylightSavings returns true if a timestamp has a daylight saving - offset in the given timezone. - Null values emit null. - An error is returned if the values do not have a defined timezone. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def iso_week( - values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar: ... -@overload -def iso_week( - values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def iso_week( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def iso_week(*args, **kwargs): - """ - Extract ISO week of year number. - - First ISO week has the majority (4 or more) of its days in January. - ISO week starts on Monday. The week number starts with 1 and can run - up to 53. - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -iso_year = _clone_signature(iso_week) -""" -Extract ISO year number. - -First week of an ISO year has the majority (4 or more) of its days in January. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def is_leap_year( - values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanScalar: ... -@overload -def is_leap_year( - values: lib.TimestampArray - | lib.Date32Array - | lib.Date64Array - | lib.ChunkedArray[lib.TimestampScalar] - | lib.ChunkedArray[lib.Date32Scalar] - | lib.ChunkedArray[lib.Date64Scalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.BooleanArray: ... -@overload -def is_leap_year( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def is_leap_year(*args, **kwargs): - """ - Extract if year is a leap year. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -microsecond = _clone_signature(iso_week) -""" -Extract microsecond values. - -Microsecond returns number of microseconds since the last full millisecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -millisecond = _clone_signature(iso_week) -""" -Extract millisecond values. - -Millisecond returns number of milliseconds since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -minute = _clone_signature(iso_week) -""" -Extract minute values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -month = _clone_signature(day_of_week) -""" -Extract month number. - -Month is encoded as January=1, December=12. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -nanosecond = _clone_signature(hour) -""" -Extract nanosecond values. - -Nanosecond returns number of nanoseconds since the last full microsecond. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -quarter = _clone_signature(day_of_week) -""" -Extract quarter of year number. - -First quarter maps to 1 and forth quarter maps to 4. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -second = _clone_signature(hour) -""" -Extract second values. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -subsecond = _clone_signature(hour) -""" -Extract subsecond values. - -Subsecond returns the fraction of a second since the last full second. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. 
- -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -us_week = _clone_signature(iso_week) -""" -Extract US week of year number. - -First US week has the majority (4 or more) of its days in January. -US week starts on Monday. The week number starts with 1 and can run -up to 53. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -us_year = _clone_signature(iso_week) -""" -Extract US epidemiological year number. - -First week of US epidemiological year has the majority (4 or more) of -its days in January. Last week of US epidemiological year has the -year's last Wednesday in it. US epidemiological week starts on Sunday. -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -year = _clone_signature(iso_week) -""" -Extract year number. - -Null values emit null. -An error is returned if the values have a defined timezone but it -cannot be found in the timezone database. - -Parameters ----------- -values : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -@overload -def week( - values: lib.TimestampScalar, - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar: ... -@overload -def week( - values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Array: ... -@overload -def week( - values: Expression, - /, - *, - week_starts_monday: bool = True, - count_from_zero: bool = False, - first_week_is_fully_in_year: bool = False, - options: WeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def week(*args, **kwargs): - """ - Extract week of year number. - - First week has the majority (4 or more) of its days in January. - Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using - DayOfWeekOptions.count_from_zero. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - week_starts_monday : bool, default True - If True, weeks start on Monday; if False, on Sunday. - count_from_zero : bool, default False - If True, dates at the start of a year that fall into the last week - of the previous year emit 0. - If False, they emit 52 or 53 (the week number of the last week - of the previous year).
- first_week_is_fully_in_year : bool, default False - If True, week number 0 is fully in January. - If False, a week that begins on December 29, 30 or 31 is considered - to be week number 0 of the following year. - options : pyarrow.compute.WeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def year_month_day( - values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructScalar: ... -@overload -def year_month_day( - values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructArray: ... -@overload -def year_month_day( - values: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def year_month_day(*args, **kwargs): - """ - Extract (year, month, day) struct. - - Null values emit null. - An error is returned if the values have a defined timezone but it - cannot be found in the timezone database. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 2.24 Temporal difference ========================= -def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of days and milliseconds between two timestamps. - - Returns the number of days and milliseconds from `start` to `end`. - That is, first the difference in days is computed as if both - timestamps were truncated to the day, then the difference between the times - of the two timestamps is computed as if both times were truncated to the - millisecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def days_between( - start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of days between two timestamps. - - Returns the number of day boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the day. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -hours_between = _clone_signature(days_between) -""" -Compute the number of hours between two timestamps. - -Returns the number of hour boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the hour. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -microseconds_between = _clone_signature(days_between) -""" -Compute the number of microseconds between two timestamps. - -Returns the number of microsecond boundaries crossed from `start` to `end`.
-That is, the difference is calculated as if the timestamps were -truncated to the microsecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -milliseconds_between = _clone_signature(days_between) -""" -Compute the number of millisecond boundaries between two timestamps. - -Returns the number of millisecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the millisecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -minutes_between = _clone_signature(days_between) -""" -Compute the number of minute boundaries between two timestamps. - -Returns the number of minute boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the minute. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def month_day_nano_interval_between( - start, end, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: - """ - Compute the number of months, days and nanoseconds between two timestamps. - - Returns the number of months, days, and nanoseconds from `start` to `end`. - That is, first the difference in months is computed as if both timestamps - were truncated to the months, then the difference between the days - is computed, and finally the difference between the times of the two - timestamps is computed as if both times were truncated to the nanosecond. - Null values return null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Compute the number of months between two timestamps. - - Returns the number of month boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the month. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool.
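- - Examples - -------- - An illustrative sketch (assumes pyarrow is importable as ``pa`` and this module as ``pc``; the result is a month-interval array whose repr varies by pyarrow version, so it is only bound to a name here): - - >>> from datetime import date - >>> import pyarrow as pa - >>> import pyarrow.compute as pc - >>> start = pa.array([date(2022, 1, 15)]) - >>> end = pa.array([date(2022, 4, 1)]) - >>> months = pc.month_interval_between(start, end)  # three month boundaries crossed, e.g. [3]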
- """ - -nanoseconds_between = _clone_signature(days_between) -""" -Compute the number of nanoseconds between two timestamps. - -Returns the number of nanosecond boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the nanosecond. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -quarters_between = _clone_signature(days_between) -""" -Compute the number of quarters between two timestamps. - -Returns the number of quarter start boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the quarter. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -seconds_between = _clone_signature(days_between) -""" -Compute the number of seconds between two timestamps. - -Returns the number of second boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the second. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" - -def weeks_between( - start, - end, - /, - *, - count_from_zero: bool = True, - week_start: int = 1, - options: DayOfWeekOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Int64Scalar | lib.Int64Array: - """ - Compute the number of weeks between two timestamps. - - Returns the number of week boundaries crossed from `start` to `end`. - That is, the difference is calculated as if the timestamps were - truncated to the week. - Null values emit null. - - Parameters - ---------- - start : Array-like or scalar-like - Argument to compute function. - end : Array-like or scalar-like - Argument to compute function. - count_from_zero : bool, default True - If True, number days from 0, otherwise from 1. - week_start : int, default 1 - Which day does the week start with (Monday=1, Sunday=7). - How this value is numbered is unaffected by `count_from_zero`. - options : pyarrow.compute.DayOfWeekOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -years_between = _clone_signature(days_between) -""" -Compute the number of years between two timestamps. - -Returns the number of year boundaries crossed from `start` to `end`. -That is, the difference is calculated as if the timestamps were -truncated to the year. -Null values emit null. - -Parameters ----------- -start : Array-like or scalar-like - Argument to compute function. -end : Array-like or scalar-like - Argument to compute function. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" - -# ========================= 2.25 Timezone handling ========================= -@overload -def assume_timezone( - timestamps: lib.TimestampScalar, - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampScalar: ... -@overload -def assume_timezone( - timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... -@overload -def assume_timezone( - timestamps: Expression, - /, - timezone: str, - *, - ambiguous: Literal["raise", "earliest", "latest"] = "raise", - nonexistent: Literal["raise", "earliest", "latest"] = "raise", - options: AssumeTimezoneOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def assume_timezone(*args, **kwargs): - """ - Convert naive timestamp to timezone-aware timestamp. - - Input timestamps are assumed to be relative to the timezone given in the - `timezone` option. They are converted to UTC-relative timestamps and - the output type has its timezone set to the value of the `timezone` - option. Null values emit null. - This function is meant to be used when an external system produces - "timezone-naive" timestamps which need to be converted to - "timezone-aware" timestamps. An error is returned if the timestamps - already have a defined timezone. - - Parameters - ---------- - timestamps : Array-like or scalar-like - Argument to compute function. - timezone : str - Timezone to assume for the input. - ambiguous : str, default "raise" - How to handle timestamps that are ambiguous in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - nonexistent : str, default "raise" - How to handle timestamps that don't exist in the assumed timezone. - Accepted values are "raise", "earliest", "latest". - options : pyarrow.compute.AssumeTimezoneOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def local_timestamp( - timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.TimestampScalar: ... -@overload -def local_timestamp( - timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.TimestampArray: ... -@overload -def local_timestamp( - timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -def local_timestamp(*args, **kwargs): - """ - Convert timestamp to a timezone-naive local time timestamp. - - LocalTimestamp converts timezone-aware timestamp to local timestamp - of the given timestamp's timezone and removes timezone metadata. - Alternative name for this timestamp is also wall clock time. - If input is in UTC or without timezone, then unchanged input values - without timezone metadata are returned. - Null values emit null. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -# ========================= 2.26 Random number generation ========================= -def random( - n: int, - *, - initializer: Literal["system"] | int = "system", - options: RandomOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.DoubleArray: - """ - Generate numbers in the range [0, 1). - - Generated values are uniformly-distributed, double-precision - in range [0, 1). Algorithm and seed can be changed via RandomOptions. - - Parameters - ---------- - n : int - Number of values to generate, must be greater than or equal to 0 - initializer : int or str - How to initialize the underlying random generator. - If an integer is given, it is used as a seed. - If "system" is given, the random generator is initialized with - a system-specific source of (hopefully true) randomness. - Other values are invalid. - options : pyarrow.compute.RandomOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3. Array-wise (“vector”) functions ========================= - -# ========================= 3.1 Cumulative Functions ========================= -@overload -def cumulative_sum( - values: _NumericArrayT, - /, - start: lib.Scalar | None = None, - *, - skip_nulls: bool = False, - options: CumulativeSumOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericArrayT: ... -@overload -def cumulative_sum( - values: Expression, - /, - start: lib.Scalar | None = None, - *, - skip_nulls: bool = False, - options: CumulativeSumOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def cumulative_sum(*args, **kwargs): - """ - Compute the cumulative sum over a numeric input. - - `values` must be numeric. Return an array/chunked array which is the - cumulative sum computed over `values`. Results will wrap around on - integer overflow. Use function "cumulative_sum_checked" if you want - overflow to return an error. The default start is 0. - - Parameters - ---------- - values : Array-like - Argument to compute function. - start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. - skip_nulls : bool, default False - When false, the first encountered null is propagated. - options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -cumulative_sum_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative sum over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative sum computed over `values`. This function returns an error -on overflow. For a variant that doesn't fail on overflow, use -function "cumulative_sum". The default start is 0. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
-""" -cumulative_prod = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. Results will wrap around on -integer overflow. Use function "cumulative_prod_checked" if you want -overflow to return an error. The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_prod_checked = _clone_signature(cumulative_sum) -""" -Compute the cumulative product over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative product computed over `values`. This function returns an error -on overflow. For a variant that doesn't fail on overflow, use -function "cumulative_prod". The default start is 1. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_max = _clone_signature(cumulative_sum) -""" -Compute the cumulative max over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative max computed over `values`. The default start is the minimum -value of input type (so that any other value will replace the -start as the new maximum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_min = _clone_signature(cumulative_sum) -""" -Compute the cumulative min over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative min computed over `values`. The default start is the maximum -value of input type (so that any other value will replace the -start as the new minimum). - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. 
-memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -cumulative_mean = _clone_signature(cumulative_sum) -""" -Compute the cumulative mean over a numeric input. - -`values` must be numeric. Return an array/chunked array which is the -cumulative mean computed over `values`. - -Parameters ----------- -values : Array-like - Argument to compute function. -start : Scalar, default None - Starting value for the cumulative operation. If none is given, - a default value depending on the operation and input type is used. -skip_nulls : bool, default False - When false, the first encountered null is propagated. -options : pyarrow.compute.CumulativeOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" -# ========================= 3.2 Associative transforms ========================= - -@overload -def dictionary_encode( - array: _ScalarOrArrayT, - /, - null_encoding: Literal["mask", "encode"] = "mask", - *, - options=None, - memory_pool: lib.MemoryPool | None = None, -) -> _ScalarOrArrayT: ... -@overload -def dictionary_encode( - array: Expression, - /, - null_encoding: Literal["mask", "encode"] = "mask", - *, - options=None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... -@overload -def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... -@overload -def value_counts( - array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.StructArray: ... -@overload -def value_counts( - array: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... - -# ========================= 3.3 Selections ========================= -@overload -def array_filter( - array: _ArrayT, - selection_filter: list[bool] | list[bool | None] | BooleanArray, - /, - null_selection_behavior: Literal["drop", "emit_null"] = "drop", - *, - options: FilterOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ArrayT: ... -@overload -def array_filter( - array: Expression, - selection_filter: list[bool] | list[bool | None] | BooleanArray, - /, - null_selection_behavior: Literal["drop", "emit_null"] = "drop", - *, - options: FilterOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def array_take( - array: _ArrayT, - indices: list[int] - | list[int | None] - | lib.Int16Array - | lib.Int32Array - | lib.Int64Array - | lib.ChunkedArray[lib.Int16Scalar] - | lib.ChunkedArray[lib.Int32Scalar] - | lib.ChunkedArray[lib.Int64Scalar], - /, - *, - boundscheck: bool = True, - options: TakeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _ArrayT: ... -@overload -def array_take( - array: Expression, - indices: list[int] - | list[int | None] - | lib.Int16Array - | lib.Int32Array - | lib.Int64Array - | lib.ChunkedArray[lib.Int16Scalar] - | lib.ChunkedArray[lib.Int32Scalar] - | lib.ChunkedArray[lib.Int64Scalar], - /, - *, - boundscheck: bool = True, - options: TakeOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ...
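# Illustrative usage sketch (not part of the stub file or the original patch):
# how the cumulative/encoding/selection functions annotated above are expected
# to behave at runtime, assuming pyarrow is installed. Concrete arrays go in,
# concrete arrays of the matching type come out, per the overloads.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 3, 4])

pc.cumulative_sum(arr)                       # Int64Array: [1, 3, 6, 10]
pc.value_counts(pa.array(["a", "b", "a"]))   # StructArray with "values"/"counts" fields
pc.array_take(arr, pa.array([3, 0, None]))   # [4, 1, null]; a null index emits null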
-@overload -def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... -@overload -def drop_null( - input: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... - -filter = array_filter -take = array_take -""" -Select values (or records) from array- or table-like data given integer -selection indices. - -The result will be of the same type(s) as the input, with elements taken -from the input array (or record batch / table fields) at the given -indices. If an index is null then the corresponding value in the output -will be null. - -Parameters ----------- -data : Array, ChunkedArray, RecordBatch, or Table -indices : Array, ChunkedArray - Must be of integer type -boundscheck : boolean, default True - Whether to boundscheck the indices. If False and there is an out of - bounds index, will likely cause the process to crash. -memory_pool : MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - -Returns -------- -result : depends on inputs - Selected values for the given indices - -Examples --------- ->>> import pyarrow as pa ->>> arr = pa.array(["a", "b", "c", None, "e", "f"]) ->>> indices = pa.array([0, None, 4, 3]) ->>> arr.take(indices) - -[ - "a", - null, - "e", - null -] -""" - -# ========================= 3.4 Containment tests ========================= -@overload -def indices_nonzero( - values: lib.BooleanArray - | lib.NullArray - | NumericArray - | lib.Decimal128Array - | lib.Decimal256Array, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def indices_nonzero( - values: Expression, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def indices_nonzero(*args, **kwargs): - """ - Return the indices of the values in the array that are non-zero. - - For each input value, check if it's zero, false or null. Emit the index - of the value in the array if it's none of the those. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3.5 Sorts and partitions ========================= -@overload -def array_sort_indices( - array: lib.Array | lib.ChunkedArray, - /, - order: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - options: ArraySortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def array_sort_indices( - array: Expression, - /, - order: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - options: ArraySortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def array_sort_indices(*args, **kwargs): - """ - Return the indices that would sort an array. - - This function computes an array of indices that define a stable sort - of the input array. By default, Null values are considered greater - than any other value and are therefore sorted at the end of the array. - For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in ArraySortOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - order : str, default "ascending" - Which order to sort values in. - Accepted values are "ascending", "descending". 
- null_placement : str, default "at_end" - Where nulls in the input should be sorted. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.ArraySortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def partition_nth_indices( - array: lib.Array | lib.ChunkedArray, - /, - pivot: int, - *, - null_placement: _Placement = "at_end", - options: PartitionNthOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def partition_nth_indices( - array: Expression, - /, - pivot: int, - *, - null_placement: _Placement = "at_end", - options: PartitionNthOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def partition_nth_indices(*args, **kwargs): - """ - Return the indices that would partition an array around a pivot. - - This functions computes an array of indices that define a non-stable - partial sort of the input array. - - The output is such that the `N`'th index points to the `N`'th element - of the input in sorted order, and all indices before the `N`'th point - to elements in the input less or equal to elements at or after the `N`'th. - - By default, null values are considered greater than any other value - and are therefore partitioned towards the end of the array. - For floating-point types, NaNs are considered greater than any - other non-null value, but smaller than null values. - - The pivot index `N` must be given in PartitionNthOptions. - The handling of nulls and NaNs can also be changed in PartitionNthOptions. - - Parameters - ---------- - array : Array-like - Argument to compute function. - pivot : int - Index into the equivalent sorted array of the pivot element. - null_placement : str, default "at_end" - Where nulls in the input should be partitioned. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.PartitionNthOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def rank( - input: lib.Array | lib.ChunkedArray, - /, - sort_keys: _Order = "ascending", - *, - null_placement: _Placement = "at_end", - tiebreaker: Literal["min", "max", "first", "dense"] = "first", - options: RankOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: - """ - Compute ordinal ranks of an array (1-based). - - This function computes a rank of the input array. - By default, null values are considered greater than any other value and - are therefore sorted at the end of the input. For floating-point types, - NaNs are considered greater than any other non-null value, but smaller - than null values. The default tiebreaker is to assign ranks in order of - when ties appear in the input. - - The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples or str, default "ascending" - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - Alternatively, one can simply pass "ascending" or "descending" as a string - if the input is array-like. 
- null_placement : str, default "at_end" - Where nulls in input should be sorted. - Accepted values are "at_start", "at_end". - tiebreaker : str, default "first" - Configure how ties between equal values are handled. - Accepted values are: - - - "min": Ties get the smallest possible rank in sorted order. - - "max": Ties get the largest possible rank in sorted order. - - "first": Ranks are assigned in order of when ties appear in the - input. This ensures the ranks are a stable permutation - of the input. - - "dense": The ranks span a dense [1, M] interval where M is the - number of distinct values in the input. - options : pyarrow.compute.RankOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def select_k_unstable( - input: lib.Array | lib.ChunkedArray, - /, - k: int, - sort_keys: list[tuple[str, _Order]], - *, - options: SelectKOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def select_k_unstable( - input: Expression, - /, - k: int, - sort_keys: list[tuple[str, _Order]], - *, - options: SelectKOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def select_k_unstable(*args, **kwargs): - """ - Select the indices of the first `k` ordered elements from the input. - - This function selects an array of indices of the first `k` ordered elements - from the `input` array, record batch or table specified in the column keys - (`options.sort_keys`). Output is not guaranteed to be stable. - Null values are considered greater than any other value and are - therefore ordered at the end. For floating-point types, NaNs are considered - greater than any other non-null value, but smaller than null values. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - k : int - Number of leading values to select in sorted order - (i.e. the largest values if sort order is "descending", - the smallest otherwise). - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - options : pyarrow.compute.SelectKOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def sort_indices( - input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, - /, - sort_keys: Sequence[tuple[str, _Order]] = (), - *, - null_placement: _Placement = "at_end", - options: SortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.UInt64Array: ... -@overload -def sort_indices( - input: Expression, - /, - sort_keys: Sequence[tuple[str, _Order]] = (), - *, - null_placement: _Placement = "at_end", - options: SortOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def sort_indices(*args, **kwargs): - """ - Return the indices that would sort an array, record batch or table. - - This function computes an array of indices that define a stable sort - of the input array, record batch or table. By default, null values are - considered greater than any other value and are therefore sorted at the - end of the input. 
For floating-point types, NaNs are considered greater - than any other non-null value, but smaller than null values. - - The handling of nulls and NaNs can be changed in SortOptions. - - Parameters - ---------- - input : Array-like or scalar-like - Argument to compute function. - sort_keys : sequence of (name, order) tuples - Names of field/column keys to sort the input on, - along with the order each field/column is sorted in. - Accepted values for `order` are "ascending", "descending". - The field name can be a string column name or expression. - null_placement : str, default "at_end" - Where nulls in input should be sorted, only applying to - columns/fields mentioned in `sort_keys`. - Accepted values are "at_start", "at_end". - options : pyarrow.compute.SortOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3.6 Structural transforms ========================= -@overload -def list_element( - lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def list_element( - lists: lib.Array[ListScalar[_DataTypeT]], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.Array[lib.Scalar[_DataTypeT]]: ... -@overload -def list_element( - lists: lib.ChunkedArray[ListScalar[_DataTypeT]], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... -@overload -def list_element( - lists: ListScalar[_DataTypeT], - index: ScalarLike, - /, - *, - memory_pool: lib.MemoryPool | None = None, -) -> _DataTypeT: ... -def list_element(*args, **kwargs): - """ - Compute elements using of nested list values using an index. - - `lists` must have a list-like type. - For each value in each list of `lists`, the element at `index` - is emitted. Null values emit a null in the output. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - index : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def list_flatten( - lists: Expression, - /, - recursive: bool = False, - *, - options: ListFlattenOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def list_flatten( - lists: ArrayOrChunkedArray[ListScalar[Any]], - /, - recursive: bool = False, - *, - options: ListFlattenOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any]: ... -def list_flatten(*args, **kwargs): - """ - Flatten list values. - - `lists` must have a list-like type (lists, list-views, and - fixed-size lists). - Return an array with the top list level flattened unless - `recursive` is set to true in ListFlattenOptions. When that - is that case, flattening happens recursively until a non-list - array is formed. - - Null list values do not emit anything to the output. - - Parameters - ---------- - lists : Array-like - Argument to compute function. - recursive : bool, default False - When True, the list array is flattened recursively until an array - of non-list values is formed. - options : pyarrow.compute.ListFlattenOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. 
- """ - -@overload -def list_parent_indices( - lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None -) -> Expression: ... -@overload -def list_parent_indices( - lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None -) -> lib.Int64Array: ... -def list_parent_indices(*args, **kwargs): - """ - Compute parent indices of nested list values. - - `lists` must have a list-like or list-view type. - For each value in each list of `lists`, the top-level list index - is emitted. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -@overload -def list_slice( - lists: Expression, - /, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - *, - options: ListSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -@overload -def list_slice( - lists: ArrayOrChunkedArray[Any], - /, - start: int, - stop: int | None = None, - step: int = 1, - return_fixed_size_list: bool | None = None, - *, - options: ListSliceOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> lib.ListArray[Any]: ... -def list_slice(*args, **kwargs): - """ - Compute slice of list-like array. - - `lists` must have a list-like type. - For each list element, compute a slice, returning a new list array. - A variable or fixed size list array is returned, depending on options. - - Parameters - ---------- - lists : Array-like or scalar-like - Argument to compute function. - start : int - Index to start slicing inner list elements (inclusive). - stop : Optional[int], default None - If given, index to stop slicing at (exclusive). - If not given, slicing will stop at the end. (NotImplemented) - step : int, default 1 - Slice step. - return_fixed_size_list : Optional[bool], default None - Whether to return a FixedSizeListArray. If true _and_ stop is after - a list element's length, nulls will be appended to create the - requested slice size. The default of `None` will return the same - type which was passed in. - options : pyarrow.compute.ListSliceOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def map_lookup( - container, - /, - query_key, - occurrence: str, - *, - options: MapLookupOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -): - """ - Find the items corresponding to a given key in a Map. - - For a given query key (passed via MapLookupOptions), extract - either the FIRST, LAST or ALL items from a Map that have - matching keys. - - Parameters - ---------- - container : Array-like or scalar-like - Argument to compute function. - query_key : Scalar or Object can be converted to Scalar - The key to search for. - occurrence : str - The occurrence(s) to return from the Map - Accepted values are "first", "last", or "all". - options : pyarrow.compute.MapLookupOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def struct_field( - values, - /, - indices, - *, - options: StructFieldOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -): - """ - Extract children of a struct or union by index. 
- - Given a list of indices (passed via StructFieldOptions), extract - the child array or scalar with the given child index, recursively. - - For union inputs, nulls are emitted for union values that reference - a different child than specified. Also, the indices are always - in physical order, not logical type codes - for example, the first - child is always index 0. - - An empty list of indices returns the argument unchanged. - - Parameters - ---------- - values : Array-like or scalar-like - Argument to compute function. - indices : List[str], List[bytes], List[int], Expression, bytes, str, or int - List of indices for chained field lookup, for example `[4, 1]` - will look up the second nested field in the fifth outer field. - options : pyarrow.compute.StructFieldOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values backward to fill null slots. - - Given an array, propagate next valid observation backward to previous valid - or nothing if all next values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): - """ - Carry non-null values forward to fill null slots. - - Given an array, propagate last valid observation forward to next valid - or nothing if all previous values are null. - - Parameters - ---------- - values : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -def replace_with_mask( - values, - mask: list[bool] | list[bool | None] | BooleanArray, - replacements, - /, - *, - memory_pool: lib.MemoryPool | None = None, -): - """ - Replace items selected with a mask. - - Given an array and a boolean mask (either scalar or of equal length), - along with replacement values (either scalar or array), - each element of the array for which the corresponding mask element is - true will be replaced by the next value from the replacements, - or with null if the mask is null. - Hence, for replacement arrays, len(replacements) == sum(mask == true). - - Parameters - ---------- - values : Array-like - Argument to compute function. - mask : Array-like - Argument to compute function. - replacements : Array-like - Argument to compute function. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -# ========================= 3.7 Pairwise functions ========================= -@overload -def pairwise_diff( - input: _NumericOrTemporalArrayT, - /, - period: int = 1, - *, - options: PairwiseOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> _NumericOrTemporalArrayT: ... -@overload -def pairwise_diff( - input: Expression, - /, - period: int = 1, - *, - options: PairwiseOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> Expression: ... -def pairwise_diff(*args, **kwargs): - """ - Compute first order difference of an array. 
- - Computes the first order difference of an array, It internally calls - the scalar function "subtract" to compute - differences, so its - behavior and supported types are the same as - "subtract". The period can be specified in :struct:`PairwiseOptions`. - - Results will wrap around on integer overflow. Use function - "pairwise_diff_checked" if you want overflow to return an error. - - Parameters - ---------- - input : Array-like - Argument to compute function. - period : int, default 1 - Period for applying the period function. - options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. - memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. - """ - -pairwise_diff_checked = _clone_signature(pairwise_diff) -""" -Compute first order difference of an array. - -Computes the first order difference of an array, It internally calls -the scalar function "subtract_checked" (or the checked variant) to compute -differences, so its behavior and supported types are the same as -"subtract_checked". The period can be specified in :struct:`PairwiseOptions`. - -This function returns an error on overflow. For a variant that doesn't -fail on overflow, use function "pairwise_diff". - -Parameters ----------- -input : Array-like - Argument to compute function. -period : int, default 1 - Period for applying the period function. -options : pyarrow.compute.PairwiseOptions, optional - Alternative way of passing options. -memory_pool : pyarrow.MemoryPool, optional - If not passed, will allocate memory from the default memory pool. -""" diff --git a/pyarrow-stubs/csv.pyi b/pyarrow-stubs/csv.pyi deleted file mode 100644 index 510229d7e72..00000000000 --- a/pyarrow-stubs/csv.pyi +++ /dev/null @@ -1,27 +0,0 @@ -from pyarrow._csv import ( - ISO8601, - ConvertOptions, - CSVStreamingReader, - CSVWriter, - InvalidRow, - ParseOptions, - ReadOptions, - WriteOptions, - open_csv, - read_csv, - write_csv, -) - -__all__ = [ - "ISO8601", - "ConvertOptions", - "CSVStreamingReader", - "CSVWriter", - "InvalidRow", - "ParseOptions", - "ReadOptions", - "WriteOptions", - "open_csv", - "read_csv", - "write_csv", -] diff --git a/pyarrow-stubs/cuda.pyi b/pyarrow-stubs/cuda.pyi deleted file mode 100644 index e11baf7d4e7..00000000000 --- a/pyarrow-stubs/cuda.pyi +++ /dev/null @@ -1,25 +0,0 @@ -from pyarrow._cuda import ( - BufferReader, - BufferWriter, - Context, - CudaBuffer, - HostBuffer, - IpcMemHandle, - new_host_buffer, - read_message, - read_record_batch, - serialize_record_batch, -) - -__all__ = [ - "BufferReader", - "BufferWriter", - "Context", - "CudaBuffer", - "HostBuffer", - "IpcMemHandle", - "new_host_buffer", - "read_message", - "read_record_batch", - "serialize_record_batch", -] diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi deleted file mode 100644 index 98f1a38aa85..00000000000 --- a/pyarrow-stubs/dataset.pyi +++ /dev/null @@ -1,229 +0,0 @@ -from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload - -from _typeshed import StrPath -from pyarrow._dataset import ( - CsvFileFormat, - CsvFragmentScanOptions, - Dataset, - DatasetFactory, - DirectoryPartitioning, - FeatherFileFormat, - FileFormat, - FileFragment, - FilenamePartitioning, - FileSystemDataset, - FileSystemDatasetFactory, - FileSystemFactoryOptions, - FileWriteOptions, - Fragment, - FragmentScanOptions, - HivePartitioning, - InMemoryDataset, - IpcFileFormat, - IpcFileWriteOptions, - JsonFileFormat, - JsonFragmentScanOptions, - 
Partitioning, - PartitioningFactory, - Scanner, - TaggedRecordBatch, - UnionDataset, - UnionDatasetFactory, - WrittenFile, - get_partition_keys, -) -from pyarrow._dataset_orc import OrcFileFormat -from pyarrow._dataset_parquet import ( - ParquetDatasetFactory, - ParquetFactoryOptions, - ParquetFileFormat, - ParquetFileFragment, - ParquetFileWriteOptions, - ParquetFragmentScanOptions, - ParquetReadOptions, - RowGroupInfo, -) -from pyarrow._dataset_parquet_encryption import ( - ParquetDecryptionConfig, - ParquetEncryptionConfig, -) -from pyarrow.compute import Expression, field, scalar -from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table - -from ._fs import SupportedFileSystem - -_orc_available: bool -_parquet_available: bool - -__all__ = [ - "CsvFileFormat", - "CsvFragmentScanOptions", - "Dataset", - "DatasetFactory", - "DirectoryPartitioning", - "FeatherFileFormat", - "FileFormat", - "FileFragment", - "FilenamePartitioning", - "FileSystemDataset", - "FileSystemDatasetFactory", - "FileSystemFactoryOptions", - "FileWriteOptions", - "Fragment", - "FragmentScanOptions", - "HivePartitioning", - "InMemoryDataset", - "IpcFileFormat", - "IpcFileWriteOptions", - "JsonFileFormat", - "JsonFragmentScanOptions", - "Partitioning", - "PartitioningFactory", - "Scanner", - "TaggedRecordBatch", - "UnionDataset", - "UnionDatasetFactory", - "WrittenFile", - "get_partition_keys", - # Orc - "OrcFileFormat", - # Parquet - "ParquetDatasetFactory", - "ParquetFactoryOptions", - "ParquetFileFormat", - "ParquetFileFragment", - "ParquetFileWriteOptions", - "ParquetFragmentScanOptions", - "ParquetReadOptions", - "RowGroupInfo", - # Parquet Encryption - "ParquetDecryptionConfig", - "ParquetEncryptionConfig", - # Compute - "Expression", - "field", - "scalar", - # Dataset - "partitioning", - "parquet_dataset", - "write_dataset", -] - -_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] - -@overload -def partitioning( - schema: Schema, -) -> Partitioning: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["filename"], - dictionaries: dict[str, Array] | None = None, -) -> Partitioning: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["filename"], - dictionaries: Literal["infer"], -) -> PartitioningFactory: ... -@overload -def partitioning( - field_names: list[str], - *, - flavor: Literal["filename"], -) -> PartitioningFactory: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["hive"], - dictionaries: Literal["infer"], -) -> PartitioningFactory: ... -@overload -def partitioning( - *, - flavor: Literal["hive"], -) -> PartitioningFactory: ... -@overload -def partitioning( - schema: Schema, - *, - flavor: Literal["hive"], - dictionaries: dict[str, Array] | None = None, -) -> Partitioning: ... -def parquet_dataset( - metadata_path: StrPath, - schema: Schema | None = None, - filesystem: SupportedFileSystem | None = None, - format: ParquetFileFormat | None = None, - partitioning: Partitioning | PartitioningFactory | None = None, - partition_base_dir: str | None = None, -) -> FileSystemDataset: ... 
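# Illustrative usage sketch (not part of the stub file or the original patch):
# how the `partitioning` overloads above distinguish a concrete Partitioning
# from a PartitioningFactory, assuming pyarrow.dataset is available.
import pyarrow as pa
import pyarrow.dataset as ds

# With an explicit schema, a concrete hive-style Partitioning is returned
# (directories such as year=2022/month=12/...).
part = ds.partitioning(
    pa.schema([("year", pa.int16()), ("month", pa.int8())]), flavor="hive"
)

# With only the flavor (or dictionaries="infer"), a PartitioningFactory is
# returned and the key types are inferred from file paths during discovery.
part_factory = ds.partitioning(flavor="hive")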
-@overload -def dataset( - source: StrPath | Sequence[StrPath], - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> FileSystemDataset: ... -@overload -def dataset( - source: list[Dataset], - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> UnionDataset: ... -@overload -def dataset( - source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> InMemoryDataset: ... -@overload -def dataset( - source: RecordBatch | Table, - schema: Schema | None = None, - format: FileFormat | _DatasetFormat | None = None, - filesystem: SupportedFileSystem | str | None = None, - partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, - partition_base_dir: str | None = None, - exclude_invalid_files: bool | None = None, - ignore_prefixes: list[str] | None = None, -) -> InMemoryDataset: ... -def write_dataset( - data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch], - base_dir: StrPath, - *, - basename_template: str | None = None, - format: FileFormat | _DatasetFormat | None = None, - partitioning: Partitioning | list[str] | None = None, - partitioning_flavor: str | None = None, - schema: Schema | None = None, - filesystem: SupportedFileSystem | None = None, - file_options: FileWriteOptions | None = None, - use_threads: bool = True, - max_partitions: int = 1024, - max_open_files: int = 1024, - max_rows_per_file: int = 0, - min_rows_per_group: int = 0, - max_rows_per_group: int = 1024 * 1024, - file_visitor: Callable[[str], None] | None = None, - existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", - create_dir: bool = True, -): ... diff --git a/pyarrow-stubs/feather.pyi b/pyarrow-stubs/feather.pyi deleted file mode 100644 index 9451ee15763..00000000000 --- a/pyarrow-stubs/feather.pyi +++ /dev/null @@ -1,50 +0,0 @@ -from typing import IO, Literal - -import pandas as pd - -from _typeshed import StrPath -from pyarrow._feather import FeatherError -from pyarrow.lib import Table - -__all__ = [ - "FeatherError", - "FeatherDataset", - "check_chunked_overflow", - "write_feather", - "read_feather", - "read_table", -] - -class FeatherDataset: - path_or_paths: str | list[str] - validate_schema: bool - - def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ... - def read_table(self, columns: list[str] | None = None) -> Table: ... - def validate_schemas(self, piece, table: Table) -> None: ... - def read_pandas( - self, columns: list[str] | None = None, use_threads: bool = True - ) -> pd.DataFrame: ... 
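# Illustrative usage sketch (not part of the stub file or the original patch):
# a write_dataset()/dataset() round trip matching the overloads above, assuming
# pyarrow.dataset is available. "out_dir" and the column names are placeholders.
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"year": [2021, 2022], "value": [1.0, 2.0]})

# `partitioning` given as a list of column names uses `partitioning_flavor`
# (directory partitioning when the flavor is omitted).
ds.write_dataset(
    table, "out_dir", format="parquet",
    partitioning=["year"], partitioning_flavor="hive",
)

# A str/path source resolves to a FileSystemDataset per the first overload.
dataset = ds.dataset("out_dir", format="parquet", partitioning="hive")
dataset.to_table()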
- -def check_chunked_overflow(name: str, col) -> None: ... -def write_feather( - df: pd.DataFrame | Table, - dest: StrPath | IO, - compression: Literal["zstd", "lz4", "uncompressed"] | None = None, - compression_level: int | None = None, - chunksize: int | None = None, - version: Literal[1, 2] = 2, -) -> None: ... -def read_feather( - source: StrPath | IO, - columns: list[str] | None = None, - use_threads: bool = True, - memory_map: bool = False, - **kwargs, -) -> pd.DataFrame: ... -def read_table( - source: StrPath | IO, - columns: list[str] | None = None, - memory_map: bool = False, - use_threads: bool = True, -) -> Table: ... diff --git a/pyarrow-stubs/flight.pyi b/pyarrow-stubs/flight.pyi deleted file mode 100644 index 9b806ccf305..00000000000 --- a/pyarrow-stubs/flight.pyi +++ /dev/null @@ -1,95 +0,0 @@ -from pyarrow._flight import ( - Action, - ActionType, - BasicAuth, - CallInfo, - CertKeyPair, - ClientAuthHandler, - ClientMiddleware, - ClientMiddlewareFactory, - DescriptorType, - FlightCallOptions, - FlightCancelledError, - FlightClient, - FlightDataStream, - FlightDescriptor, - FlightEndpoint, - FlightError, - FlightInfo, - FlightInternalError, - FlightMetadataReader, - FlightMetadataWriter, - FlightMethod, - FlightServerBase, - FlightServerError, - FlightStreamChunk, - FlightStreamReader, - FlightStreamWriter, - FlightTimedOutError, - FlightUnauthenticatedError, - FlightUnauthorizedError, - FlightUnavailableError, - FlightWriteSizeExceededError, - GeneratorStream, - Location, - MetadataRecordBatchReader, - MetadataRecordBatchWriter, - RecordBatchStream, - Result, - SchemaResult, - ServerAuthHandler, - ServerCallContext, - ServerMiddleware, - ServerMiddlewareFactory, - Ticket, - TracingServerMiddlewareFactory, - connect, -) - -__all__ = [ - "Action", - "ActionType", - "BasicAuth", - "CallInfo", - "CertKeyPair", - "ClientAuthHandler", - "ClientMiddleware", - "ClientMiddlewareFactory", - "DescriptorType", - "FlightCallOptions", - "FlightCancelledError", - "FlightClient", - "FlightDataStream", - "FlightDescriptor", - "FlightEndpoint", - "FlightError", - "FlightInfo", - "FlightInternalError", - "FlightMetadataReader", - "FlightMetadataWriter", - "FlightMethod", - "FlightServerBase", - "FlightServerError", - "FlightStreamChunk", - "FlightStreamReader", - "FlightStreamWriter", - "FlightTimedOutError", - "FlightUnauthenticatedError", - "FlightUnauthorizedError", - "FlightUnavailableError", - "FlightWriteSizeExceededError", - "GeneratorStream", - "Location", - "MetadataRecordBatchReader", - "MetadataRecordBatchWriter", - "RecordBatchStream", - "Result", - "SchemaResult", - "ServerAuthHandler", - "ServerCallContext", - "ServerMiddleware", - "ServerMiddlewareFactory", - "Ticket", - "TracingServerMiddlewareFactory", - "connect", -] diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi deleted file mode 100644 index 6bf75616c13..00000000000 --- a/pyarrow-stubs/fs.pyi +++ /dev/null @@ -1,77 +0,0 @@ -from pyarrow._fs import ( # noqa - FileSelector, - FileType, - FileInfo, - FileSystem, - LocalFileSystem, - SubTreeFileSystem, - _MockFileSystem, - FileSystemHandler, - PyFileSystem, - SupportedFileSystem, -) -from pyarrow._azurefs import AzureFileSystem -from pyarrow._hdfs import HadoopFileSystem -from pyarrow._gcsfs import GcsFileSystem -from pyarrow._s3fs import ( # noqa - AwsDefaultS3RetryStrategy, - AwsStandardS3RetryStrategy, - S3FileSystem, - S3LogLevel, - S3RetryStrategy, - ensure_s3_initialized, - finalize_s3, - ensure_s3_finalized, - initialize_s3, - resolve_s3_region, -) - -FileStats 
= FileInfo - -def copy_files( - source: str, - destination: str, - source_filesystem: SupportedFileSystem | None = None, - destination_filesystem: SupportedFileSystem | None = None, - *, - chunk_size: int = 1024 * 1024, - use_threads: bool = True, -) -> None: ... - -class FSSpecHandler(FileSystemHandler): # type: ignore[misc] - fs: SupportedFileSystem - def __init__(self, fs: SupportedFileSystem) -> None: ... - -__all__ = [ - # _fs - "FileSelector", - "FileType", - "FileInfo", - "FileSystem", - "LocalFileSystem", - "SubTreeFileSystem", - "_MockFileSystem", - "FileSystemHandler", - "PyFileSystem", - # _azurefs - "AzureFileSystem", - # _hdfs - "HadoopFileSystem", - # _gcsfs - "GcsFileSystem", - # _s3fs - "AwsDefaultS3RetryStrategy", - "AwsStandardS3RetryStrategy", - "S3FileSystem", - "S3LogLevel", - "S3RetryStrategy", - "ensure_s3_initialized", - "finalize_s3", - "ensure_s3_finalized", - "initialize_s3", - "resolve_s3_region", - # fs - "FileStats", - "copy_files", - "FSSpecHandler", -] diff --git a/pyarrow-stubs/gandiva.pyi b/pyarrow-stubs/gandiva.pyi deleted file mode 100644 index a344f885b29..00000000000 --- a/pyarrow-stubs/gandiva.pyi +++ /dev/null @@ -1,65 +0,0 @@ -from typing import Iterable, Literal - -from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable - -class Node(_Weakrefable): - def return_type(self) -> DataType: ... - -class Expression(_Weakrefable): - def root(self) -> Node: ... - def result(self) -> Field: ... - -class Condition(_Weakrefable): - def root(self) -> Node: ... - def result(self) -> Field: ... - -class SelectionVector(_Weakrefable): - def to_array(self) -> Array: ... - -class Projector(_Weakrefable): - @property - def llvm_ir(self): ... - def evaluate( - self, batch: RecordBatch, selection: SelectionVector | None = None - ) -> list[Array]: ... - -class Filter(_Weakrefable): - @property - def llvm_ir(self): ... - def evaluate( - self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" - ) -> SelectionVector: ... - -class TreeExprBuilder(_Weakrefable): - def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... - def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... - def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... - def make_field(self, field: Field) -> Node: ... - def make_if( - self, condition: Node, this_node: Node, else_node: Node, return_type: DataType - ) -> Node: ... - def make_and(self, children: list[Node]) -> Node: ... - def make_or(self, children: list[Node]) -> Node: ... - def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... - def make_condition(self, condition: Node) -> Condition: ... - -class Configuration(_Weakrefable): - def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... - -def make_projector( - schema: Schema, - children: list[Expression], - pool: MemoryPool, - selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", - configuration: Configuration | None = None, -) -> Projector: ... -def make_filter( - schema: Schema, condition: Condition, configuration: Configuration | None = None -) -> Filter: ... - -class FunctionSignature(_Weakrefable): - def return_type(self) -> DataType: ... - def param_types(self) -> list[DataType]: ... - def name(self) -> str: ... - -def get_registered_function_signatures() -> list[FunctionSignature]: ... 
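# Illustrative usage sketch (not part of the stub file or the original patch):
# building and evaluating a Gandiva projection with the classes stubbed above.
# This assumes pyarrow was built with Gandiva support; pyarrow.gandiva is not
# available in every distribution.
import pyarrow as pa
import pyarrow.gandiva as gandiva

batch = pa.RecordBatch.from_pydict({"a": [1.0, 2.0], "b": [3.0, 4.0]})

builder = gandiva.TreeExprBuilder()
node_a = builder.make_field(batch.schema.field("a"))
node_b = builder.make_field(batch.schema.field("b"))
add = builder.make_function("add", [node_a, node_b], pa.float64())
expr = builder.make_expression(add, pa.field("a_plus_b", pa.float64()))

projector = gandiva.make_projector(batch.schema, [expr], pa.default_memory_pool())
(result,) = projector.evaluate(batch)  # list[Array], one entry per expression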
diff --git a/pyarrow-stubs/interchange/__init__.pyi b/pyarrow-stubs/interchange/__init__.pyi deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/pyarrow-stubs/interchange/buffer.pyi b/pyarrow-stubs/interchange/buffer.pyi deleted file mode 100644 index 46673961a75..00000000000 --- a/pyarrow-stubs/interchange/buffer.pyi +++ /dev/null @@ -1,58 +0,0 @@ -import enum - -from pyarrow.lib import Buffer - -class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - -class _PyArrowBuffer: - """ - Data in the buffer is guaranteed to be contiguous in memory. - - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ - def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... - @property - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ diff --git a/pyarrow-stubs/interchange/column.pyi b/pyarrow-stubs/interchange/column.pyi deleted file mode 100644 index e6662867b6b..00000000000 --- a/pyarrow-stubs/interchange/column.pyi +++ /dev/null @@ -1,252 +0,0 @@ -import enum - -from typing import Any, Iterable, TypeAlias, TypedDict - -from pyarrow.lib import Array, ChunkedArray - -from .buffer import _PyArrowBuffer - -class DtypeKind(enum.IntEnum): - """ - Integer enum for data types. - - Attributes - ---------- - INT : int - Matches to signed integer data type. - UINT : int - Matches to unsigned integer data type. - FLOAT : int - Matches to floating point data type. - BOOL : int - Matches to boolean data type. - STRING : int - Matches to string data type (UTF-8 encoded). - DATETIME : int - Matches to datetime data type. - CATEGORICAL : int - Matches to categorical data type. - """ - - INT = 0 - UINT = 1 - FLOAT = 2 - BOOL = 20 - STRING = 21 # UTF-8 - DATETIME = 22 - CATEGORICAL = 23 - -Dtype: TypeAlias = tuple[DtypeKind, int, str, str] - -class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. 
- """ - - NON_NULLABLE = 0 - USE_NAN = 1 - USE_SENTINEL = 2 - USE_BITMASK = 3 - USE_BYTEMASK = 4 - -class ColumnBuffers(TypedDict): - data: tuple[_PyArrowBuffer, Dtype] - validity: tuple[_PyArrowBuffer, Dtype] | None - offsets: tuple[_PyArrowBuffer, Dtype] | None - -class CategoricalDescription(TypedDict): - is_ordered: bool - is_dictionary: bool - categories: _PyArrowColumn | None - -class Endianness(enum.Enum): - LITTLE = "<" - BIG = ">" - NATIVE = "=" - NA = "|" - -class NoBufferPresent(Exception): - """Exception to signal that there is no requested buffer.""" - -class _PyArrowColumn: - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... - def size(self) -> int: - """ - Size of the column, in elements. - - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ - @property - def offset(self) -> int: - """ - Offset of first element. - - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ - @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: - """ - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. 
- - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - - Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for - bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the - future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, - decimal, and nested (list, struct, map, union) dtypes. - """ - @property - def describe_categorical(self) -> CategoricalDescription: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical - values. - - Raises TypeError if the dtype is not categorical - - Returns the dictionary with description on how to interpret the - data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices - is semantically meaningful. - - "is_dictionary" : bool, whether a mapping of - categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of - indices to category values (e.g. an array of - cat1, cat2, ...). None if not a dictionary-style - categorical. - - TBD: are there any other in-memory representations that are needed? - """ - @property - def describe_null(self) -> tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. - None otherwise. - """ - @property - def null_count(self) -> int: - """ - Number of null elements, if known. - - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ - @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the chunks. - - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - - The returned dictionary has the following contents: - - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. 
None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ diff --git a/pyarrow-stubs/interchange/dataframe.pyi b/pyarrow-stubs/interchange/dataframe.pyi deleted file mode 100644 index 526a58926a9..00000000000 --- a/pyarrow-stubs/interchange/dataframe.pyi +++ /dev/null @@ -1,102 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import Any, Iterable, Sequence - -from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.lib import RecordBatch, Table - -class _PyArrowDataFrame: - """ - A data frame class, with only the methods required by the interchange - protocol defined. - - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. - Columns may be accessed by name or by position. - - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. - """ - - def __init__( - self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True - ) -> None: ... - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> _PyArrowDataFrame: - """ - Construct a new exchange object, potentially changing the parameters. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN``. - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this - protocol specifies contiguous buffers. - """ - @property - def metadata(self) -> dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. - """ - def num_rows(self) -> int: - """ - Return the number of rows in the DataFrame, if available. - """ - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - """ - def get_column(self, i: int) -> _PyArrowColumn: - """ - Return the column at the indicated position. - """ - def get_column_by_name(self, name: str) -> _PyArrowColumn: - """ - Return the column whose name is the indicated name. 
- """ - def get_columns(self) -> Iterable[_PyArrowColumn]: - """ - Return an iterator yielding the columns. - """ - def select_columns(self, indices: Sequence[int]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - def select_columns_by_name(self, names: Sequence[str]) -> Self: - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: - """ - Return an iterator yielding the chunks. - - By default (None), yields the chunks that the data is stored as by the - producer. If given, ``n_chunks`` must be a multiple of - ``self.num_chunks()``, meaning the producer must subdivide each chunk - before yielding it. - - Note that the producer must ensure that all columns are chunked the - same way. - """ diff --git a/pyarrow-stubs/interchange/from_dataframe.pyi b/pyarrow-stubs/interchange/from_dataframe.pyi deleted file mode 100644 index b04b6268975..00000000000 --- a/pyarrow-stubs/interchange/from_dataframe.pyi +++ /dev/null @@ -1,244 +0,0 @@ -from typing import Any, Protocol, TypeAlias - -from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table - -from .column import ( - ColumnBuffers, - ColumnNullType, - Dtype, - DtypeKind, -) - -class DataFrameObject(Protocol): - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... - -ColumnObject: TypeAlias = Any - -def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: - """ - Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. - - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Table - - Examples - -------- - >>> import pyarrow - >>> from pyarrow.interchange import from_dataframe - - Convert a pandas dataframe to a pyarrow table: - - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "n_attendees": [100, 10, 1], - ... "country": ["Italy", "Spain", "Slovenia"], - ... } - ... ) - >>> df - n_attendees country - 0 100 Italy - 1 10 Spain - 2 1 Slovenia - >>> from_dataframe(df) - pyarrow.Table - n_attendees: int64 - country: large_string - ---- - n_attendees: [[100,10,1]] - country: [["Italy","Spain","Slovenia"]] - """ - -def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: - """ - Convert interchange protocol chunk to ``pa.RecordBatch``. - - Parameters - ---------- - df : DataFrameObject - Object supporting the interchange protocol, i.e. `__dataframe__` - method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.RecordBatch - """ - -def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding one of the primitive dtypes to a PyArrow array. - A primitive type is one of: int, uint, float, bool (1 bit). - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). 
- - Returns - ------- - pa.Array - """ - -def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: - """ - Convert a column holding boolean dtype to a PyArrow array. - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - """ - -def categorical_column_to_dictionary( - col: ColumnObject, allow_copy: bool = True -) -> DictionaryArray: - """ - Convert a column holding categorical data to a pa.DictionaryArray. - - Parameters - ---------- - col : ColumnObject - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.DictionaryArray - """ - -def parse_datetime_format_str(format_str: str) -> tuple[str, str]: - """Parse datetime `format_str` to interpret the `data`.""" - -def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: - """Map column date type to pyarrow date type.""" - -def buffers_to_array( - buffers: ColumnBuffers, - data_type: tuple[DtypeKind, int, str, str], - length: int, - describe_null: ColumnNullType, - offset: int = 0, - allow_copy: bool = True, -) -> Array: - """ - Build a PyArrow array from the passed buffer. - - Parameters - ---------- - buffer : ColumnBuffers - Dictionary containing tuples of underlying buffers and - their associated dtype. - data_type : Tuple[DtypeKind, int, str, str], - Dtype description of the column as a tuple ``(kind, bit-width, format string, - endianness)``. - length : int - The number of values in the array. - describe_null: ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Array - - Notes - ----- - The returned array doesn't own the memory. The caller of this function - is responsible for keeping the memory owner object alive as long as - the returned PyArrow array is being used. - """ - -def validity_buffer_from_mask( - validity_buff: Buffer, - validity_dtype: Dtype, - describe_null: ColumnNullType, - length: int, - offset: int = 0, - allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from the passed mask buffer. - - Parameters - ---------- - validity_buff : BufferObject - Tuple of underlying validity buffer and associated dtype. - validity_dtype : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Buffer - """ - -def validity_buffer_nan_sentinel( - data_pa_buffer: Buffer, - data_type: Dtype, - describe_null: ColumnNullType, - length: int, - offset: int = 0, - allow_copy: bool = True, -) -> Buffer: - """ - Build a PyArrow buffer from NaN or sentinel values. 
- - Parameters - ---------- - data_pa_buffer : pa.Buffer - PyArrow buffer for the column data. - data_type : Dtype - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - describe_null : ColumnNullType - Null representation the column dtype uses, - as a tuple ``(kind, value)`` - length : int - The number of values in the array. - offset : int, default: 0 - Number of elements to offset from the start of the buffer. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - pa.Buffer - """ diff --git a/pyarrow-stubs/ipc.pyi b/pyarrow-stubs/ipc.pyi deleted file mode 100644 index c7f2af004d4..00000000000 --- a/pyarrow-stubs/ipc.pyi +++ /dev/null @@ -1,123 +0,0 @@ -from io import IOBase - -import pandas as pd -import pyarrow.lib as lib - -from pyarrow.lib import ( - IpcReadOptions, - IpcWriteOptions, - Message, - MessageReader, - MetadataVersion, - ReadStats, - RecordBatchReader, - WriteStats, - _ReadPandasMixin, - get_record_batch_size, - get_tensor_size, - read_message, - read_record_batch, - read_schema, - read_tensor, - write_tensor, -) - -class RecordBatchStreamReader(lib._RecordBatchStreamReader): - def __init__( - self, - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): - def __init__( - self, - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, - ) -> None: ... - -class RecordBatchFileReader(lib._RecordBatchFileReader): - def __init__( - self, - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - footer_offset: int | None = None, - *, - options: IpcReadOptions | None, - memory_pool: lib.MemoryPool | None = None, - ) -> None: ... - -class RecordBatchFileWriter(lib._RecordBatchFileWriter): - def __init__( - self, - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, - ) -> None: ... - -def new_stream( - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, -) -> RecordBatchStreamWriter: ... -def open_stream( - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> RecordBatchStreamReader: ... -def new_file( - sink: str | lib.NativeFile | IOBase, - schema: lib.Schema, - *, - use_legacy_format: bool | None = None, - options: IpcWriteOptions | None = None, -) -> RecordBatchFileWriter: ... -def open_file( - source: bytes | lib.Buffer | lib.NativeFile | IOBase, - footer_offset: int | None = None, - *, - options: IpcReadOptions | None = None, - memory_pool: lib.MemoryPool | None = None, -) -> RecordBatchFileReader: ... -def serialize_pandas( - df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None -) -> lib.Buffer: ... -def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
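The reader/writer classes and the new_stream/open_stream helpers typed above are most often used for an in-memory round trip. A minimal sketch, not taken from the patch itself (table contents are illustrative):

    import pyarrow as pa
    import pyarrow.ipc as ipc

    table = pa.table({"n_legs": [2, 4, 100], "animal": ["Flamingo", "Dog", "Centipede"]})

    # Write the table as an IPC stream into an in-memory buffer.
    sink = pa.BufferOutputStream()
    with ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    buf = sink.getvalue()

    # open_stream accepts bytes, Buffer, NativeFile, or file-like objects,
    # matching the union types in the stub signatures above.
    reader = ipc.open_stream(buf)
    assert reader.read_all().equals(table)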
- -__all__ = [ - "IpcReadOptions", - "IpcWriteOptions", - "Message", - "MessageReader", - "MetadataVersion", - "ReadStats", - "RecordBatchReader", - "WriteStats", - "_ReadPandasMixin", - "get_record_batch_size", - "get_tensor_size", - "read_message", - "read_record_batch", - "read_schema", - "read_tensor", - "write_tensor", - "RecordBatchStreamReader", - "RecordBatchStreamWriter", - "RecordBatchFileReader", - "RecordBatchFileWriter", - "new_stream", - "open_stream", - "new_file", - "open_file", - "serialize_pandas", - "deserialize_pandas", -] diff --git a/pyarrow-stubs/json.pyi b/pyarrow-stubs/json.pyi deleted file mode 100644 index db1d35e0b8b..00000000000 --- a/pyarrow-stubs/json.pyi +++ /dev/null @@ -1,3 +0,0 @@ -from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json - -__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/pyarrow-stubs/lib.pyi b/pyarrow-stubs/lib.pyi deleted file mode 100644 index 1698b55520b..00000000000 --- a/pyarrow-stubs/lib.pyi +++ /dev/null @@ -1,106 +0,0 @@ -# ruff: noqa: F403 -from typing import NamedTuple - -from .__lib_pxi.array import * -from .__lib_pxi.benchmark import * -from .__lib_pxi.builder import * -from .__lib_pxi.compat import * -from .__lib_pxi.config import * -from .__lib_pxi.device import * -from .__lib_pxi.error import * -from .__lib_pxi.io import * -from .__lib_pxi.ipc import * -from .__lib_pxi.memory import * -from .__lib_pxi.pandas_shim import * -from .__lib_pxi.scalar import * -from .__lib_pxi.table import * -from .__lib_pxi.tensor import * -from .__lib_pxi.types import * - -class MonthDayNano(NamedTuple): - days: int - months: int - nanoseconds: int - -def cpu_count() -> int: - """ - Return the number of threads to use in parallel operations. - - The number of threads is determined at startup by inspecting the - ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. - If neither is present, it will default to the number of hardware threads - on the system. It can be modified at runtime by calling - :func:`set_cpu_count()`. - - See Also - -------- - set_cpu_count : Modify the size of this pool. - io_thread_count : The analogous function for the I/O thread pool. - """ - -def set_cpu_count(count: int) -> None: - """ - Set the number of threads to use in parallel operations. - - Parameters - ---------- - count : int - The number of concurrent threads that should be used. - - See Also - -------- - cpu_count : Get the size of this pool. - set_io_thread_count : The analogous function for the I/O thread pool. - """ - -def is_threading_enabled() -> bool: - """ - Returns True if threading is enabled in libarrow. - - If it isn't enabled, then python shouldn't create any - threads either, because we're probably on a system where - threading doesn't work (e.g. Emscripten). 
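The thread-pool helpers stubbed here are plain module-level functions; a small usage sketch (the chosen thread count is arbitrary):

    import pyarrow as pa

    # Inspect the CPU and I/O thread pools used by parallel operations.
    print("cpu threads:", pa.cpu_count())
    print("io threads:", pa.io_thread_count())

    # Temporarily restrict parallelism, then restore the previous value.
    previous = pa.cpu_count()
    pa.set_cpu_count(2)
    assert pa.cpu_count() == 2
    pa.set_cpu_count(previous)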
- """ - -Type_NA: int -Type_BOOL: int -Type_UINT8: int -Type_INT8: int -Type_UINT16: int -Type_INT16: int -Type_UINT32: int -Type_INT32: int -Type_UINT64: int -Type_INT64: int -Type_HALF_FLOAT: int -Type_FLOAT: int -Type_DOUBLE: int -Type_DECIMAL128: int -Type_DECIMAL256: int -Type_DATE32: int -Type_DATE64: int -Type_TIMESTAMP: int -Type_TIME32: int -Type_TIME64: int -Type_DURATION: int -Type_INTERVAL_MONTH_DAY_NANO: int -Type_BINARY: int -Type_STRING: int -Type_LARGE_BINARY: int -Type_LARGE_STRING: int -Type_FIXED_SIZE_BINARY: int -Type_BINARY_VIEW: int -Type_STRING_VIEW: int -Type_LIST: int -Type_LARGE_LIST: int -Type_LIST_VIEW: int -Type_LARGE_LIST_VIEW: int -Type_MAP: int -Type_FIXED_SIZE_LIST: int -Type_STRUCT: int -Type_SPARSE_UNION: int -Type_DENSE_UNION: int -Type_DICTIONARY: int -Type_RUN_END_ENCODED: int -UnionMode_SPARSE: int -UnionMode_DENSE: int diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi deleted file mode 100644 index 2eba8d40a11..00000000000 --- a/pyarrow-stubs/orc.pyi +++ /dev/null @@ -1,279 +0,0 @@ -import sys - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import IO, Literal - -from _typeshed import StrPath - -from . import _orc -from ._fs import SupportedFileSystem -from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table - -class ORCFile: - """ - Reader interface for a single ORC file - - Parameters - ---------- - source : str or pyarrow.NativeFile - Readable source. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - """ - - reader: _orc.ORCReader - def __init__(self, source: StrPath | NativeFile | IO) -> None: ... - @property - def metadata(self) -> KeyValueMetadata: - """The file metadata, as an arrow KeyValueMetadata""" - @property - def schema(self) -> Schema: - """The file schema, as an arrow schema""" - @property - def nrows(self) -> int: - """The number of rows in the file""" - @property - def nstripes(self) -> int: - """The number of stripes in the file""" - @property - def file_version(self) -> str: - """Format version of the ORC file, must be 0.11 or 0.12""" - @property - def software_version(self) -> str: - """Software instance and version that wrote this file""" - @property - def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: - """Compression codec of the file""" - @property - def compression_size(self) -> int: - """Number of bytes to buffer for the compression codec in the file""" - @property - def writer(self) -> str: - """Name of the writer that wrote this file. 
- If the writer is unknown then its Writer ID - (a number) is returned""" - @property - def writer_version(self) -> str: - """Version of the writer""" - @property - def row_index_stride(self) -> int: - """Number of rows per an entry in the row index or 0 - if there is no row index""" - @property - def nstripe_statistics(self) -> int: - """Number of stripe statistics""" - @property - def content_length(self) -> int: - """Length of the data stripes in the file in bytes""" - @property - def stripe_statistics_length(self) -> int: - """The number of compressed bytes in the file stripe statistics""" - @property - def file_footer_length(self) -> int: - """The number of compressed bytes in the file footer""" - @property - def file_postscript_length(self) -> int: - """The number of bytes in the file postscript""" - @property - def file_length(self) -> int: - """The number of bytes in the file""" - def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: - """Read a single stripe from the file. - - Parameters - ---------- - n : int - The stripe index - columns : list - If not None, only these columns will be read from the stripe. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e' - - Returns - ------- - pyarrow.RecordBatch - Content of the stripe as a RecordBatch. - """ - def read(self, columns: list[str] | None = None) -> Table: - """Read the whole file. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. Output always follows the - ordering of the file and not the `columns` list. - - Returns - ------- - pyarrow.Table - Content of the file as a Table. - """ - -class ORCWriter: - """ - Writer interface for a single ORC file - - Parameters - ---------- - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. - Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. 
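For a sense of how these options are used, here is a small sketch with the module-level helpers, which accept the same parameters as ORCWriter (assumes a pyarrow build with ORC support; the file name and codec choice are illustrative):

    import pyarrow as pa
    import pyarrow.orc as orc

    table = pa.table({"n_legs": [2, 4, 5], "animal": ["Flamingo", "Dog", "Brittle stars"]})

    # Write with an explicit compression codec; other options keep their defaults.
    orc.write_table(table, "example.orc", compression="ZSTD")

    # ORCFile exposes file-level metadata alongside the data.
    f = orc.ORCFile("example.orc")
    print(f.nrows, f.nstripes, f.compression)
    print(f.read(columns=["animal"]))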
- """ - - writer: _orc.ORCWriter - is_open: bool - def __init__( - self, - where: StrPath | NativeFile | IO, - *, - file_version: str = "0.12", - batch_size: int = 1024, - stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", - compression_block_size: int = 65536, - compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", - row_index_stride: int = 10000, - padding_tolerance: float = 0.0, - dictionary_key_size_threshold: float = 0.0, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float = 0.05, - ): ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> None: ... - def write(self, table: Table) -> None: - """ - Write the table into an ORC file. The schema of the table must - be equal to the schema used when opening the ORC file. - - Parameters - ---------- - table : pyarrow.Table - The table to be written into the ORC file - """ - def close(self) -> None: - """ - Close the ORC file - """ - -def read_table( - source: StrPath | NativeFile | IO, - columns: list[str] | None = None, - filesystem: SupportedFileSystem | None = None, -) -> Table: - """ - Read a Table from an ORC file. - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name. For file-like objects, - only read a single file. Use pyarrow.BufferReader to read a file - contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. Output always follows the ordering of the file and - not the `columns` list. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - """ - -def write_table( - table: Table, - where: StrPath | NativeFile | IO, - *, - file_version: str = "0.12", - batch_size: int = 1024, - stripe_size: int = 64 * 1024 * 1024, - compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", - compression_block_size: int = 65536, - compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", - row_index_stride: int = 10000, - padding_tolerance: float = 0.0, - dictionary_key_size_threshold: float = 0.0, - bloom_filter_columns: list[int] | None = None, - bloom_filter_fpp: float = 0.05, -) -> None: - """ - Write a table into an ORC file. - - Parameters - ---------- - table : pyarrow.lib.Table - The table to be written into the ORC file - where : str or pyarrow.io.NativeFile - Writable target. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream - or pyarrow.io.FixedSizeBufferWriter. - file_version : {"0.11", "0.12"}, default "0.12" - Determine which ORC file version to use. - `Hive 0.11 / ORC v0 `_ - is the older version - while `Hive 0.12 / ORC v1 `_ - is the newer one. - batch_size : int, default 1024 - Number of rows the ORC writer writes at a time. - stripe_size : int, default 64 * 1024 * 1024 - Size of each ORC stripe in bytes. - compression : string, default 'uncompressed' - The compression codec. 
- Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} - Note that LZ0 is currently not supported. - compression_block_size : int, default 64 * 1024 - Size of each compression block in bytes. - compression_strategy : string, default 'speed' - The compression strategy i.e. speed vs size reduction. - Valid values: {'SPEED', 'COMPRESSION'} - row_index_stride : int, default 10000 - The row index stride i.e. the number of rows per - an entry in the row index. - padding_tolerance : double, default 0.0 - The padding tolerance. - dictionary_key_size_threshold : double, default 0.0 - The dictionary key size threshold. 0 to disable dictionary encoding. - 1 to always enable dictionary encoding. - bloom_filter_columns : None, set-like or list-like, default None - Columns that use the bloom filter. - bloom_filter_fpp : double, default 0.05 - Upper limit of the false-positive rate of the bloom filter. - """ diff --git a/pyarrow-stubs/pandas_compat.pyi b/pyarrow-stubs/pandas_compat.pyi deleted file mode 100644 index efbd05ac2fe..00000000000 --- a/pyarrow-stubs/pandas_compat.pyi +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Any, TypedDict, TypeVar - -import numpy as np -import pandas as pd - -from pandas import DatetimeTZDtype - -from .lib import Array, DataType, Schema, Table - -_T = TypeVar("_T") - -def get_logical_type_map() -> dict[int, str]: ... -def get_logical_type(arrow_type: DataType) -> str: ... -def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... -def get_logical_type_from_numpy(pandas_collection) -> str: ... -def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... - -class _ColumnMetadata(TypedDict): - name: str - field_name: str - pandas_type: int - numpy_type: str - metadata: dict | None - -def get_column_metadata( - column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str -) -> _ColumnMetadata: ... -def construct_metadata( - columns_to_convert: list[pd.Series], - df: pd.DataFrame, - column_names: list[str], - index_levels: list[pd.Index], - index_descriptors: list[dict], - preserve_index: bool, - types: list[DataType], - column_field_names: list[str] = ..., -) -> dict[bytes, bytes]: ... -def dataframe_to_types( - df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None -) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... -def dataframe_to_arrays( - df: pd.DataFrame, - schema: Schema, - preserve_index: bool | None, - nthreads: int = 1, - columns: list[str] | None = None, - safe: bool = True, -) -> tuple[Array, Schema, int]: ... -def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... -def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... -def table_to_dataframe( - options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None -) -> pd.DataFrame: ... -def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... 
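The pandas_compat helpers above are internal plumbing behind Table.from_pandas/to_pandas; a brief round-trip sketch of the public API they serve (column and index names are made up):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame(
        {"n_legs": [2, 4, 100], "animal": ["Flamingo", "Dog", "Centipede"]},
        index=pd.Index([10, 20, 30], name="id"),
    )

    # from_pandas uses construct_metadata() & co. to record index columns and
    # dtypes under the b"pandas" schema metadata key.
    table = pa.Table.from_pandas(df, preserve_index=True)
    print(table.schema.pandas_metadata["index_columns"])

    # to_pandas reads that metadata back and restores the named index.
    assert table.to_pandas().index.name == "id"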
diff --git a/pyarrow-stubs/parquet/__init__.pyi b/pyarrow-stubs/parquet/__init__.pyi deleted file mode 100644 index 4ef88705809..00000000000 --- a/pyarrow-stubs/parquet/__init__.pyi +++ /dev/null @@ -1 +0,0 @@ -from .core import * # noqa diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi deleted file mode 100644 index 56b2c8447d9..00000000000 --- a/pyarrow-stubs/parquet/core.pyi +++ /dev/null @@ -1,2061 +0,0 @@ -import sys - -from pathlib import Path - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self -from typing import IO, Callable, Iterator, Literal, Sequence - -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from pyarrow import _parquet -from pyarrow._compute import Expression -from pyarrow._fs import FileSystem, SupportedFileSystem -from pyarrow._parquet import ( - ColumnChunkMetaData, - ColumnSchema, - FileDecryptionProperties, - FileEncryptionProperties, - FileMetaData, - ParquetLogicalType, - ParquetReader, - ParquetSchema, - RowGroupMetaData, - SortingColumn, - Statistics, -) -from pyarrow._stubs_typing import FilterTuple, SingleOrList -from pyarrow.dataset import ParquetFileFragment, Partitioning -from pyarrow.lib import NativeFile, RecordBatch, Schema, Table -from typing_extensions import deprecated - -__all__ = ( - "ColumnChunkMetaData", - "ColumnSchema", - "FileDecryptionProperties", - "FileEncryptionProperties", - "FileMetaData", - "ParquetDataset", - "ParquetFile", - "ParquetLogicalType", - "ParquetReader", - "ParquetSchema", - "ParquetWriter", - "RowGroupMetaData", - "SortingColumn", - "Statistics", - "read_metadata", - "read_pandas", - "read_schema", - "read_table", - "write_metadata", - "write_table", - "write_to_dataset", - "_filters_to_expression", - "filters_to_expression", -) - -def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: - """ - Check if filters are well-formed and convert to an ``Expression``. - - Parameters - ---------- - filters : List[Tuple] or List[List[Tuple]] - - Notes - ----- - See internal ``pyarrow._DNF_filter_doc`` attribute for more details. - - Examples - -------- - - >>> filters_to_expression([("foo", "==", "bar")]) - - - Returns - ------- - pyarrow.compute.Expression - An Expression representing the filters - """ - -@deprecated("use filters_to_expression") -def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... - -_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] - -class ParquetFile: - """ - Reader interface for a single Parquet file. - - Parameters - ---------- - source : str, pathlib.Path, pyarrow.NativeFile, or file-like object - Readable source. For passing bytes or buffer-like file containing a - Parquet file, use pyarrow.BufferReader. - metadata : FileMetaData, default None - Use existing metadata object, rather than reading from file. - common_metadata : FileMetaData, default None - Will be used in reads for pandas schema metadata if not found in the - main file's metadata, no other uses at the moment. - read_dictionary : list - List of column names to read directly as DictionaryArray. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. 
Otherwise IO calls are unbuffered. - pre_buffer : bool, default False - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties, default None - File decryption properties for Parquet Modular Encryption. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Examples - -------- - - Generate an example PyArrow Table and write it to Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Create a ``ParquetFile`` object from the Parquet file: - - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the data: - - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] - - Create a ParquetFile object with "animal" column as DictionaryArray: - - >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) - >>> parquet_file.read() - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [ -- dictionary: - ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: - [0,1,2,3,4,5]] - """ - - reader: ParquetReader - common_metadata: FileMetaData - - def __init__( - self, - source: str | Path | NativeFile | IO, - *, - metadata: FileMetaData | None = None, - common_metadata: FileMetaData | None = None, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - pre_buffer: bool = False, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - filesystem: SupportedFileSystem | None = None, - page_checksum_verification: bool = False, - ): ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> None: ... - @property - def metadata(self) -> FileMetaData: - """ - Return the Parquet metadata. 
- """ - @property - def schema(self) -> ParquetSchema: - """ - Return the Parquet schema, unconverted to Arrow types - """ - @property - def schema_arrow(self) -> Schema: - """ - Return the inferred Arrow schema, converted from the whole Parquet - file's schema - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read the Arrow schema: - - >>> parquet_file.schema_arrow - n_legs: int64 - animal: string - """ - @property - def num_row_groups(self) -> int: - """ - Return the number of row groups of the Parquet file. - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.num_row_groups - 1 - """ - def close(self, force: bool = False) -> None: ... - @property - def closed(self) -> bool: ... - def read_row_group( - self, - i: int, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a single row group from a Parquet file. - - Parameters - ---------- - i : int - Index of the individual row group that we want to read. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row group as a table (of columns) - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_group(0) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,100]] - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ - def read_row_groups( - self, - row_groups: list, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a multiple row groups from a Parquet file. - - Parameters - ---------- - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the row group. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the row groups as a table (of columns). 
- - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.read_row_groups([0, 0]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] - animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] - """ - def iter_batches( - self, - batch_size: int = 65536, - row_groups: list | None = None, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Iterator[RecordBatch]: - """ - Read streaming batches from a Parquet file. - - Parameters - ---------- - batch_size : int, default 64K - Maximum number of records to yield per batch. Batches may be - smaller if there aren't enough rows in the file. - row_groups : list - Only these row groups will be read from the file. - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : boolean, default True - Perform multi-threaded column reads. - use_pandas_metadata : boolean, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Yields - ------ - pyarrow.RecordBatch - Contents of each batch as a record batch - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - >>> for i in parquet_file.iter_batches(): - ... print("RecordBatch") - ... print(i.to_pandas()) - RecordBatch - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ - def read( - self, - columns: list | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read a Table from Parquet format. - - Parameters - ---------- - columns : list - If not None, only these columns will be read from the file. A - column name may be a prefix of a nested field, e.g. 'a' will select - 'a.b', 'a.c', and 'a.d.e'. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.table.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example Parquet file: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - Read a Table: - - >>> parquet_file.read(columns=["animal"]) - pyarrow.Table - animal: string - ---- - animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] - """ - def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: - """ - Read contents of file for the given columns and batch size. - - Notes - ----- - This function's primary purpose is benchmarking. - The scan is executed on a single thread. - - Parameters - ---------- - columns : list of integers, default None - Select columns to read, if None scan all columns. - batch_size : int, default 64K - Number of rows to read at a time internally. - - Returns - ------- - num_rows : int - Number of rows in file - - Examples - -------- - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - >>> parquet_file = pq.ParquetFile("example.parquet") - - >>> parquet_file.scan_contents() - 6 - """ - -class ParquetWriter: - """ - Class for incrementally building a Parquet file for Arrow tables. - - Parameters - ---------- - where : path or file-like object - schema : pyarrow.Schema - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. - Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. - coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. 
- If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. - data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. - Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. 
- For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. - dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. 
- - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - writer_engine_version : unused - **options : dict - If options contains a key `metadata_collector` then the - corresponding value is assumed to be a list (or any object with - `.append` method) that will be filled with the file metadata instance - of the written file. - - Examples - -------- - Generate an example PyArrow Table and RecordBatch: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> batch = pa.record_batch( - ... [ - ... [2, 2, 4, 4, 5, 100], - ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... ], - ... names=["n_legs", "animal"], - ... ) - - create a ParquetWriter object: - - >>> import pyarrow.parquet as pq - >>> writer = pq.ParquetWriter("example.parquet", table.schema) - - and write the Table into the Parquet file: - - >>> writer.write_table(table) - >>> writer.close() - - >>> pq.read_table("example.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - - create a ParquetWriter object for the RecordBatch: - - >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) - - and write the RecordBatch into the Parquet file: - - >>> writer2.write_batch(batch) - >>> writer2.close() - - >>> pq.read_table("example2.parquet").to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - 2 4 Dog - 3 4 Horse - 4 5 Brittle stars - 5 100 Centipede - """ - - flavor: str - schema_changed: bool - schema: ParquetSchema - where: str | Path | IO - file_handler: NativeFile | None - writer: _parquet.ParquetWriter - is_open: bool - - def __init__( - self, - where: str | Path | IO | NativeFile, - schema: Schema, - filesystem: SupportedFileSystem | None = None, - flavor: str | None = None, - version: Literal["1.0", "2.4", "2.6"] = ..., - use_dictionary: bool = True, - compression: _Compression | dict[str, _Compression] = "snappy", - write_statistics: bool | list = True, - use_deprecated_int96_timestamps: bool | None = None, - compression_level: int | dict | None = None, - use_byte_stream_split: bool | list = False, - column_encoding: str | dict | None = None, - writer_engine_version=None, - data_page_version: Literal["1.0", "2.0"] = ..., - use_compliant_nested_type: bool = True, - encryption_properties: FileEncryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: Sequence[SortingColumn] | None = None, - store_decimal_as_integer: bool = False, - **options, - ) -> None: ... - def __enter__(self) -> Self: ... - def __exit__(self, *args, **kwargs) -> Literal[False]: ... - def write( - self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None - ) -> None: - """ - Write RecordBatch or Table to the Parquet file. - - Parameters - ---------- - table_or_batch : {RecordBatch, Table} - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the input - table or batch length and 1024 * 1024. - """ - def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: - """ - Write RecordBatch to the Parquet file. 
- - Parameters - ---------- - batch : RecordBatch - row_group_size : int, default None - Maximum number of rows in written row group. If None, the - row group size will be the minimum of the RecordBatch - size and 1024 * 1024. If set larger than 64Mi then 64Mi - will be used instead. - """ - def write_table(self, table: Table, row_group_size: int | None = None) -> None: - """ - Write Table to the Parquet file. - - Parameters - ---------- - table : Table - row_group_size : int, default None - Maximum number of rows in each written row group. If None, - the row group size will be the minimum of the Table size - and 1024 * 1024. If set larger than 64Mi then 64Mi will - be used instead. - - """ - def close(self) -> None: - """ - Close the connection to the Parquet file. - """ - def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: - """ - Add key-value metadata to the file. - This will overwrite any existing metadata with the same key. - - Parameters - ---------- - key_value_metadata : dict - Keys and values must be string-like / coercible to bytes. - """ - -class ParquetDataset: - """ - Encapsulates details of reading a complete Parquet dataset possibly - consisting of multiple files and partitions in subdirectories. - - Parameters - ---------- - path_or_paths : str or List[str] - A directory name, single file name, or list of file names. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : pyarrow.parquet.Schema - Optionally provide the Schema for the Dataset, in which case it will - not be inferred from the source. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. 
code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. Set to False if you want to prioritize minimal memory usage - over maximum speed. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular resolution - (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 - timestamps will be inferred as timestamps in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset("dataset_v2/") - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ - def __init__( - self, - path_or_paths: SingleOrList[str] - | SingleOrList[Path] - | SingleOrList[NativeFile] - | SingleOrList[IO], - filesystem: SupportedFileSystem | None = None, - schema: Schema | None = None, - *, - filters: Expression | FilterTuple | list[FilterTuple] | None = None, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - partitioning: str | list[str] | Partitioning | None = "hive", - ignore_prefixes: list[str] | None = None, - pre_buffer: bool = True, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, - ): ... - def equals(self, other: ParquetDataset) -> bool: ... - @property - def schema(self) -> Schema: - """ - Schema of the Dataset. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_schema/") - - Read the schema: - - >>> dataset.schema - n_legs: int64 - animal: string - year: dictionary - """ - def read( - self, - columns: list[str] | None = None, - use_threads: bool = True, - use_pandas_metadata: bool = False, - ) -> Table: - """ - Read (multiple) Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the dataset. The partition fields - are not automatically included. - use_threads : bool, default True - Perform multi-threaded column reads. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_read/") - - Read the dataset: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - def read_pandas(self, **kwargs) -> Table: - """ - Read dataset including pandas metadata, if any. 
Other arguments passed - through to :func:`read`, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - Additional options for :func:`read` - - Examples - -------- - Generate an example parquet file: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "table_V2.parquet") - >>> dataset = pq.ParquetDataset("table_V2.parquet") - - Read the dataset with pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ - @property - def fragments(self) -> list[ParquetFileFragment]: - """ - A list of the Dataset source fragments or pieces with absolute - file paths. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") - - List the fragments: - - >>> dataset.fragments - [ list[str]: - """ - A list of absolute Parquet file paths in the Dataset source. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) - >>> dataset = pq.ParquetDataset("dataset_v2_files/") - - List the files: - - >>> dataset.files - ['dataset_v2_files/year=2019/...-0.parquet', ... - """ - @property - def filesystem(self) -> FileSystem: - """ - The filesystem type of the Dataset source. - """ - @property - def partitioning(self) -> Partitioning: - """ - The partitioning of the Dataset source, if discovered. 
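        Taken together, the pieces documented above compose into a short
        end-to-end sketch. This assumes the partitioned ``dataset_v2``
        directory written by the class-level example, and spells equality
        with ``==`` in the Expression API:

        .. code-block:: python

            import pyarrow.compute as pc
            import pyarrow.parquet as pq

            # Expression filter equivalent to the DNF tuple ("n_legs", "=", 4)
            dataset = pq.ParquetDataset("dataset_v2/", filters=pc.field("n_legs") == 4)

            print(dataset.files)  # absolute paths of the Parquet pieces in the source
            print(dataset.read(columns=["n_legs", "animal"]).to_pandas())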
- """ - -def read_table( - source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], - *, - columns: list | None = None, - use_threads: bool = True, - schema: Schema | None = None, - use_pandas_metadata: bool = False, - read_dictionary: list[str] | None = None, - memory_map: bool = False, - buffer_size: int = 0, - partitioning: str | list[str] | Partitioning | None = "hive", - filesystem: SupportedFileSystem | None = None, - filters: Expression | FilterTuple | list[FilterTuple] | None = None, - ignore_prefixes: list[str] | None = None, - pre_buffer: bool = True, - coerce_int96_timestamp_unit: str | None = None, - decryption_properties: FileDecryptionProperties | None = None, - thrift_string_size_limit: int | None = None, - thrift_container_size_limit: int | None = None, - page_checksum_verification: bool = False, -) -> Table: - """ - Read a Table from Parquet format - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. - use_pandas_metadata : bool, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. 
- Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns) - - - Examples - -------- - - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... 
) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) - - Read the data: - - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - - Read only a subset of columns: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) - pyarrow.Table - n_legs: int64 - animal: string - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] - - Read a subset of columns and read one column as DictionaryArray: - - >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) - pyarrow.Table - n_legs: int64 - animal: dictionary - ---- - n_legs: [[5],[2],[4,100],[2,4]] - animal: [ -- dictionary: - ["Brittle stars"] -- indices: - [0], -- dictionary: - ["Flamingo"] -- indices: - [0], -- dictionary: - ["Dog","Centipede"] -- indices: - [0,1], -- dictionary: - ["Parrot","Horse"] -- indices: - [0,1]] - - Read the table with filter: - - >>> pq.read_table( - ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] - ... ).to_pandas() - n_legs animal - 0 2 Flamingo - 1 2 Parrot - - Read data from a single Parquet file: - - >>> pq.write_table(table, "example.parquet") - >>> pq.read_table("dataset_name_2").to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - """ - -def read_pandas( - source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs -) -> Table: - """ - - Read a Table from Parquet format, also reading DataFrame - index values if known in the file metadata - - Parameters - ---------- - source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. - columns : list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e'. If empty, no columns will be read. Note - that the table will still have the correct num_rows set despite having - no columns. - use_threads : bool, default True - Perform multi-threaded column reads. - schema : Schema, optional - Optionally provide the Schema for the parquet dataset, in which case it - will not be inferred from the source. - read_dictionary : list, default None - List of names or column paths (for nested types) to read directly - as DictionaryArray. Only supported for BYTE_ARRAY storage. To read - a flat column as dictionary-encoded pass the column name. For - nested types, you must pass the full column "path", which could be - something like level1.level2.list.item. Refer to the Parquet - file's schema to obtain the paths. - memory_map : bool, default False - If the source is a file path, use a memory map to read file, which can - improve performance in some environments. - buffer_size : int, default 0 - If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered. - partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" - The partitioning scheme for a partitioned dataset. 
The default of "hive" - assumes directory names with key=value pairs like "/year=2009/month=11". - In addition, a scheme like "/2009/11" is also supported, in which case - you need to specify the field names or a full schema. See the - ``pyarrow.dataset.partitioning()`` function for more details. - **kwargs - additional options for :func:`read_table` - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None - Rows which do not match the filter predicate will be removed from scanned - data. Partition keys embedded in a nested directory structure will be - exploited to avoid loading files at all if they contain no matching rows. - Within-file level filtering and different partitioning schemes are supported. - - Predicates are expressed using an ``Expression`` or using - the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. - DNF allows arbitrary boolean logical combinations of single column predicates. - The innermost tuples each describe a single column predicate. The list of inner - predicates is interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines these filters - as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. - - Each tuple has format: (``key``, ``op``, ``value``) and compares the - ``key`` with the ``value``. - The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, - ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the - ``value`` must be a collection such as a ``list``, a ``set`` or a - ``tuple``. - - Examples: - - Using the ``Expression`` API: - - .. code-block:: python - - import pyarrow.compute as pc - pc.field('x') = 0 - pc.field('y').isin(['a', 'b', 'c']) - ~pc.field('y').isin({'a', 'b'}) - - Using the DNF format: - - .. code-block:: python - - ("x", "=", 0) - ("y", "in", ["a", "b", "c"]) - ("z", "not in", {"a", "b"}) - - - ignore_prefixes : list, optional - Files matching any of these prefixes will be ignored by the - discovery process. - This is matched to the basename of a path. - By default this is ['.', '_']. - Note that discovery happens only if a directory is passed as source. - pre_buffer : bool, default True - Coalesce and issue file reads in parallel to improve performance on - high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. If using a filesystem layer that itself - performs readahead (e.g. fsspec's S3FS), disable readahead for best - results. - coerce_int96_timestamp_unit : str, default None - Cast timestamps that are stored in INT96 format to a particular - resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' - and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. - decryption_properties : FileDecryptionProperties or None - File-level decryption properties. - The decryption properties can be created using - ``CryptoFactory.file_decryption_properties()``. - thrift_string_size_limit : int, default None - If not None, override the maximum total string size allocated - when decoding Thrift structures. 
The default limit should be - sufficient for most Parquet files. - thrift_container_size_limit : int, default None - If not None, override the maximum total size of containers allocated - when decoding Thrift structures. The default limit should be - sufficient for most Parquet files. - page_checksum_verification : bool, default False - If True, verify the checksum for each page read from the file. - - Returns - ------- - pyarrow.Table - Content of the file as a Table of Columns, including DataFrame - indexes as columns - """ - -def write_table( - table: Table, - where: str | Path | NativeFile | IO, - row_group_size: int | None = None, - version: Literal["1.0", "2.4", "2.6"] = "2.6", - use_dictionary: bool = True, - compression: _Compression | dict[str, _Compression] = "snappy", - write_statistics: bool | list = True, - use_deprecated_int96_timestamps: bool | None = None, - coerce_timestamps: str | None = None, - allow_truncated_timestamps: bool = False, - data_page_size: int | None = None, - flavor: str | None = None, - filesystem: SupportedFileSystem | None = None, - compression_level: int | dict | None = None, - use_byte_stream_split: bool = False, - column_encoding: str | dict | None = None, - data_page_version: Literal["1.0", "2.0"] = ..., - use_compliant_nested_type: bool = True, - encryption_properties: FileEncryptionProperties | None = None, - write_batch_size: int | None = None, - dictionary_pagesize_limit: int | None = None, - store_schema: bool = True, - write_page_index: bool = False, - write_page_checksum: bool = False, - sorting_columns: Sequence[SortingColumn] | None = None, - store_decimal_as_integer: bool = False, - **kwargs, -) -> None: - """ - - Write a Table to Parquet format. - - Parameters - ---------- - table : pyarrow.Table - where : string or pyarrow.NativeFile - row_group_size : int - Maximum number of rows in each written row group. If None, the - row group size will be the minimum of the Table size and - 1024 * 1024. - version : {"1.0", "2.4", "2.6"}, default "2.6" - Determine which Parquet logical types are available for use, whether the - reduced set from the Parquet 1.x.x format or the expanded logical types - added in later format versions. - Files written with version='2.4' or '2.6' may not be readable in all - Parquet implementations, so version='1.0' is likely the choice that - maximizes file compatibility. - UINT32 and some logical types are only available with version '2.4'. - Nanosecond timestamps are only available with version '2.6'. - Other features such as compression algorithms or the new serialized - data page format must be enabled separately (see 'compression' and - 'data_page_version'). - use_dictionary : bool or list, default True - Specify if we should use dictionary encoding in general or only for - some columns. - When encoding the column, if the dictionary size is too large, the - column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type - doesn't support dictionary encoding. - compression : str or dict, default 'snappy' - Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. - write_statistics : bool or list, default True - Specify if we should write statistics in general (default is True) or only - for some columns. - use_deprecated_int96_timestamps : bool, default None - Write timestamps to INT96 Parquet format. Defaults to False unless enabled - by flavor argument. This take priority over the coerce_timestamps option. 
- coerce_timestamps : str, default None - Cast timestamps to a particular resolution. If omitted, defaults are chosen - depending on `version`. For ``version='1.0'`` and ``version='2.4'``, - nanoseconds are cast to microseconds ('us'), while for - ``version='2.6'`` (the default), they are written natively without loss - of resolution. Seconds are always cast to milliseconds ('ms') by default, - as Parquet does not have any temporal type with seconds resolution. - If the casting results in loss of data, it will raise an exception - unless ``allow_truncated_timestamps=True`` is given. - Valid values: {None, 'ms', 'us'} - allow_truncated_timestamps : bool, default False - Allow loss of data when coercing timestamps to a particular - resolution. E.g. if microsecond or nanosecond data is lost when coercing to - 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` - will NOT result in the truncation exception being ignored unless - ``coerce_timestamps`` is not None. - data_page_size : int, default None - Set a target threshold for the approximate encoded size of data - pages within a column chunk (in bytes). If None, use the default data page - size of 1MByte. - flavor : {'spark'}, default None - Sanitize schema or set other compatibility options to work with - various target systems. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - compression_level : int or dict, default None - Specify the compression level for a codec, either on a general basis or - per-column. If None is passed, arrow selects the compression level for - the compression codec in use. The compression level has a different - meaning for each codec, so you have to read the documentation of the - codec you are using. - An exception is thrown if the compression codec does not allow specifying - a compression level. - use_byte_stream_split : bool or list, default False - Specify if the byte_stream_split encoding should be used in general or - only for some columns. If both dictionary and byte_stream_stream are - enabled, then dictionary is preferred. - The byte_stream_split encoding is valid for integer, floating-point - and fixed-size binary data types (including decimals); it should be - combined with a compression codec so as to achieve size reduction. - column_encoding : string or dict, default None - Specify the encoding scheme on a per column basis. - Can only be used when ``use_dictionary`` is set to False, and - cannot be used in combination with ``use_byte_stream_split``. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. - Certain encodings are only compatible with certain data types. - Please refer to the encodings section of `Reading and writing Parquet - files `_. - data_page_version : {"1.0", "2.0"}, default "1.0" - The serialized Parquet data page format version to write, defaults to - 1.0. This does not impact the file schema logical types and Arrow to - Parquet type casting behavior; for that use the "version" option. - use_compliant_nested_type : bool, default True - Whether to write compliant Parquet nested type (lists) as defined - `here `_, defaults to ``True``. 
- For ``use_compliant_nested_type=True``, this will write into a list - with 3-level structure where the middle level, named ``list``, - is a repeated group with a single field named ``element``:: - - group (LIST) { - repeated group list { - element; - } - } - - For ``use_compliant_nested_type=False``, this will also write into a list - with 3-level structure, where the name of the single field of the middle - level ``list`` is taken from the element name for nested columns in Arrow, - which defaults to ``item``:: - - group (LIST) { - repeated group list { - item; - } - } - encryption_properties : FileEncryptionProperties, default None - File encryption properties for Parquet Modular Encryption. - If None, no encryption will be done. - The encryption properties can be created using: - ``CryptoFactory.file_encryption_properties()``. - write_batch_size : int, default None - Number of values to write to a page at a time. If None, use the default of - 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages - are exceeding the ``data_page_size`` due to large column values, lowering - the batch size can help keep page sizes closer to the intended size. - dictionary_pagesize_limit : int, default None - Specify the dictionary page size limit per row group. If None, use the - default 1MB. - store_schema : bool, default True - By default, the Arrow schema is serialized and stored in the Parquet - file metadata (in the "ARROW:schema" key). When reading the file, - if this key is available, it will be used to more faithfully recreate - the original Arrow data. For example, for tz-aware timestamp columns - it will restore the timezone (Parquet only stores the UTC values without - timezone), or columns with duration type will be restored from the int64 - Parquet column. - write_page_index : bool, default False - Whether to write a page index in general for all columns. - Writing statistics to the page index disables the old method of writing - statistics to each data page header. The page index makes statistics-based - filtering more efficient than the page header, as it gathers all the - statistics for a Parquet file in a single place, avoiding scattered I/O. - Note that the page index is not yet used on the read size by PyArrow. - write_page_checksum : bool, default False - Whether to write page checksums in general for all columns. - Page checksums enable detection of data corruption, which might occur during - transmission or in the storage. - sorting_columns : Sequence of SortingColumn, default None - Specify the sort order of the data being written. The writer does not sort - the data nor does it verify that the data is sorted. The sort order is - written to the row group metadata, which can then be used by readers. - store_decimal_as_integer : bool, default False - Allow decimals with 1 <= precision <= 18 to be stored as integers. - In Parquet, DECIMAL can be stored in any of the following physical types: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. - Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the - unscaled value is used. - - By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use the following physical types to store decimals: - - int32: for 1 <= precision <= 9. - - int64: for 10 <= precision <= 18. 
- - fixed_len_byte_array: for precision > 18. - - As a consequence, decimal columns stored in integer types are more compact. - - **kwargs : optional - Additional options for ParquetWriter - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write the Table into Parquet file: - - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, "example.parquet") - - Defining row group size for the Parquet file: - - >>> pq.write_table(table, "example.parquet", row_group_size=3) - - Defining row group compression (default is Snappy): - - >>> pq.write_table(table, "example.parquet", compression="none") - - Defining row group compression and encoding per-column: - - >>> pq.write_table( - ... table, - ... "example.parquet", - ... compression={"n_legs": "snappy", "animal": "gzip"}, - ... use_dictionary=["n_legs", "animal"], - ... ) - - Defining column encoding per-column: - - >>> pq.write_table( - ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False - ... ) - """ - -def write_to_dataset( - table: Table, - root_path: str | Path, - partition_cols: list[str] | None = None, - filesystem: SupportedFileSystem | None = None, - schema: Schema | None = None, - partitioning: Partitioning | list[str] | None = None, - basename_template: str | None = None, - use_threads: bool | None = None, - file_visitor: Callable[[str], None] | None = None, - existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] - | None = None, - **kwargs, -) -> None: - """ - Wrapper around dataset.write_dataset for writing a Table to - Parquet format by partitions. - For each combination of partition columns and values, - a subdirectories are created in the following - manner: - - root_dir/ - group1=value1 - group2=value1 - .parquet - group2=value2 - .parquet - group1=valueN - group2=value1 - .parquet - group2=valueN - .parquet - - Parameters - ---------- - table : pyarrow.Table - root_path : str, pathlib.Path - The root directory of the dataset. - partition_cols : list, - Column names by which to partition the dataset. - Columns are partitioned in the order they are given. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - schema : Schema, optional - This Schema of the dataset. - partitioning : Partitioning or list[str], optional - The partitioning scheme specified with the - ``pyarrow.dataset.partitioning()`` function or a list of field names. - When providing a list of field names, you can use - ``partitioning_flavor`` to drive which partitioning type should be - used. - basename_template : str, optional - A template string used to generate basenames of written data files. - The token '{i}' will be replaced with an automatically incremented - integer. If not specified, it defaults to "guid-{i}.parquet". - use_threads : bool, default True - Write files in parallel. If enabled, then maximum parallelism will be - used determined by the number of available CPU cores. - file_visitor : function - If set, this function will be called with a WrittenFile instance - for each file created during the call. This object will have both - a path attribute and a metadata attribute. 
- - The path attribute will be a string containing the path to - the created file. - - The metadata attribute will be the parquet metadata of the file. - This metadata will have the file path attribute set and can be used - to build a _metadata file. The metadata attribute will be None if - the format is not parquet. - - Example visitor which simple collects the filenames created:: - - visited_paths = [] - - def file_visitor(written_file): - visited_paths.append(written_file.path) - - existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' - Controls how the dataset will handle data that already exists in - the destination. The default behaviour is 'overwrite_or_ignore'. - - 'overwrite_or_ignore' will ignore any existing data and will - overwrite files with the same name as an output file. Other - existing files will be ignored. This behavior, in combination - with a unique basename_template for each write, will allow for - an append workflow. - - 'error' will raise an error if any data exists in the destination. - - 'delete_matching' is useful when you are writing a partitioned - dataset. The first time each partition directory is encountered - the entire directory will be deleted. This allows you to overwrite - old partitions completely. - **kwargs : dict, - Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` - function for matching kwargs, and remainder to - :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. - See the docstring of :func:`write_table` and - :func:`pyarrow.dataset.write_dataset` for the available options. - Using `metadata_collector` in kwargs allows one to collect the - file metadata instances of dataset pieces. The file paths in the - ColumnChunkMetaData will be set relative to `root_path`. - - Examples - -------- - Generate an example PyArrow Table: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "year": [2020, 2022, 2021, 2022, 2019, 2021], - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - and write it to a partitioned dataset: - - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) - >>> pq.ParquetDataset("dataset_name_3").files - ['dataset_name_3/year=2019/...-0.parquet', ... - - Write a single Parquet file into the root folder: - - >>> pq.write_to_dataset(table, root_path="dataset_name_4") - >>> pq.ParquetDataset("dataset_name_4/").files - ['dataset_name_4/...-0.parquet'] - """ - -def write_metadata( - schema: Schema, - where: str | NativeFile, - metadata_collector: list[FileMetaData] | None = None, - filesystem: SupportedFileSystem | None = None, - **kwargs, -) -> None: - """ - Write metadata-only Parquet file from schema. This can be used with - `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar - files. - - Parameters - ---------- - schema : pyarrow.Schema - where : string or pyarrow.NativeFile - metadata_collector : list - where to collect metadata information. - filesystem : FileSystem, default None - If nothing passed, will be inferred from `where` if path-like, else - `where` is already a file-like object so no filesystem is needed. - **kwargs : dict, - Additional kwargs for ParquetWriter class. See docstring for - `ParquetWriter` for more information. - - Examples - -------- - Generate example data: - - >>> import pyarrow as pa - >>> table = pa.table( - ... { - ... "n_legs": [2, 2, 4, 4, 5, 100], - ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], - ... } - ... ) - - Write a dataset and collect metadata information. - - >>> metadata_collector = [] - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) - - Write the `_common_metadata` parquet file without row groups statistics. - - >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") - - Write the `_metadata` parquet file with row groups statistics. - - >>> pq.write_metadata( - ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector - ... ) - """ - -def read_metadata( - where: str | Path | IO | NativeFile, - memory_map: bool = False, - decryption_properties: FileDecryptionProperties | None = None, - filesystem: SupportedFileSystem | None = None, -) -> FileMetaData: - """ - Read FileMetaData from footer of a single Parquet file. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. - - Returns - ------- - metadata : FileMetaData - The metadata of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_metadata("example.parquet") - - created_by: parquet-cpp-arrow version ... - num_columns: 2 - num_rows: 3 - num_row_groups: 1 - format_version: 2.6 - serialized_size: ... - """ - -def read_schema( - where: str | Path | IO | NativeFile, - memory_map: bool = False, - decryption_properties: FileDecryptionProperties | None = None, - filesystem: SupportedFileSystem | None = None, -) -> Schema: - """ - Read effective Arrow schema from Parquet file metadata. - - Parameters - ---------- - where : str (file path) or file-like object - memory_map : bool, default False - Create memory map when the source is a file path. - decryption_properties : FileDecryptionProperties, default None - Decryption properties for reading encrypted Parquet files. - filesystem : FileSystem, default None - If nothing passed, will be inferred based on path. - Path will try to be found in the local on-disk filesystem otherwise - it will be parsed as an URI to determine the filesystem. 
- - Returns - ------- - schema : pyarrow.Schema - The schema of the Parquet file - - Examples - -------- - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) - >>> pq.write_table(table, "example.parquet") - - >>> pq.read_schema("example.parquet") - n_legs: int64 - animal: string - """ diff --git a/pyarrow-stubs/parquet/encryption.pyi b/pyarrow-stubs/parquet/encryption.pyi deleted file mode 100644 index 5a77dae7ef7..00000000000 --- a/pyarrow-stubs/parquet/encryption.pyi +++ /dev/null @@ -1,15 +0,0 @@ -from pyarrow._parquet_encryption import ( - CryptoFactory, - DecryptionConfiguration, - EncryptionConfiguration, - KmsClient, - KmsConnectionConfig, -) - -__all__ = [ - "CryptoFactory", - "DecryptionConfiguration", - "EncryptionConfiguration", - "KmsClient", - "KmsConnectionConfig", -] diff --git a/pyarrow-stubs/substrait.pyi b/pyarrow-stubs/substrait.pyi deleted file mode 100644 index a56a8a5b40f..00000000000 --- a/pyarrow-stubs/substrait.pyi +++ /dev/null @@ -1,21 +0,0 @@ -from pyarrow._substrait import ( - BoundExpressions, - SubstraitSchema, - deserialize_expressions, - deserialize_schema, - get_supported_functions, - run_query, - serialize_expressions, - serialize_schema, -) - -__all__ = [ - "BoundExpressions", - "get_supported_functions", - "run_query", - "deserialize_expressions", - "serialize_expressions", - "deserialize_schema", - "serialize_schema", - "SubstraitSchema", -] diff --git a/pyarrow-stubs/types.pyi b/pyarrow-stubs/types.pyi deleted file mode 100644 index 0cb4f6171d3..00000000000 --- a/pyarrow-stubs/types.pyi +++ /dev/null @@ -1,194 +0,0 @@ -import sys - -from typing import Any - -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -from pyarrow.lib import ( - BinaryType, - BinaryViewType, - BoolType, - DataType, - Date32Type, - Date64Type, - Decimal32Type, - Decimal64Type, - Decimal128Type, - Decimal256Type, - DenseUnionType, - DictionaryType, - DurationType, - FixedSizeBinaryType, - FixedSizeListType, - Float16Type, - Float32Type, - Float64Type, - Int8Type, - Int16Type, - Int32Type, - Int64Type, - LargeBinaryType, - LargeListType, - LargeListViewType, - LargeStringType, - ListType, - ListViewType, - MapType, - MonthDayNanoIntervalType, - NullType, - RunEndEncodedType, - SparseUnionType, - StringType, - StringViewType, - StructType, - Time32Type, - Time64Type, - TimestampType, - UInt8Type, - UInt16Type, - Uint32Type, - UInt64Type, -) - -_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type -_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type -_Integer: TypeAlias = _SignedInteger | _UnsignedInteger -_Floating: TypeAlias = Float16Type | Float32Type | Float64Type -_Decimal: TypeAlias = ( - Decimal32Type[Any, Any] - | Decimal64Type[Any, Any] - | Decimal128Type[Any, Any] - | Decimal256Type[Any, Any] -) -_Date: TypeAlias = Date32Type | Date64Type -_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] -_Interval: TypeAlias = MonthDayNanoIntervalType -_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval -_Union: TypeAlias = SparseUnionType | DenseUnionType -_Nested: TypeAlias = ( - ListType[Any] - | FixedSizeListType[Any, Any] - | LargeListType[Any] - | ListViewType[Any] - | LargeListViewType[Any] - | StructType - | 
MapType[Any, Any, Any] - | _Union -) - -def is_null(t: DataType) -> TypeIs[NullType]: ... -def is_boolean(t: DataType) -> TypeIs[BoolType]: ... -def is_integer(t: DataType) -> TypeIs[_Integer]: ... -def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... -def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... -def is_int8(t: DataType) -> TypeIs[Int8Type]: ... -def is_int16(t: DataType) -> TypeIs[Int16Type]: ... -def is_int32(t: DataType) -> TypeIs[Int32Type]: ... -def is_int64(t: DataType) -> TypeIs[Int64Type]: ... -def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... -def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... -def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... -def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... -def is_floating(t: DataType) -> TypeIs[_Floating]: ... -def is_float16(t: DataType) -> TypeIs[Float16Type]: ... -def is_float32(t: DataType) -> TypeIs[Float32Type]: ... -def is_float64(t: DataType) -> TypeIs[Float64Type]: ... -def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... -def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... -def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... -def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... -def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... -def is_struct(t: DataType) -> TypeIs[StructType]: ... -def is_union(t: DataType) -> TypeIs[_Union]: ... -def is_nested(t: DataType) -> TypeIs[_Nested]: ... -def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... -def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... -def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... -def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... -def is_time(t: DataType) -> TypeIs[_Time]: ... -def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... -def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... -def is_binary(t: DataType) -> TypeIs[BinaryType]: ... -def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... -def is_unicode(t: DataType) -> TypeIs[StringType]: ... -def is_string(t: DataType) -> TypeIs[StringType]: ... -def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... -def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... -def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... -def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... -def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... -def is_date(t: DataType) -> TypeIs[_Date]: ... -def is_date32(t: DataType) -> TypeIs[Date32Type]: ... -def is_date64(t: DataType) -> TypeIs[Date64Type]: ... -def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... -def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... -def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... -def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... -def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... -def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... -def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... -def is_interval(t: DataType) -> TypeIs[_Interval]: ... -def is_primitive(t: DataType) -> bool: ... 
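Because the predicates above are typed as ``TypeIs`` guards, a passing check narrows the ``DataType`` for a static type checker in addition to filtering at runtime. A minimal sketch of how these stubs are meant to be consumed (the narrowing is only visible to a checker such as mypy or pyright; runtime behaviour is unchanged):

.. code-block:: python

    import pyarrow as pa
    import pyarrow.types as pa_types

    def describe(t: pa.DataType) -> str:
        if pa_types.is_timestamp(t):
            # narrowed to TimestampType, so .unit and .tz are visible to the checker
            return f"timestamp[{t.unit}, tz={t.tz}]"
        if pa_types.is_decimal(t):
            # narrowed to the decimal union, which exposes .precision and .scale
            return f"decimal({t.precision}, {t.scale})"
        return str(t)

    print(describe(pa.timestamp("ms", tz="UTC")))  # timestamp[ms, tz=UTC]
    print(describe(pa.decimal128(38, 9)))          # decimal(38, 9)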
- -__all__ = [ - "is_binary", - "is_binary_view", - "is_boolean", - "is_date", - "is_date32", - "is_date64", - "is_decimal", - "is_decimal128", - "is_decimal256", - "is_decimal32", - "is_decimal64", - "is_dictionary", - "is_duration", - "is_fixed_size_binary", - "is_fixed_size_list", - "is_float16", - "is_float32", - "is_float64", - "is_floating", - "is_int16", - "is_int32", - "is_int64", - "is_int8", - "is_integer", - "is_interval", - "is_large_binary", - "is_large_list", - "is_large_list_view", - "is_large_string", - "is_large_unicode", - "is_list", - "is_list_view", - "is_map", - "is_nested", - "is_null", - "is_primitive", - "is_run_end_encoded", - "is_signed_integer", - "is_string", - "is_string_view", - "is_struct", - "is_temporal", - "is_time", - "is_time32", - "is_time64", - "is_timestamp", - "is_uint16", - "is_uint32", - "is_uint64", - "is_uint8", - "is_unicode", - "is_union", - "is_unsigned_integer", -] diff --git a/pyarrow-stubs/util.pyi b/pyarrow-stubs/util.pyi deleted file mode 100644 index c2ecf7d6b61..00000000000 --- a/pyarrow-stubs/util.pyi +++ /dev/null @@ -1,27 +0,0 @@ -from collections.abc import Callable -from os import PathLike -from typing import Any, Protocol, Sequence, TypeVar - -_F = TypeVar("_F", bound=Callable) -_N = TypeVar("_N") - -class _DocStringComponents(Protocol): - _docstring_components: list[str] - -def doc( - *docstrings: str | _DocStringComponents | Callable | None, **params: Any -) -> Callable[[_F], _F]: ... -def _is_iterable(obj) -> bool: ... -def _is_path_like(path) -> bool: ... -def _stringify_path(path: str | PathLike) -> str: ... -def product(seq: Sequence[_N]) -> _N: ... -def get_contiguous_span( - shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int -) -> tuple[int, int]: ... -def find_free_port() -> int: ... -def guid() -> str: ... -def _download_urllib(url, out_path) -> None: ... -def _download_requests(url, out_path) -> None: ... -def download_tzdata_on_windows() -> None: ... -def _deprecate_api(old_name, new_name, api, next_version, type=...): ... -def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... 
diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index db614454ac3..00000000000 --- a/pyproject.toml +++ /dev/null @@ -1,99 +0,0 @@ -#:schema https://json.schemastore.org/pyproject.json - -[project] -name = "pyarrow-stubs" -version = "20.0.0.20250716" -description = "Type annotations for pyarrow" -authors = [{ name = "ZhengYu, Xu", email = "zen-xu@outlook.com" }] -license = "BSD-2-Clause" -classifiers = [ - "License :: OSI Approved :: BSD License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", -] -requires-python = ">=3.9,<4" -dependencies = ["pyarrow >=20"] - -[project.urls] -homepage = "https://github.com/zen-xu/pyarrow-stubs" -repository = "https://github.com/zen-xu/pyarrow-stubs.git" -issues = "https://github.com/zen-xu/pyarrow-stubs/issues" - -[build-system] -build-backend = "hatchling.build" -requires = ["hatchling"] - -[tool.hatch.build.targets.wheel] -packages = ["pyarrow-stubs"] - -[tool.isort] -profile = "black" - -[tool.pixi.project] -channels = ["conda-forge"] -platforms = ["win-64", "linux-64", "osx-64", "osx-arm64"] - -[tool.pixi.dependencies] -python = "3.11.*" -pip = "*" - -[tool.pixi.pypi-dependencies] -pyarrow-stubs = { path = ".", editable = true } -ipython = "*" -scipy = "*" -pre-commit = "*" -ruff = ">=0.5" -types-cffi = "*" -pandas-stubs = "*" -hatchling = "*" -fsspec = "*" -sparse = "*" -pyright = { version = ">=1.1.385,<2", extras = ["nodejs"] } - -[tool.pixi.tasks] -pyright = { cmd = "pyright" } -pre-commit = { cmd = "pre-commit" } - -[tool.ruff] -fix = true -line-length = 99 -target-version = "py38" - -[tool.ruff.lint] -extend-select = [ - "I", # isort - "N", # pep8-naming - "PYI", # flake8-pyi -] -ignore = [ - "PYI011", # typed-argument-default-in-stub - "PYI021", # docstring-in-stub - "PYI015", # assignment-default-in-stub - "PYI063", # pep484-style-positional-only-parameter - "N818", # error-suffix-on-exception-name -] - -[tool.ruff.lint.isort] -lines-after-imports = 2 -lines-between-types = 1 - -[tool.ruff.format] -docstring-code-format = true - -[tool.pyright] -typeCheckingMode = "strict" -reportMissingImports = false -reportPrivateUsage = false -reportUnknownParameterType = false -reportMissingTypeArgument = false -reportMissingParameterType = false -reportMissingTypeStubs = false -reportUnknownVariableType = false -reportUnknownArgumentType = false -reportUnknownMemberType = false diff --git a/LICENSE b/python/stubs/LICENSE similarity index 100% rename from LICENSE rename to python/stubs/LICENSE diff --git a/taplo.toml b/taplo.toml deleted file mode 100644 index 69418d9d7de..00000000000 --- a/taplo.toml +++ /dev/null @@ -1,5 +0,0 @@ -include = ["*.toml"] - -[formatting] -align_entries = true -inline_table_expand = false From e1d078ff48e84edb97cd96e6e5bdab51f202bcf7 Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sun, 20 Jul 2025 12:37:54 +0200 Subject: [PATCH 223/231] Exclude `stubs` from `rat` test --- dev/release/rat_exclude_files.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 51c01516e7c..c4dc26e7784 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -95,6 +95,7 @@ python/pyarrow/tests/__init__.py python/pyarrow/vendored/* python/pyarrow/src/arrow/python/vendored/* python/requirements*.txt +python/stubs/* pax_global_header MANIFEST.in __init__.pxd From aa3289c9c2aa944c9442a57dff808e2bae1e50e4 Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 13:19:41 +0200 Subject: [PATCH 224/231] Add Apache licence clause to `py.typed` --- python/py.typed | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/py.typed b/python/py.typed index e69de29bb2d..13a83393a91 100644 --- a/python/py.typed +++ b/python/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. From e41ac42bb186fd59100347a3bc55aabeb7ccaa7a Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 14:47:48 +0200 Subject: [PATCH 225/231] Reduce list --- python/pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 49ff3ae3f86..8ea499560f7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -102,19 +102,15 @@ fallback_version = '21.0.0a0' environment.root = [ "./pyarrow", ] -rules.call-non-callable = "ignore" rules.invalid-argument-type = "ignore" rules.invalid-assignment = "ignore" rules.invalid-context-manager = "ignore" -rules.invalid-ignore-comment = "ignore" rules.invalid-return-type = "ignore" rules.invalid-type-form = "ignore" -rules.missing-argument = "ignore" rules.no-matching-overload = "ignore" rules.non-subscriptable = "ignore" rules.not-iterable = "ignore" rules.possibly-unbound-attribute = "ignore" -rules.possibly-unbound-import = "ignore" rules.too-many-positional-arguments = "ignore" rules.unknown-argument = "ignore" rules.unresolved-attribute = "ignore" From 1d3fe980b955829e63db45dedcbe5497d7938d7c Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Roddy" Date: Sun, 20 Jul 2025 15:22:08 +0200 Subject: [PATCH 226/231] Add `ty` as a step in the action --- .github/workflows/python.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15dfa11fc4c..eac0105f158 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -138,6 +138,11 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} + - name: Type check with ty + run: |- + python -m pip install ty + ty check + macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 runs-on: macos-${{ matrix.macos-version }} From 9e6c31d0423c73a5c21ad431b62b8bb9177583cb Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 15:46:54 +0200 Subject: [PATCH 227/231] Run in the correct directory --- .github/workflows/python.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index eac0105f158..61960381405 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -139,9 +139,10 @@ jobs: run: archery docker push ${{ matrix.image }} - name: Type check with ty + working-directory: python run: |- - python -m pip install ty - ty check + python -m pip install ty check + python -m ty check macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 From b74bacc3854e8fd0806b19975b5247317ec1918c Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 15:54:15 +0200 Subject: [PATCH 228/231] Remove `check` from `pip` --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 61960381405..15906a10ac0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -141,7 +141,7 @@ jobs: - name: Type check with ty working-directory: python run: |- - python -m pip install ty check + python -m pip install ty python -m ty check macos: From 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 16:32:41 +0200 Subject: [PATCH 229/231] Fix `unresolved-reference` error --- python/pyproject.toml | 1 - python/stubs/__lib_pxi/types.pyi | 3 ++- python/stubs/_fs.pyi | 4 ++-- python/stubs/compute.pyi | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8ea499560f7..e1c03b321a1 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -116,5 +116,4 @@ rules.unknown-argument = "ignore" rules.unresolved-attribute = "ignore" rules.unresolved-global = "ignore" rules.unresolved-import = "ignore" -rules.unresolved-reference = "ignore" rules.unsupported-operator = "ignore" diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index 7fe6c36e332..a7b6062b275 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -29,7 +29,6 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -186,6 +185,8 @@ class DataType(_Weakrefable): ArrowSchema pointer. """ +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... 
diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 7670ef5230d..9f6e28dcf0f 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -19,8 +19,6 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() @@ -618,6 +616,8 @@ class FileSystem(_Weakrefable): The normalized path """ +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class LocalFileSystem(FileSystem): """ A FileSystem implementation accessing files on the local machine. diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 8d8fc35b134..7eb3445f640 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -216,9 +216,9 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) From 35635750f9ed6a8b0e60dc2dedc2fa0f00daaa10 Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Sun, 20 Jul 2025 16:33:52 +0200 Subject: [PATCH 230/231] Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. --- python/pyproject.toml | 1 + python/stubs/__lib_pxi/types.pyi | 3 +-- python/stubs/_fs.pyi | 4 ++-- python/stubs/compute.pyi | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index e1c03b321a1..8ea499560f7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -116,4 +116,5 @@ rules.unknown-argument = "ignore" rules.unresolved-attribute = "ignore" rules.unresolved-global = "ignore" rules.unresolved-import = "ignore" +rules.unresolved-reference = "ignore" rules.unsupported-operator = "ignore" diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index a7b6062b275..7fe6c36e332 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -29,6 +29,7 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -185,8 +186,6 @@ class DataType(_Weakrefable): ArrowSchema pointer. """ -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) - class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... 
diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 9f6e28dcf0f..7670ef5230d 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -19,6 +19,8 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() @@ -616,8 +618,6 @@ class FileSystem(_Weakrefable): The normalized path """ -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - class LocalFileSystem(FileSystem): """ A FileSystem implementation accessing files on the local machine. diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 7eb3445f640..8d8fc35b134 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -216,9 +216,9 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) From 65416d6a75ab01f3b72dd3282e217535b058b575 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 22 Jul 2025 02:40:13 +0200 Subject: [PATCH 231/231] Fix invalid-type-form errors --- python/pyproject.toml | 1 - python/stubs/__lib_pxi/array.pyi | 3 ++- python/stubs/__lib_pxi/io.pyi | 3 ++- python/stubs/__lib_pxi/table.pyi | 3 ++- python/stubs/compute.pyi | 3 ++- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8ea499560f7..d36d1d5234c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -106,7 +106,6 @@ rules.invalid-argument-type = "ignore" rules.invalid-assignment = "ignore" rules.invalid-context-manager = "ignore" rules.invalid-return-type = "ignore" -rules.invalid-type-form = "ignore" rules.no-matching-overload = "ignore" rules.non-subscriptable = "ignore" rules.not-iterable = "ignore" diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index ec1cda30a88..ffdb8a9c075 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -1,3 +1,4 @@ +import builtins import datetime as dt import sys @@ -1990,7 +1991,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): @overload def __getitem__(self, key: int) -> _Scalar_co: ... @overload - def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key: builtins.slice) -> Self: ... def __getitem__(self, key): """ Slice or return value at given index diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi index d882fd79d57..37c8aefb06b 100644 --- a/python/stubs/__lib_pxi/io.pyi +++ b/python/stubs/__lib_pxi/io.pyi @@ -1,3 +1,4 @@ +import builtins import sys from collections.abc import Callable @@ -578,7 +579,7 @@ class Buffer(_Weakrefable): @property def parent(self) -> Buffer | None: ... @overload - def __getitem__(self, key: slice) -> Self: ... 
+ def __getitem__(self, key: builtins.slice) -> Self: ... @overload def __getitem__(self, key: int) -> int: ... def slice(self, offset: int = 0, length: int | None = None) -> Self: diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index ad9d0392137..ad34e9b6dff 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -1,3 +1,4 @@ +import builtins import datetime as dt import sys @@ -294,7 +295,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): """ def __sizeof__(self) -> int: ... @overload - def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key: builtins.slice) -> Self: ... @overload def __getitem__(self, key: int) -> _Scalar_co: ... def __getitem__(self, key): diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 8d8fc35b134..f9039731ee6 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -93,6 +93,7 @@ from . import lib _P = ParamSpec("_P") _R = TypeVar("_R") +_CallableType = Callable[_P, _R] def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: """Reference a column of the dataset. @@ -156,7 +157,7 @@ def scalar(value: bool | float | str) -> Expression: An Expression representing the scalar value """ -def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... +def _clone_signature(f: _CallableType) -> _CallableType: ... # ============= compute functions ============= _DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
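
Note on the final patch (231): the `invalid-type-form` errors it addresses most likely come from the fact that `Buffer`, `Array`, and `ChunkedArray` each define a `slice` method, which shadows the builtin `slice` type inside those class bodies, so a bare `key: slice` annotation on `__getitem__` names the method rather than the type. Qualifying the annotation as `builtins.slice` resolves the name unambiguously, which is exactly the change made in the stubs. The snippet below is a minimal, self-contained sketch of the shadowing problem and the fix; the simplified method bodies, return types, and the usage line are illustrative only and are not taken from the stub files.

from __future__ import annotations

import builtins


class Buffer:
    # Inside this class body the name `slice` refers to the method below,
    # not to the builtin type, so an annotation written as `key: slice`
    # would be flagged by a type checker such as ty as an invalid type form.
    def slice(self, offset: int = 0, length: int | None = None) -> Buffer:
        return self

    # Qualifying the annotation with `builtins.slice` names the builtin type
    # explicitly, mirroring what patch 231 does for these __getitem__ overloads.
    def __getitem__(self, key: builtins.slice) -> Buffer:
        return self


buf = Buffer()
print(buf[0:2])  # __getitem__ receives a builtins.slice instance

The same importing-`builtins` pattern is applied in `array.pyi`, `io.pyi`, and `table.pyi` above, so each stub keeps its `slice` method while still annotating slice-based indexing correctly.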