diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index da2fe966475..6d2fae99e44 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -41,7 +41,7 @@
 except ImportError:
     # Package is not installed, parse git tag at runtime
     try:
-        import setuptools_scm
+        import setuptools_scm  # type: ignore[import-untyped]

         # Code duplicated from setup.py to avoid a dependency on each other
         def parse_git(root, **kwargs):
@@ -49,16 +49,16 @@ def parse_git(root, **kwargs):
             Parse function for setuptools_scm that ignores tags for non-C++
             subprojects, e.g. apache-arrow-js-XXX tags.
             """
-            from setuptools_scm.git import parse
+            from setuptools_scm.git import parse  # type: ignore[import-untyped]
             kwargs['describe_command'] = \
                 "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
             return parse(root, **kwargs)
         __version__ = setuptools_scm.get_version('../', parse=parse_git)
     except ImportError:
-        __version__ = None
+        __version__ = ""

-import pyarrow.lib as _lib
+import pyarrow.lib as _lib  # type: ignore[import-not-found]
 from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo,
                          set_timezone_db_path, MonthDayNano, VersionInfo,
                          build_info, cpp_build_info, cpp_version, cpp_version_info,
                          runtime_info,
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index fe0afdb0a87..feaafe37323 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.

-from pyarrow._compute import (  # noqa
+from pyarrow._compute import (  # type: ignore[import-not-found] # noqa
     Function,
     FunctionOptions,
     FunctionRegistry,
@@ -251,7 +251,7 @@ def wrapper(*args, memory_pool=None):
                 return Expression._call(func_name, list(args))
             return func.call(args, None, memory_pool)
     else:
-        def wrapper(*args, memory_pool=None, options=None, **kwargs):
+        def wrapper(*args, memory_pool=None, options=None, **kwargs):  # type: ignore
             if arity is not Ellipsis:
                 if len(args) < arity:
                     raise TypeError(
diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
index 157dbdf9380..089bb1473b4 100644
--- a/python/pyarrow/fs.py
+++ b/python/pyarrow/fs.py
@@ -21,7 +21,7 @@

 from pyarrow.util import _is_path_like, _stringify_path

-from pyarrow._fs import (  # noqa
+from pyarrow._fs import (  # type: ignore[import-not-found] # noqa
     FileSelector,
     FileType,
     FileInfo,
@@ -40,22 +40,22 @@
 _not_imported = []

 try:
-    from pyarrow._azurefs import AzureFileSystem  # noqa
+    from pyarrow._azurefs import AzureFileSystem  # type: ignore[import-not-found] # noqa
 except ImportError:
     _not_imported.append("AzureFileSystem")

 try:
-    from pyarrow._hdfs import HadoopFileSystem  # noqa
+    from pyarrow._hdfs import HadoopFileSystem  # type: ignore[import-not-found] # noqa
 except ImportError:
     _not_imported.append("HadoopFileSystem")

 try:
-    from pyarrow._gcsfs import GcsFileSystem  # noqa
+    from pyarrow._gcsfs import GcsFileSystem  # type: ignore[import-not-found] # noqa
 except ImportError:
     _not_imported.append("GcsFileSystem")

 try:
-    from pyarrow._s3fs import (  # noqa
+    from pyarrow._s3fs import (  # type: ignore[import-not-found] # noqa
         AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
         S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized,
         finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region)
@@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False):
     else:
         # handle fsspec-compatible filesystems
         try:
-            import fsspec
+            import fsspec  # type: ignore[import-untyped]
         except ImportError:
             pass
         else:
diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index ddbceabcb00..b2631a5736e 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -22,7 +22,6 @@
     Any,
     Dict,
     Iterable,
-    Optional,
     Tuple,
 )

@@ -122,13 +121,13 @@ class ColumnBuffers(TypedDict):
     # first element is a buffer containing mask values indicating missing data;
     # second element is the mask value buffer's associated dtype.
     # None if the null representation is not a bit or byte mask
-    validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
+    validity: Tuple[_PyArrowBuffer, Dtype] | None

     # first element is a buffer containing the offset values for
     # variable-size binary data (e.g., variable-length strings);
     # second element is the offsets buffer's associated dtype.
     # None if the data buffer does not have an associated offsets buffer
-    offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
+    offsets: Tuple[_PyArrowBuffer, Dtype] | None


 class CategoricalDescription(TypedDict):
@@ -139,7 +138,7 @@ class CategoricalDescription(TypedDict):
     is_dictionary: bool
     # Python-level only (e.g. ``{int: str}``).
     # None if not a dictionary-style categorical.
-    categories: Optional[_PyArrowColumn]
+    categories: _PyArrowColumn | None


 class Endianness:
@@ -314,13 +313,20 @@ def _dtype_from_arrowdtype(
             kind = DtypeKind.CATEGORICAL
             arr = self._col
             indices_dtype = arr.indices.type
-            _, f_string = _PYARROW_KINDS.get(indices_dtype)
+            indices_dtype_tuple = _PYARROW_KINDS.get(indices_dtype)
+            if indices_dtype_tuple is None:
+                raise ValueError(
+                    f"Data type {indices_dtype} not supported by interchange protocol"
+                )
+            _, f_string = indices_dtype_tuple
             return kind, bit_width, f_string, Endianness.NATIVE
         else:
-            kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
-            if kind is None:
+            optional_kind, f_string = _PYARROW_KINDS.get(dtype, (None, ""))
+            if optional_kind is None:
                 raise ValueError(
-                    f"Data type {dtype} not supported by interchange protocol")
+                    f"Data type {dtype} not supported by interchange protocol"
+                )
+            kind = optional_kind

             return kind, bit_width, f_string, Endianness.NATIVE

@@ -379,7 +385,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
         return ColumnNullType.USE_BITMASK, 0

     @property
-    def null_count(self) -> int:
+    def null_count(self) -> int | None:
         """
         Number of null elements, if known.

@@ -394,7 +400,7 @@ def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the column. See `DataFrame.metadata` for more details.
         """
-        pass
+        return {}

     def num_chunks(self) -> int:
         """
@@ -403,7 +409,7 @@ def num_chunks(self) -> int:
         return 1

     def get_chunks(
-        self, n_chunks: Optional[int] = None
+        self, n_chunks: int | None = None
     ) -> Iterable[_PyArrowColumn]:
         """
         Return an iterator yielding the chunks.
@@ -486,6 +492,11 @@ def _get_data_buffer(
             return _PyArrowBuffer(array.buffers()[1]), dtype
         elif n == 3:
             return _PyArrowBuffer(array.buffers()[2]), dtype
+        else:
+            raise ValueError(
+                "Column data buffer must have 2 or 3 buffers, "
+                f"but has {n} buffers: {array.buffers()}"
+            )

     def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
         """
@@ -505,7 +516,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
                 "There are no missing values so "
                 "does not have a separate mask")

-    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any] | None:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
         and the buffer's associated dtype.
@@ -527,3 +538,4 @@ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
             else:
                 dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
             return _PyArrowBuffer(array.buffers()[1]), dtype
+        return None
diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
index 59ba765c175..87c888ae761 100644
--- a/python/pyarrow/interchange/dataframe.py
+++ b/python/pyarrow/interchange/dataframe.py
@@ -19,7 +19,6 @@
 from typing import (
     Any,
     Iterable,
-    Optional,
     Sequence,
 )

@@ -174,7 +173,7 @@ def select_columns_by_name(
         )

     def get_chunks(
-        self, n_chunks: Optional[int] = None
+        self, n_chunks: int | None = None
     ) -> Iterable[_PyArrowDataFrame]:
         """
         Return an iterator yielding the chunks.
diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py
index fcaec41e3dc..29086d9ed6d 100644
--- a/python/pyarrow/interchange/from_dataframe.py
+++ b/python/pyarrow/interchange/from_dataframe.py
@@ -346,7 +346,7 @@ def buffers_to_array(
     buffers: ColumnBuffers,
     data_type: Tuple[DtypeKind, int, str, str],
     length: int,
-    describe_null: ColumnNullType,
+    describe_null: tuple[ColumnNullType, int],
     offset: int = 0,
     allow_copy: bool = True,
 ) -> pa.Array:
@@ -383,14 +383,12 @@
         the returned PyArrow array is being used.
     """
     data_buff, _ = buffers["data"]
-    try:
-        validity_buff, validity_dtype = buffers["validity"]
-    except TypeError:
-        validity_buff = None
-    try:
-        offset_buff, offset_dtype = buffers["offsets"]
-    except TypeError:
-        offset_buff = None
+    validity_buff, validity_dtype = (
+        buffers["validity"] if buffers["validity"] else (None, None)
+    )
+    offset_buff, offset_dtype = (
+        buffers["offsets"] if buffers["offsets"] else (None, None)
+    )

     # Construct a pyarrow Buffer
     data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
@@ -398,6 +396,7 @@

     # Construct a validity pyarrow Buffer, if applicable
     if validity_buff:
+        assert validity_dtype is not None
         validity_pa_buff = validity_buffer_from_mask(validity_buff,
                                                      validity_dtype,
                                                      describe_null,
@@ -416,6 +415,7 @@
         data_dtype = map_date_type(data_type)

     if offset_buff:
+        assert offset_dtype is not None
         _, offset_bit_width, _, _ = offset_dtype
         # If an offset buffer exists, construct an offset pyarrow Buffer
         # and add it to the construction of an array
@@ -450,7 +450,7 @@ def validity_buffer_from_mask(
     validity_buff: BufferObject,
     validity_dtype: Dtype,
-    describe_null: ColumnNullType,
+    describe_null: tuple[ColumnNullType, int],
     length: int,
     offset: int = 0,
     allow_copy: bool = True,
 ) -> pa.Buffer:
@@ -513,7 +513,7 @@
                                           offset=offset)

         if sentinel_val == 1:
-            mask_bool = pc.invert(mask_bool)
+            mask_bool = pc.invert(mask_bool)  # type: ignore # (missing stubs)

         return mask_bool.buffers()[1]

@@ -529,7 +529,7 @@ def validity_buffer_nan_sentinel(
     data_pa_buffer: BufferObject,
     data_type: Dtype,
-    describe_null: ColumnNullType,
+    describe_null: tuple[ColumnNullType, int],
     length: int,
     offset: int = 0,
     allow_copy: bool = True,
 ) -> pa.Buffer:
@@ -583,8 +583,8 @@
             [None, data_pa_buffer],
             offset=offset,
         )
-        mask = pc.is_nan(pyarrow_data)
-        mask = pc.invert(mask)
+        mask = pc.is_nan(pyarrow_data)  # type: ignore # (missing stubs)
+        mask = pc.invert(mask)  # type: ignore # (missing stubs)
         return mask.buffers()[1]

     # Check for sentinel values
@@ -603,8 +603,9 @@
             length,
             [None, data_pa_buffer],
             offset=offset)
-        sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
-        mask_bool = pc.invert(sentinel_arr)
+        # typing ignores due to missing stubs
+        sentinel_arr = pc.equal(pyarrow_data, sentinel_val)  # type: ignore
+        mask_bool = pc.invert(sentinel_arr)  # type: ignore

         return mask_bool.buffers()[1]
     elif null_kind == ColumnNullType.NON_NULLABLE:
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index 4e236678788..7433e108651 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -21,7 +21,7 @@

 import pyarrow as pa

-from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats,  # noqa
+from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats,  # type: ignore[import-not-found] # noqa
                          Message, MessageReader,
                          RecordBatchReader, _ReadPandasMixin,
                          MetadataVersion, Alignment,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index ab4e5d1b992..54d4a595b6d 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -20,7 +20,7 @@

 from enum import IntEnum

-from pyarrow.lib import (is_boolean_value,  # noqa
+from pyarrow.lib import (is_boolean_value,  # type: ignore[import-not-found] # noqa
                          is_integer_value,
                          is_float_value)

diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py
index 6c4d6e01400..1e24fec0e08 100644
--- a/python/pyarrow/vendored/docscrape.py
+++ b/python/pyarrow/vendored/docscrape.py
@@ -618,7 +618,7 @@ def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc,
         self._cls = cls

         if 'sphinx' in sys.modules:
-            from sphinx.ext.autodoc import ALL
+            from sphinx.ext.autodoc import ALL  # type: ignore[import-not-found]
         else:
             ALL = object()