8 changes: 4 additions & 4 deletions python/pyarrow/__init__.py
@@ -41,24 +41,24 @@
except ImportError:
# Package is not installed, parse git tag at runtime
try:
import setuptools_scm
import setuptools_scm # type: ignore[import-untyped]
Owner Author: Example of an external dependency that neither provides stubs nor is typed.

# Code duplicated from setup.py to avoid a dependency on each other

def parse_git(root, **kwargs):
"""
Parse function for setuptools_scm that ignores tags for non-C++
subprojects, e.g. apache-arrow-js-XXX tags.
"""
from setuptools_scm.git import parse
from setuptools_scm.git import parse # type: ignore[import-untyped]
kwargs['describe_command'] = \
"git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
return parse(root, **kwargs)
__version__ = setuptools_scm.get_version('../',
parse=parse_git)
except ImportError:
__version__ = None
__version__ = ""

import pyarrow.lib as _lib
import pyarrow.lib as _lib # type: ignore[import-not-found]
Owner Author: Example of internal stubs missing.
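
As a brief aside (not part of the diff itself), the two mypy error codes used throughout this PR differ as follows: import-untyped is reported when a module is installed but ships no stubs or py.typed marker, while import-not-found is reported when mypy cannot resolve the module at all, as with the compiled pyarrow.lib extension in the absence of stubs. A minimal sketch of both, using narrow error codes so that other errors on the same line stay visible:

import setuptools_scm  # type: ignore[import-untyped]  # installed, but untyped
import pyarrow.lib as _lib  # type: ignore[import-not-found]  # mypy cannot resolve the C extension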

from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path,
MonthDayNano, VersionInfo, build_info, cpp_build_info,
cpp_version, cpp_version_info, runtime_info,
4 changes: 2 additions & 2 deletions python/pyarrow/compute.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from pyarrow._compute import ( # noqa
from pyarrow._compute import ( # type: ignore[import-not-found] # noqa
Function,
FunctionOptions,
FunctionRegistry,
@@ -251,7 +251,7 @@ def wrapper(*args, memory_pool=None):
return Expression._call(func_name, list(args))
return func.call(args, None, memory_pool)
else:
def wrapper(*args, memory_pool=None, options=None, **kwargs):
def wrapper(*args, memory_pool=None, options=None, **kwargs): # type: ignore
Owner Author: This is a nasty one, as the wrapper signature is inconsistent between the two logical branches. I couldn't find a quick fix.
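
For context, a minimal sketch of the pattern mypy objects to here (illustrative names, not the actual pyarrow code): the same name is conditionally bound to functions with different signatures, and mypy requires conditionally defined functions to have compatible signatures, hence the blanket ignore on the second definition.

def make_wrapper(expression_only: bool):
    if expression_only:
        def wrapper(*args, memory_pool=None):
            return ("expression", args, memory_pool)
    else:
        # mypy flags this redefinition because its signature differs from the
        # variant in the branch above.
        def wrapper(*args, memory_pool=None, options=None, **kwargs):  # type: ignore
            return ("call", args, memory_pool, options, kwargs)
    return wrapper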

if arity is not Ellipsis:
if len(args) < arity:
raise TypeError(
12 changes: 6 additions & 6 deletions python/pyarrow/fs.py
@@ -21,7 +21,7 @@

from pyarrow.util import _is_path_like, _stringify_path

from pyarrow._fs import ( # noqa
from pyarrow._fs import ( # type: ignore[import-not-found] # noqa
FileSelector,
FileType,
FileInfo,
@@ -40,22 +40,22 @@

_not_imported = []
try:
from pyarrow._azurefs import AzureFileSystem # noqa
from pyarrow._azurefs import AzureFileSystem # type: ignore[import-not-found] # noqa
except ImportError:
_not_imported.append("AzureFileSystem")

try:
from pyarrow._hdfs import HadoopFileSystem # noqa
from pyarrow._hdfs import HadoopFileSystem # type: ignore[import-not-found] # noqa
except ImportError:
_not_imported.append("HadoopFileSystem")

try:
from pyarrow._gcsfs import GcsFileSystem # noqa
from pyarrow._gcsfs import GcsFileSystem # type: ignore[import-not-found] # noqa
except ImportError:
_not_imported.append("GcsFileSystem")

try:
from pyarrow._s3fs import ( # noqa
from pyarrow._s3fs import ( # type: ignore[import-not-found] # noqa
AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized,
finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region)
@@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False):
else:
# handle fsspec-compatible filesystems
try:
import fsspec
import fsspec # type: ignore[import-untyped]
except ImportError:
pass
else:
36 changes: 24 additions & 12 deletions python/pyarrow/interchange/column.py
@@ -22,7 +22,6 @@
Any,
Dict,
Iterable,
Optional,
Tuple,
)

@@ -122,13 +121,13 @@ class ColumnBuffers(TypedDict):
# first element is a buffer containing mask values indicating missing data;
# second element is the mask value buffer's associated dtype.
# None if the null representation is not a bit or byte mask
validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
validity: Tuple[_PyArrowBuffer, Dtype] | None

# first element is a buffer containing the offset values for
# variable-size binary data (e.g., variable-length strings);
# second element is the offsets buffer's associated dtype.
# None if the data buffer does not have an associated offsets buffer
offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
offsets: Tuple[_PyArrowBuffer, Dtype] | None


class CategoricalDescription(TypedDict):
@@ -139,7 +138,7 @@ class CategoricalDescription(TypedDict):
is_dictionary: bool
# Python-level only (e.g. ``{int: str}``).
# None if not a dictionary-style categorical.
categories: Optional[_PyArrowColumn]
categories: _PyArrowColumn | None


class Endianness:
@@ -314,13 +313,20 @@ def _dtype_from_arrowdtype(
kind = DtypeKind.CATEGORICAL
arr = self._col
indices_dtype = arr.indices.type
_, f_string = _PYARROW_KINDS.get(indices_dtype)
indices_dtype_tuple = _PYARROW_KINDS.get(indices_dtype)
if indices_dtype_tuple is None:
raise ValueError(
f"Data type {indices_dtype} not supported by interchange protocol"
)
_, f_string = indices_dtype_tuple
return kind, bit_width, f_string, Endianness.NATIVE
else:
kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
if kind is None:
optional_kind, f_string = _PYARROW_KINDS.get(dtype, (None, ""))
if optional_kind is None:
raise ValueError(
f"Data type {dtype} not supported by interchange protocol")
f"Data type {dtype} not supported by interchange protocol"
)
kind = optional_kind

return kind, bit_width, f_string, Endianness.NATIVE

@@ -379,7 +385,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
return ColumnNullType.USE_BITMASK, 0

@property
def null_count(self) -> int:
def null_count(self) -> int | None:
Owner Author: Using the | operator here, which is only available from Python 3.10 - do we rather want to stick to Optional[X], which is supported in earlier versions? Or maybe rather add from __future__ import annotations? In any case, we should choose one and be consistent within the project.

Reviewer: The spec uses Optional, we're currently supporting from 3.9 on, and pyarrow-stubs uses |. I'd go with |.

Owner Author: Sounds good. Given we're supporting 3.9, and 3.9 did not yet have |, I suppose we add from __future__ import annotations?

Owner Author: Switched all the Optional to | within the interchange folder. The annotations import was already there.
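
To make the agreed approach concrete, a small standalone sketch (not pyarrow code): with the postponed-annotations import, PEP 604 unions in annotations are accepted on Python 3.9, because annotations are stored as strings and never evaluated at runtime.

from __future__ import annotations


def null_count(count_known: bool) -> int | None:
    # The | syntax in the annotation is fine on 3.9 thanks to the future
    # import; int | None as a runtime expression would not be.
    return 42 if count_known else None


print(null_count(False))  # None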

"""
Number of null elements, if known.

@@ -394,7 +400,7 @@ def metadata(self) -> Dict[str, Any]:
"""
The metadata for the column. See `DataFrame.metadata` for more details.
"""
pass
return {}
Owner Author: This didn't break any tests and seemed "the right thing to do", but it would be great if somebody with more insight into the project looked at this.

Reviewer: An empty dict seems OK as per the spec this is implementing. Actual metadata would be even better, but that's out of scope :).


def num_chunks(self) -> int:
"""
Expand All @@ -403,7 +409,7 @@ def num_chunks(self) -> int:
return 1

def get_chunks(
self, n_chunks: Optional[int] = None
self, n_chunks: int | None = None
) -> Iterable[_PyArrowColumn]:
"""
Return an iterator yielding the chunks.
@@ -486,6 +492,11 @@ def _get_data_buffer(
return _PyArrowBuffer(array.buffers()[1]), dtype
elif n == 3:
return _PyArrowBuffer(array.buffers()[2]), dtype
else:
raise ValueError(
"Column data buffer must have 2 or 3 buffers, "
f"but has {n} buffers: {array.buffers()}"
)

def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
"""
Expand All @@ -505,7 +516,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
"There are no missing values so "
"does not have a separate mask")

def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any] | None:
"""
Return the buffer containing the offset values for variable-size binary
data (e.g., variable-length strings) and the buffer's associated dtype.
Expand All @@ -527,3 +538,4 @@ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
else:
dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
return _PyArrowBuffer(array.buffers()[1]), dtype
return None
3 changes: 1 addition & 2 deletions python/pyarrow/interchange/dataframe.py
@@ -19,7 +19,6 @@
from typing import (
Any,
Iterable,
Optional,
Sequence,
)

@@ -174,7 +173,7 @@ def select_columns_by_name(
)

def get_chunks(
self, n_chunks: Optional[int] = None
self, n_chunks: int | None = None
) -> Iterable[_PyArrowDataFrame]:
"""
Return an iterator yielding the chunks.
33 changes: 17 additions & 16 deletions python/pyarrow/interchange/from_dataframe.py
@@ -346,7 +346,7 @@ def buffers_to_array(
buffers: ColumnBuffers,
data_type: Tuple[DtypeKind, int, str, str],
length: int,
describe_null: ColumnNullType,
describe_null: tuple[ColumnNullType, int],
offset: int = 0,
allow_copy: bool = True,
) -> pa.Array:
@@ -383,21 +383,20 @@
the returned PyArrow array is being used.
"""
data_buff, _ = buffers["data"]
try:
validity_buff, validity_dtype = buffers["validity"]
except TypeError:
validity_buff = None
try:
offset_buff, offset_dtype = buffers["offsets"]
except TypeError:
offset_buff = None
validity_buff, validity_dtype = (
buffers["validity"] if buffers["validity"] else (None, None)
)
offset_buff, offset_dtype = (
buffers["offsets"] if buffers["offsets"] else (None, None)
)
Owner Author: I'm rather sure this fixes the actual implementation, as I am not really sure a TypedDict actually throws the TypeErrors the original logic was counting on. I might be wrong, and there may have been some other reason TypeErrors were expected. Either solution passes the unit tests.

Reviewer: Would this approach work?

validity_buff, validity_dtype = buffers.get("validity", (None, None))

mpelko (Owner Author), Aug 22, 2025: Sadly no. As per the signature of ColumnBuffers, the validity field is Optional, so the getter finds the key present and returns its value (None), which cannot be unpacked into (None, None). See:

validity: Optional[Tuple[_PyArrowBuffer, Dtype]]

Same for offsets.

Owner Author: We could rework ColumnBuffers to make this nicer (e.g. Tuple[_PyArrowBuffer, Dtype] is a good candidate for a dedicated type, given that it's used in several places), but I'm trying to keep the changes to a minimum here.

Reviewer: Thanks for the explanation! Agreed, let's keep changes to a minimum.
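
A plain-Python illustration (not pyarrow code) of two points from the thread above: dict.get with a default does not help because the "validity" key is present with the value None, so the default is never used; and unpacking that None raises TypeError, which is most likely what the original except TypeError was catching. The explicit truthiness check in the new code keeps the same runtime behaviour while staying transparent to the type checker.

buffers = {"validity": None, "offsets": None}

print(buffers.get("validity", (None, None)))  # prints None, not (None, None)

try:
    validity_buff, validity_dtype = buffers["validity"]
except TypeError as exc:
    print(exc)  # cannot unpack non-iterable NoneType object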


# Construct a pyarrow Buffer
data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
base=data_buff)

# Construct a validity pyarrow Buffer, if applicable
if validity_buff:
assert validity_dtype is not None
validity_pa_buff = validity_buffer_from_mask(validity_buff,
validity_dtype,
describe_null,
Expand All @@ -416,6 +415,7 @@ def buffers_to_array(
data_dtype = map_date_type(data_type)

if offset_buff:
assert offset_dtype is not None
_, offset_bit_width, _, _ = offset_dtype
# If an offset buffer exists, construct an offset pyarrow Buffer
# and add it to the construction of an array
@@ -450,7 +450,7 @@
def validity_buffer_from_mask(
validity_buff: BufferObject,
validity_dtype: Dtype,
describe_null: ColumnNullType,
describe_null: tuple[ColumnNullType, int],
length: int,
offset: int = 0,
allow_copy: bool = True,
@@ -513,7 +513,7 @@ def validity_buffer_from_mask(
offset=offset)

if sentinel_val == 1:
mask_bool = pc.invert(mask_bool)
mask_bool = pc.invert(mask_bool) # type: ignore # (missing stubs)

return mask_bool.buffers()[1]

@@ -529,7 +529,7 @@ def validity_buffer_nan_sentinel(
def validity_buffer_nan_sentinel(
data_pa_buffer: BufferObject,
data_type: Dtype,
describe_null: ColumnNullType,
describe_null: tuple[ColumnNullType, int],
length: int,
offset: int = 0,
allow_copy: bool = True,
@@ -583,8 +583,8 @@ def validity_buffer_nan_sentinel(
[None, data_pa_buffer],
offset=offset,
)
mask = pc.is_nan(pyarrow_data)
mask = pc.invert(mask)
mask = pc.is_nan(pyarrow_data) # type: ignore # (missing stubs)
mask = pc.invert(mask) # type: ignore # (missing stubs)
return mask.buffers()[1]

# Check for sentinel values
@@ -603,8 +603,9 @@
length,
[None, data_pa_buffer],
offset=offset)
sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
mask_bool = pc.invert(sentinel_arr)
# typing ignores due to missing stubs
sentinel_arr = pc.equal(pyarrow_data, sentinel_val) # type: ignore
mask_bool = pc.invert(sentinel_arr) # type: ignore
return mask_bool.buffers()[1]

elif null_kind == ColumnNullType.NON_NULLABLE:
2 changes: 1 addition & 1 deletion python/pyarrow/ipc.py
@@ -21,7 +21,7 @@

import pyarrow as pa

from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # noqa
from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # type: ignore[import-not-found] # noqa
Message, MessageReader,
RecordBatchReader, _ReadPandasMixin,
MetadataVersion, Alignment,
2 changes: 1 addition & 1 deletion python/pyarrow/types.py
@@ -20,7 +20,7 @@

from enum import IntEnum

from pyarrow.lib import (is_boolean_value, # noqa
from pyarrow.lib import (is_boolean_value, # type: ignore[import-not-found] # noqa
is_integer_value,
is_float_value)

2 changes: 1 addition & 1 deletion python/pyarrow/vendored/docscrape.py
@@ -618,7 +618,7 @@ def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc,
self._cls = cls

if 'sphinx' in sys.modules:
from sphinx.ext.autodoc import ALL
from sphinx.ext.autodoc import ALL # type: ignore[import-not-found]
else:
ALL = object()
