Ensure pyarrow/interchange passes type checks by mypy, pyright and ty #2
|
|
@@ -41,24 +41,24 @@ | |
| except ImportError: | ||
| # Package is not installed, parse git tag at runtime | ||
| try: | ||
| import setuptools_scm | ||
| import setuptools_scm # type: ignore[import-untyped] | ||
| # Code duplicated from setup.py to avoid a dependency on each other | ||
|
|
||
| def parse_git(root, **kwargs): | ||
| """ | ||
| Parse function for setuptools_scm that ignores tags for non-C++ | ||
| subprojects, e.g. apache-arrow-js-XXX tags. | ||
| """ | ||
| from setuptools_scm.git import parse | ||
| from setuptools_scm.git import parse # type: ignore[import-untyped] | ||
| kwargs['describe_command'] = \ | ||
| "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" | ||
| return parse(root, **kwargs) | ||
| __version__ = setuptools_scm.get_version('../', | ||
| parse=parse_git) | ||
| except ImportError: | ||
| __version__ = None | ||
| __version__ = "" | ||
|
|
||
| import pyarrow.lib as _lib | ||
| import pyarrow.lib as _lib # type: ignore[import-not-found] | ||
|
Owner (Author): Example of internal stubs missing.
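One way the internal ignores could eventually go away is a small stub file shipped next to the extension module. The sketch below is an assumption for illustration only (pyarrow does not ship this file, and the signatures are guesses), declaring just enough names for mypy to resolve the import without the inline ignore:

```python
# Hypothetical pyarrow/lib.pyi -- a minimal, partial stub sketch, not real pyarrow code.
from typing import Any

class BuildInfo: ...
class RuntimeInfo: ...
class MonthDayNano: ...

cpp_version: str

def set_timezone_db_path(path: str) -> None: ...
def __getattr__(name: str) -> Any: ...  # names not listed above fall back to Any
```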
||
| from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, | ||
| MonthDayNano, VersionInfo, build_info, cpp_build_info, | ||
| cpp_version, cpp_version_info, runtime_info, | ||
|
|
||
|
|
@@ -15,7 +15,7 @@ | |
| # specific language governing permissions and limitations | ||
| # under the License. | ||
|
|
||
| from pyarrow._compute import ( # noqa | ||
| from pyarrow._compute import ( # type: ignore[import-not-found] # noqa | ||
| Function, | ||
| FunctionOptions, | ||
| FunctionRegistry, | ||
|
|
@@ -251,7 +251,7 @@ def wrapper(*args, memory_pool=None): | |
| return Expression._call(func_name, list(args)) | ||
| return func.call(args, None, memory_pool) | ||
| else: | ||
| def wrapper(*args, memory_pool=None, options=None, **kwargs): | ||
| def wrapper(*args, memory_pool=None, options=None, **kwargs): # type: ignore | ||
|
Owner (Author): This is a nasty one, as the wrapper signature is inconsistent between the two logical branches. I couldn't find a quick fix.
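For reference, a minimal reproduction of the issue with made-up names (this is not the pyarrow source): the same name is bound to functions with different signatures in two branches, which type checkers reject as an inconsistent conditional definition.

```python
# Sketch of the pattern: `wrapper` is defined twice with different signatures,
# so a checker cannot give the name a single consistent type.
def make_wrapper(takes_options: bool):
    if not takes_options:
        def wrapper(*args, memory_pool=None):
            return ("plain", args, memory_pool)
    else:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):  # flagged
            return ("with-options", args, memory_pool, options, kwargs)
    return wrapper
```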
||
| if arity is not Ellipsis: | ||
| if len(args) < arity: | ||
| raise TypeError( | ||
|
|
||
|
|
@@ -22,7 +22,6 @@ | |
| Any, | ||
| Dict, | ||
| Iterable, | ||
| Optional, | ||
| Tuple, | ||
| ) | ||
|
|
||
|
|
@@ -122,13 +121,13 @@ class ColumnBuffers(TypedDict): | |
| # first element is a buffer containing mask values indicating missing data; | ||
| # second element is the mask value buffer's associated dtype. | ||
| # None if the null representation is not a bit or byte mask | ||
| validity: Optional[Tuple[_PyArrowBuffer, Dtype]] | ||
| validity: Tuple[_PyArrowBuffer, Dtype] | None | ||
|
|
||
| # first element is a buffer containing the offset values for | ||
| # variable-size binary data (e.g., variable-length strings); | ||
| # second element is the offsets buffer's associated dtype. | ||
| # None if the data buffer does not have an associated offsets buffer | ||
| offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] | ||
| offsets: Tuple[_PyArrowBuffer, Dtype] | None | ||
|
|
||
|
|
||
| class CategoricalDescription(TypedDict): | ||
|
|
@@ -139,7 +138,7 @@ class CategoricalDescription(TypedDict): | |
| is_dictionary: bool | ||
| # Python-level only (e.g. ``{int: str}``). | ||
| # None if not a dictionary-style categorical. | ||
| categories: Optional[_PyArrowColumn] | ||
| categories: _PyArrowColumn | None | ||
|
|
||
|
|
||
| class Endianness: | ||
|
|
@@ -314,13 +313,20 @@ def _dtype_from_arrowdtype( | |
| kind = DtypeKind.CATEGORICAL | ||
| arr = self._col | ||
| indices_dtype = arr.indices.type | ||
| _, f_string = _PYARROW_KINDS.get(indices_dtype) | ||
| indices_dtype_tuple = _PYARROW_KINDS.get(indices_dtype) | ||
| if indices_dtype_tuple is None: | ||
| raise ValueError( | ||
| f"Data type {indices_dtype} not supported by interchange protocol" | ||
| ) | ||
| _, f_string = indices_dtype_tuple | ||
| return kind, bit_width, f_string, Endianness.NATIVE | ||
| else: | ||
| kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) | ||
| if kind is None: | ||
| optional_kind, f_string = _PYARROW_KINDS.get(dtype, (None, "")) | ||
| if optional_kind is None: | ||
| raise ValueError( | ||
| f"Data type {dtype} not supported by interchange protocol") | ||
| f"Data type {dtype} not supported by interchange protocol" | ||
| ) | ||
| kind = optional_kind | ||
|
|
||
| return kind, bit_width, f_string, Endianness.NATIVE | ||
|
|
||
|
|
@@ -379,7 +385,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: | |
| return ColumnNullType.USE_BITMASK, 0 | ||
|
|
||
| @property | ||
| def null_count(self) -> int: | ||
| def null_count(self) -> int | None: | ||
|
Owner (Author): Using …

Reviewer: The spec uses Optional, we're currently supporting Python 3.9 and up, and pyarrow-stubs uses |. I'd go with |.

Owner (Author): Sounds good. Given we're supporting 3.9, and 3.9 did not yet have the native X | Y union syntax, …

Owner (Author): Switched all the annotations to the | spelling.
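For context, a small sketch of why the | spelling is workable on Python 3.9, assuming the modules use (or gain) postponed annotation evaluation; the function below is illustrative, not pyarrow code:

```python
# PEP 604 unions such as `int | None` only gained runtime support in Python 3.10,
# but with postponed evaluation the annotation is stored as a string and never
# evaluated, so this definition also imports cleanly on 3.9.
from __future__ import annotations


def null_count(value: int | None = None) -> int | None:
    # Without the __future__ import, `int | None` would be evaluated at
    # definition time and raise TypeError on Python 3.9.
    return value
```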
||
| """ | ||
| Number of null elements, if known. | ||
|
|
||
|
|
@@ -394,7 +400,7 @@ def metadata(self) -> Dict[str, Any]: | |
| """ | ||
| The metadata for the column. See `DataFrame.metadata` for more details. | ||
| """ | ||
| pass | ||
| return {} | ||
|
Owner (Author): This didn't break any tests and seemed "the right thing to do", but it would be great if somebody with more insight into the project looked at this.

Reviewer: An empty dict seems OK as per the spec this is implementing. Actual metadata would be even better, but that's out of scope :).
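A short illustration of why the explicit return matters (the class here is a stand-in, not the actual _PyArrowColumn): a bare pass leaves the property implicitly returning None at runtime, which does not match the declared Dict[str, Any] return type.

```python
from typing import Any, Dict


class Column:  # illustrative stand-in
    @property
    def metadata(self) -> Dict[str, Any]:
        """The metadata for the column."""
        # Returning an explicit empty dict satisfies the declared return type;
        # `pass` would make the property return None instead.
        return {}
```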
||
|
|
||
| def num_chunks(self) -> int: | ||
| """ | ||
|
|
@@ -403,7 +409,7 @@ def num_chunks(self) -> int: | |
| return 1 | ||
|
|
||
| def get_chunks( | ||
| self, n_chunks: Optional[int] = None | ||
| self, n_chunks: int | None = None | ||
| ) -> Iterable[_PyArrowColumn]: | ||
| """ | ||
| Return an iterator yielding the chunks. | ||
|
|
@@ -486,6 +492,11 @@ def _get_data_buffer( | |
| return _PyArrowBuffer(array.buffers()[1]), dtype | ||
| elif n == 3: | ||
| return _PyArrowBuffer(array.buffers()[2]), dtype | ||
| else: | ||
| raise ValueError( | ||
| "Column data buffer must have 2 or 3 buffers, " | ||
| f"but has {n} buffers: {array.buffers()}" | ||
| ) | ||
|
|
||
| def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: | ||
| """ | ||
|
|
@@ -505,7 +516,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: | |
| "There are no missing values so " | ||
| "does not have a separate mask") | ||
|
|
||
| def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: | ||
| def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any] | None: | ||
| """ | ||
| Return the buffer containing the offset values for variable-size binary | ||
| data (e.g., variable-length strings) and the buffer's associated dtype. | ||
|
|
@@ -527,3 +538,4 @@ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: | |
| else: | ||
| dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE) | ||
| return _PyArrowBuffer(array.buffers()[1]), dtype | ||
| return None | ||
|
|
@@ -346,7 +346,7 @@ def buffers_to_array( | |||
| buffers: ColumnBuffers, | ||||
| data_type: Tuple[DtypeKind, int, str, str], | ||||
| length: int, | ||||
| describe_null: ColumnNullType, | ||||
| describe_null: tuple[ColumnNullType, int], | ||||
| offset: int = 0, | ||||
| allow_copy: bool = True, | ||||
| ) -> pa.Array: | ||||
|
|
@@ -383,21 +383,20 @@ def buffers_to_array( | |||
| the returned PyArrow array is being used. | ||||
| """ | ||||
| data_buff, _ = buffers["data"] | ||||
| try: | ||||
| validity_buff, validity_dtype = buffers["validity"] | ||||
| except TypeError: | ||||
| validity_buff = None | ||||
| try: | ||||
| offset_buff, offset_dtype = buffers["offsets"] | ||||
| except TypeError: | ||||
| offset_buff = None | ||||
| validity_buff, validity_dtype = ( | ||||
| buffers["validity"] if buffers["validity"] else (None, None) | ||||
| ) | ||||
| offset_buff, offset_dtype = ( | ||||
| buffers["offsets"] if buffers["offsets"] else (None, None) | ||||
| ) | ||||
|
Owner (Author): I'm fairly sure this fixes the actual implementation, as I'm not convinced a TypedDict actually throws the TypeErrors the original logic was counting on. I might be wrong, and there may have been some other reason TypeErrors were expected. Either solution passes the unit tests.

Reviewer: Would this approach work? validity_buff, validity_dtype = buffers.get("validity", (None, None))

Owner (Author): Sadly no, as per the signature in arrow/python/pyarrow/interchange/column.py (line 125 in a606011). Same for offsets.

Owner (Author): We could rework the ColumnBuffers TypedDict to make this nicer (e.g. …).

Reviewer: Thanks for the explanation! Agreed, let's keep changes to a minimum.
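A minimal sketch of the constraint discussed above, with illustrative names (not the actual ColumnBuffers definition): the key is required but its value may be None, so dict.get with a default never helps, and the conditional unpack is what guards against None.

```python
from __future__ import annotations

from typing import Tuple, TypedDict


class Buffers(TypedDict):
    # Mirrors the shape of the interchange buffers dict: "validity" is a
    # required key whose value may be None.
    data: Tuple[bytes, str]
    validity: Tuple[bytes, str] | None


def unpack_validity(buffers: Buffers) -> Tuple[bytes | None, str | None]:
    # buffers.get("validity", (None, None)) would not help: the default only
    # applies when the key is missing, and the key is always present here,
    # so a stored None still comes back as None.
    validity_buff, validity_dtype = (
        buffers["validity"] if buffers["validity"] else (None, None)
    )
    return validity_buff, validity_dtype


print(unpack_validity({"data": (b"\x01", "b"), "validity": None}))  # (None, None)
```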
||||
|
|
||||
| # Construct a pyarrow Buffer | ||||
| data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize, | ||||
| base=data_buff) | ||||
|
|
||||
| # Construct a validity pyarrow Buffer, if applicable | ||||
| if validity_buff: | ||||
| assert validity_dtype is not None | ||||
| validity_pa_buff = validity_buffer_from_mask(validity_buff, | ||||
| validity_dtype, | ||||
| describe_null, | ||||
|
|
@@ -416,6 +415,7 @@ def buffers_to_array( | |||
| data_dtype = map_date_type(data_type) | ||||
|
|
||||
| if offset_buff: | ||||
| assert offset_dtype is not None | ||||
| _, offset_bit_width, _, _ = offset_dtype | ||||
| # If an offset buffer exists, construct an offset pyarrow Buffer | ||||
| # and add it to the construction of an array | ||||
|
|
@@ -450,7 +450,7 @@ def buffers_to_array( | |||
| def validity_buffer_from_mask( | ||||
| validity_buff: BufferObject, | ||||
| validity_dtype: Dtype, | ||||
| describe_null: ColumnNullType, | ||||
| describe_null: tuple[ColumnNullType, int], | ||||
| length: int, | ||||
| offset: int = 0, | ||||
| allow_copy: bool = True, | ||||
|
|
@@ -513,7 +513,7 @@ def validity_buffer_from_mask( | |||
| offset=offset) | ||||
|
|
||||
| if sentinel_val == 1: | ||||
| mask_bool = pc.invert(mask_bool) | ||||
| mask_bool = pc.invert(mask_bool) # type: ignore # (missing stubs) | ||||
|
|
||||
| return mask_bool.buffers()[1] | ||||
|
|
||||
|
|
@@ -529,7 +529,7 @@ def validity_buffer_from_mask( | |||
| def validity_buffer_nan_sentinel( | ||||
| data_pa_buffer: BufferObject, | ||||
| data_type: Dtype, | ||||
| describe_null: ColumnNullType, | ||||
| describe_null: tuple[ColumnNullType, int], | ||||
| length: int, | ||||
| offset: int = 0, | ||||
| allow_copy: bool = True, | ||||
|
|
@@ -583,8 +583,8 @@ def validity_buffer_nan_sentinel( | |||
| [None, data_pa_buffer], | ||||
| offset=offset, | ||||
| ) | ||||
| mask = pc.is_nan(pyarrow_data) | ||||
| mask = pc.invert(mask) | ||||
| mask = pc.is_nan(pyarrow_data) # type: ignore # (missing stubs) | ||||
| mask = pc.invert(mask) # type: ignore # (missing stubs) | ||||
| return mask.buffers()[1] | ||||
|
|
||||
| # Check for sentinel values | ||||
|
|
@@ -603,8 +603,9 @@ def validity_buffer_nan_sentinel( | |||
| length, | ||||
| [None, data_pa_buffer], | ||||
| offset=offset) | ||||
| sentinel_arr = pc.equal(pyarrow_data, sentinel_val) | ||||
| mask_bool = pc.invert(sentinel_arr) | ||||
| # typing ignores due to missing stubs | ||||
| sentinel_arr = pc.equal(pyarrow_data, sentinel_val) # type: ignore | ||||
| mask_bool = pc.invert(sentinel_arr) # type: ignore | ||||
| return mask_bool.buffers()[1] | ||||
|
|
||||
| elif null_kind == ColumnNullType.NON_NULLABLE: | ||||
|
|
||||
Example of external dependencies not providing stubs or being typed.
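If the call-site ignores ever become too noisy, one possible follow-up is a local, partial stub for the few pyarrow.compute functions used by the interchange code. The file path and signatures below are assumptions for illustration, not the real pyarrow API surface:

```python
# Hypothetical stubs/pyarrow/compute.pyi (not shipped by pyarrow): typing only
# the functions called here would let the "# type: ignore # (missing stubs)"
# comments above be dropped.
from typing import Any

def invert(values: Any, *, memory_pool: Any = ...) -> Any: ...
def is_nan(values: Any, *, memory_pool: Any = ...) -> Any: ...
def equal(x: Any, y: Any, *, memory_pool: Any = ...) -> Any: ...
```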