From 4db2ca0743b2029f0c3c1049cdc5a9c453f55b15 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 13:42:37 +0000 Subject: [PATCH 01/18] UuidType -> RationalType in the docs In a previous version of the docs, a UuidType was discussed as an extension type. However, this type has been promoted to a canonical type, and so no longer is a good example of an extension type a user may wish to create. We replace UuidType in the docs with a RationalType --- docs/source/format/Integration.rst | 31 +++-- docs/source/python/extending_types.rst | 86 ++++++++----- python/pyarrow/types.pxi | 166 ++++++++++++++++--------- 3 files changed, 182 insertions(+), 101 deletions(-) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 0ab5b832ad0..b4e50a54b07 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -390,20 +390,37 @@ but can be of any type. Extension types are, as in the IPC format, represented as their underlying storage type plus some dedicated field metadata to reconstruct the extension -type. For example, assuming a "uuid" extension type backed by a -FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: +type. For example, assuming a "rational" extension type backed by a +``struct`` storage, here is how a "rational" field +would be represented:: { "name" : "name_of_the_field", "nullable" : /* boolean */, "type" : { - "name" : "fixedsizebinary", - "byteWidth" : 16 + "name" : "struct", }, - "children" : [], + "children" : [ + { + "name": "numer", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + }, + }, + { + "name": "denom", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + } + }, + ], "metadata" : [ - {"key": "ARROW:extension:name", "value": "uuid"}, - {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + {"key": "ARROW:extension:name", "value": "rational"}, + {"key": "ARROW:extension:metadata", "value": "rational-serialized"} ] } diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index d7465053481..67cb3425dc9 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -131,50 +131,72 @@ and serialization mechanism. The extension name and serialized metadata can potentially be recognized by other (non-Python) Arrow implementations such as PySpark. -For example, we could define a custom UUID type for 128-bit numbers which can -be represented as ``FixedSizeBinary`` type with 16 bytes:: +For example, we could define a custom rational type for fractions which can +be represented as a pair of integers:: - class UuidType(pa.ExtensionType): + import pyarrow.types as pt + + class RationalType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), "my_package.uuid") - def __arrow_ext_serialize__(self): - # Since we don't have a parameterized type, we don't need extra - # metadata to be deserialized - return b'' + super().__init__( + pa.struct( + [ + ("numer", pa.int64()), + ("denom", pa.int64()), + ], + ), + "my_package.rational", + ) + + def __arrow_ext_serialize__(self) -> bytes: + # No serialized metadata necessary + return b"" @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): + def __arrow_ext_deserialize__(self, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert storage_type == pa.binary(16) + assert pt.is_int32(storage_type) assert serialized == b'' - # Return an instance of this subclass given the serialized - # metadata. - return UuidType() + + # return an instance of this subclass given the serialized + # metadata + return RationalType() + The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` -define the serialization of an extension type instance. For non-parametric -types such as the above, the serialization payload can be left empty. +define the serialization of an extension type instance. This can now be used to create arrays and tables holding the extension type:: - >>> uuid_type = UuidType() - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) - - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> arr = pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> rational_type = RationalType() + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type + ... ) + >>> arr = rational_type.wrap_array(storage_array) + >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr - + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 [ - A6861959108644B797664AEEE686B682, - 718747F48E5F4058A7261E2B6B228BE8, - 7FE201227D624D96A5CD8639DEF2A68B, - C6CA8C7F95744BFD9462A40B3F57A86C + 17, + 13 ] This array can be included in RecordBatches, sent over IPC and received in @@ -182,7 +204,7 @@ another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the storage type:: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType()) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -198,10 +220,10 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() >>> result.column('ext').type - UuidType(FixedSizeBinaryType(fixed_size_binary[16])) + RationalType(StructType(struct)) The receiving application doesn't need to be Python but can still recognize -the extension type as a "my_package.uuid" type, if it has implemented its own +the extension type as a "my_package.rational" type, if it has implemented its own extension type to receive it. If the type is not registered in the receiving application, it will fall back to the storage type. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c26..1cb929eac6b 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1618,59 +1618,79 @@ cdef class ExtensionType(BaseExtensionType): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) - Create an instance of UuidType extension type: + Create an instance of RationalType extension type: - >>> uuid_type = UuidType() + >>> rational_type = RationalType(pa.int32()) Inspect the extension type: - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) Wrap an array as an extension array: - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> uuid_type.wrap_array(storage_array) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type + ... ) + >>> ratoinal_type.wrap_array(storage_array) - [ + -- is_valid: all not null ... - ] Or do the same with creating an ExtensionArray: - >>> pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> pa.ExtensionArray.from_storage(rational_type, storage_array) - [ + -- is_valid: all not null ... - ] Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, pyarrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type`` as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. """ def __cinit__(self): @@ -2039,30 +2059,41 @@ def register_extension_type(ext_type): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: DataType _type = ensure_type(ext_type, allow_none=False) @@ -2089,30 +2120,41 @@ def unregister_extension_type(type_name): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: c_string c_type_name = tobytes(type_name) From 4d95f227dd66bd5beab8ea292579bca9e55a4b34 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 13:47:34 +0000 Subject: [PATCH 02/18] fix json formatting --- docs/source/format/Integration.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index b4e50a54b07..f76aa0fefcf 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -398,7 +398,7 @@ would be represented:: "name" : "name_of_the_field", "nullable" : /* boolean */, "type" : { - "name" : "struct", + "name" : "struct" }, "children" : [ { @@ -407,7 +407,7 @@ would be represented:: "name": "int", "bitWidth": 32, "isSigned": true - }, + } }, { "name": "denom", @@ -416,7 +416,7 @@ would be represented:: "bitWidth": 32, "isSigned": true } - }, + } ], "metadata" : [ {"key": "ARROW:extension:name", "value": "rational"}, From fde32158fa5f9a5f943da5aacd2bd8056b7a5bb1 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 14:06:25 +0000 Subject: [PATCH 03/18] fix some typos --- docs/source/python/extending_types.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 67cb3425dc9..a881957ccce 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,6 +134,7 @@ such as PySpark. For example, we could define a custom rational type for fractions which can be represented as a pair of integers:: + import pyarrow as pa import pyarrow.types as pt class RationalType(pa.ExtensionType): @@ -143,8 +144,8 @@ be represented as a pair of integers:: super().__init__( pa.struct( [ - ("numer", pa.int64()), - ("denom", pa.int64()), + ("numer", pa.int32()), + ("denom", pa.int32()), ], ), "my_package.rational", @@ -157,7 +158,8 @@ be represented as a pair of integers:: @classmethod def __arrow_ext_deserialize__(self, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert pt.is_int32(storage_type) + assert pt.is_struct(storage_type) + assert pt.is_int32(storage_type[0].type) assert serialized == b'' # return an instance of this subclass given the serialized From d875d2cd3753ba05727ad3223a491c21383f23c8 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Tue, 27 Aug 2024 20:34:33 -0400 Subject: [PATCH 04/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index a881957ccce..af8e9b5cc63 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -156,7 +156,7 @@ be represented as a pair of integers:: return b"" @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): + def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. assert pt.is_struct(storage_type) assert pt.is_int32(storage_type[0].type) From eb690554a2d0bd968de977c8ca6360559d36a4e8 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Tue, 27 Aug 2024 20:35:04 -0400 Subject: [PATCH 05/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index af8e9b5cc63..95d686bfe0d 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -160,6 +160,7 @@ be represented as a pair of integers:: # Sanity checks, not required but illustrate the method signature. assert pt.is_struct(storage_type) assert pt.is_int32(storage_type[0].type) + assert pt.is_int32(storage_type[1].type) assert serialized == b'' # return an instance of this subclass given the serialized From 3fc68428545c325fa34ae52bfd9c1d430090ab9a Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Wed, 28 Aug 2024 01:03:00 +0000 Subject: [PATCH 06/18] response to ianmcook --- docs/source/python/extending_types.rst | 20 +++++++++----------- python/pyarrow/types.pxi | 20 ++++++++------------ 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 95d686bfe0d..2f1968deb09 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,9 +134,6 @@ such as PySpark. For example, we could define a custom rational type for fractions which can be represented as a pair of integers:: - import pyarrow as pa - import pyarrow.types as pt - class RationalType(pa.ExtensionType): def __init__(self): @@ -158,9 +155,9 @@ be represented as a pair of integers:: @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert pt.is_struct(storage_type) - assert pt.is_int32(storage_type[0].type) - assert pt.is_int32(storage_type[1].type) + assert pa.types.is_struct(storage_type) + assert pa.types.is_int32(storage_type[0].type) + assert pa.types.is_int32(storage_type[1].type) assert serialized == b'' # return an instance of this subclass given the serialized @@ -180,13 +177,14 @@ This can now be used to create arrays and tables holding the extension type:: StructType(struct) >>> storage_array = pa.array( - ... [ - ... {"numer": 10, "denom": 17}, - ... {"numer": 20, "denom": 13}, - ... ], - ... type=rational_type.storage_type + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, ... ) >>> arr = rational_type.wrap_array(storage_array) + >>> # or equivalently >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1cb929eac6b..3c3c63f04c8 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1620,8 +1620,6 @@ cdef class ExtensionType(BaseExtensionType): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -1641,7 +1639,7 @@ cdef class ExtensionType(BaseExtensionType): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) @@ -1670,14 +1668,16 @@ cdef class ExtensionType(BaseExtensionType): ... ], ... type=rational_type.storage_type ... ) - >>> ratoinal_type.wrap_array(storage_array) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array -- is_valid: all not null ... Or do the same with creating an ExtensionArray: - >>> pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array -- is_valid: all not null ... @@ -1750,7 +1750,7 @@ cdef class ExtensionType(BaseExtensionType): return NotImplementedError @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): + def __arrow_ext_deserialize__(cls, storage_type, serialized): """ Return an extension type instance from the storage type and serialized metadata. @@ -2061,8 +2061,6 @@ def register_extension_type(ext_type): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -2082,7 +2080,7 @@ def register_extension_type(ext_type): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) @@ -2122,8 +2120,6 @@ def unregister_extension_type(type_name): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -2143,7 +2139,7 @@ def unregister_extension_type(type_name): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) From 0e051ea858ba95dca6350703ba9e08c6fb745dee Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sat, 31 Aug 2024 21:40:12 +0000 Subject: [PATCH 07/18] define parameters --- docs/source/python/extending_types.rst | 123 ++++++++++++++++--------- python/pyarrow/types.pxi | 21 ++--- 2 files changed, 88 insertions(+), 56 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 2f1968deb09..a38118de29c 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -116,61 +116,68 @@ a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. Defining extension types ("user-defined types") ----------------------------------------------- -Arrow has the notion of extension types in the metadata specification as a -possibility to extend the built-in types. This is done by annotating any of the -built-in Arrow data types (the "storage type") with a custom type name and -optional serialized representation ("ARROW:extension:name" and -"ARROW:extension:metadata" keys in the Field’s custom_metadata of an IPC -message). -See the :ref:`format_metadata_extension_types` section of the metadata -specification for more details. - -Pyarrow allows you to define such extension types from Python by subclassing -:class:`ExtensionType` and giving the derived class its own extension name -and serialization mechanism. The extension name and serialized metadata -can potentially be recognized by other (non-Python) Arrow implementations +Arrow affords a notion of extension types which allow users to annotate data +types with additional semantics. This allows downstream consumers both to +specify custom serialization and deserialization routines (for example, +to :ref:`Python scalars ` and +:ref:`pandas `) and to more easily interpret data. + +In the Arrow :doc:`metadata specification`, +this is accomplished by annotating any of the built-in Arrow data types +(the "storage type") with a custom type name and, optionally, a byte +array that can be used to provide additional metadata (referred to as +"parameters" in this documentation). These appear as the +``ARROW:extension:name`` and ``ARROW:extension:metadata`` keys in the +Field's ``custom_metadata``. + +Note that since these annotations are part of the Arrow specification, +they can potentially be recognized by other (non-Python) Arrow consumers such as PySpark. -For example, we could define a custom rational type for fractions which can -be represented as a pair of integers:: +Pyarrow allows you to define extension types from Python by subclassing +:class:`ExtensionType` and giving the derived class its own extension name +and mechanism to (de)serialize any parameters. For example, we could define +a custom rational type for fractions which can be represented as a pair of +integers:: class RationalType(pa.ExtensionType): - def __init__(self): + def __init__(self, data_type: pa.DataType): + if not pa.types.is_integer(data_type): + raise TypeError(f"data_type must be an integer type not {data_type}") super().__init__( pa.struct( [ - ("numer", pa.int32()), - ("denom", pa.int32()), + ("numer", data_type), + ("denom", data_type), ], ), "my_package.rational", ) def __arrow_ext_serialize__(self) -> bytes: - # No serialized metadata necessary + # No parameters are necessary return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. assert pa.types.is_struct(storage_type) - assert pa.types.is_int32(storage_type[0].type) - assert pa.types.is_int32(storage_type[1].type) - assert serialized == b'' + assert pa.types.is_integer(storage_type[0].type) + assert storage_type[0].type == storage_type[1].type + assert serialized == b"" - # return an instance of this subclass given the serialized - # metadata - return RationalType() + # return an instance of this subclass + return RationalType(storage_type[0].type) The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` -define the serialization of an extension type instance. +define the serialization and deserialization of an extension type instance. This can now be used to create arrays and tables holding the extension type:: - >>> rational_type = RationalType() + >>> rational_type = RationalType(pa.int32()) >>> rational_type.extension_name 'my_package.rational' >>> rational_type.storage_type @@ -205,7 +212,7 @@ another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the storage type:: - >>> pa.register_extension_type(RationalType()) + >>> pa.register_extension_type(RationalType(pa.int32())) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -220,19 +227,44 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() - >>> result.column('ext').type + >>> result.column("ext").type RationalType(StructType(struct)) +Further, note that while we registered the concrete type +``RationalType(pa.int32())``, ``RationalType(integer_type)`` has the same +extension name (``"my_package.rational"``) for all integer types. As such, +the above code will also allow users to (de)serialize these data types:: + + >>> big_rational_type = RationalType(pa.int64()) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=big_rational_type.storage_type, + ... ) + >>> arr = big_rational_type.wrap_array(storage_array) + >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + >>> sink = pa.BufferOutputStream() + >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: + ... writer.write_batch(batch) + >>> buf = sink.getvalue() + >>> with pa.ipc.open_stream(buf) as reader: + ... result = reader.read_all() + >>> result.column("ext").type + RationalType(StructType(struct)) + The receiving application doesn't need to be Python but can still recognize -the extension type as a "my_package.rational" type, if it has implemented its own +the extension type as a "my_package.rational" type if it has implemented its own extension type to receive it. If the type is not registered in the receiving application, it will fall back to the storage type. Parameterized extension type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The above example used a fixed storage type with no further metadata. But -more flexible, parameterized extension types are also possible. +The above example illustrated how to construct an extension type that requires +no additional metadata beyond its storage type. But Arrow also provides more +flexible, parameterized extension types. The example given here implements an extension type for the `pandas "period" data type `__, @@ -248,14 +280,14 @@ of the given frequency since 1970. # attributes need to be set first before calling # super init (as that calls serialize) self._freq = freq - super().__init__(pa.int64(), 'my_package.period') + super().__init__(pa.int64(), "my_package.period") @property def freq(self): return self._freq def __arrow_ext_serialize__(self): - return "freq={}".format(self.freq).encode() + return "freq={self.freq}".encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -263,7 +295,7 @@ of the given frequency since 1970. # metadata. serialized = serialized.decode() assert serialized.startswith("freq=") - freq = serialized.split('=')[1] + freq = serialized.split("=")[1] return PeriodType(freq) Here, we ensure to store all information in the serialized metadata that is @@ -297,7 +329,7 @@ the data as a 2-D Numpy array ``(N, 3)`` without any copy:: super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -336,6 +368,8 @@ This array can be sent over IPC, received in another Python process, and the cus extension array class will be preserved (as long as the receiving process registers the extension type using :func:`register_extension_type` before reading the IPC data). +.. _custom-scalar-conversion: + Custom scalar conversion ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -358,7 +392,7 @@ For example, if we wanted the above example 3D point type to return a custom super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -377,6 +411,7 @@ Arrays built using this extension type now provide scalars that convert to our ` >>> arr.to_pylist() [Point3D(x=1.0, y=2.0, z=3.0), Point3D(x=4.0, y=5.0, z=6.0)] +.. _conversion-to-pandas: Conversion to pandas ~~~~~~~~~~~~~~~~~~~~ @@ -459,16 +494,16 @@ Extension arrays can be used as columns in ``pyarrow.Table`` or >>> data = [ ... pa.array([1, 2, 3]), - ... pa.array(['foo', 'bar', None]), + ... pa.array(["foo", "bar", None]), ... pa.array([True, None, True]), ... tensor_array, ... tensor_array_2 ... ] - >>> my_schema = pa.schema([('f0', pa.int8()), - ... ('f1', pa.string()), - ... ('f2', pa.bool_()), - ... ('tensors_int', tensor_type), - ... ('tensors_float', tensor_type_2)]) + >>> my_schema = pa.schema([("f0", pa.int8()), + ... ("f1", pa.string()), + ... ("f2", pa.bool_()), + ... ("tensors_int", tensor_type), + ... ("tensors_float", tensor_type_2)]) >>> table = pa.Table.from_arrays(data, schema=my_schema) >>> table pyarrow.Table @@ -564,7 +599,7 @@ or .. code-block:: python - >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=['C', 'H', 'W']) + >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=["C", "H", "W"]) for ``NCHW`` format where: diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 3c3c63f04c8..70129f51eee 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1622,7 +1622,7 @@ cdef class ExtensionType(BaseExtensionType): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -1636,12 +1636,11 @@ cdef class ExtensionType(BaseExtensionType): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... return RationalType(storage_type[0].type) Register the extension type: @@ -2063,7 +2062,7 @@ def register_extension_type(ext_type): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -2077,12 +2076,11 @@ def register_extension_type(ext_type): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... return RationalType(storage_type[0].type) Register the extension type: @@ -2122,7 +2120,7 @@ def unregister_extension_type(type_name): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -2136,12 +2134,11 @@ def unregister_extension_type(type_name): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... return RationalType(storage_type[0].type) Register the extension type: From 8614fd5d793056fd74a43e86f586eb6c0f808223 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Sun, 8 Sep 2024 12:00:49 -0400 Subject: [PATCH 08/18] Update python/pyarrow/types.pxi Co-authored-by: Ian Cook --- python/pyarrow/types.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 70129f51eee..33dc638124b 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1686,8 +1686,8 @@ cdef class ExtensionType(BaseExtensionType): >>> pa.unregister_extension_type("my_package.rational") Note that even though we registered the concrete type - ``RationalType(pa.int64())``, pyarrow will be able to deserialize - ``RationalType(integer_type)`` for any ``integer_type`` as the deserializer + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer will reference the name ``my_package.rational`` and the ``@classmethod`` ``__arrow_ext_deserialize__``. """ From 03b3fd992ce9208d95e507ce50f2f7ecfc788902 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Sun, 8 Sep 2024 12:01:13 -0400 Subject: [PATCH 09/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index a38118de29c..328840c43dd 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,7 +134,7 @@ Note that since these annotations are part of the Arrow specification, they can potentially be recognized by other (non-Python) Arrow consumers such as PySpark. -Pyarrow allows you to define extension types from Python by subclassing +PyArrow allows you to define extension types from Python by subclassing :class:`ExtensionType` and giving the derived class its own extension name and mechanism to (de)serialize any parameters. For example, we could define a custom rational type for fractions which can be represented as a pair of From 2abdcb82391c00f32e57dcd73dcf8e600d411178 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:21:47 +0000 Subject: [PATCH 10/18] more edits --- docs/source/format/Columnar.rst | 12 ++++++------ docs/source/python/extending_types.rst | 15 ++++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index c5f822f4164..da20830ff74 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1596,12 +1596,12 @@ structure. These extension keys are: they should not be used for third-party extension types. This extension metadata can annotate any of the built-in Arrow logical -types. The intent is that an implementation that does not support an -extension type can still handle the underlying data. For example a -16-byte UUID value could be embedded in ``FixedSizeBinary(16)``, and -implementations that do not have this extension type can still work -with the underlying binary values and pass along the -``custom_metadata`` in subsequent Arrow protocol messages. +types. For example, Arrow specifies a canonical extension type that +represents a UUID as a FixedSizeBinary(16). Arrow implementations are +not required to support canonical extensions, so an implementation that +does not support this UUID type will simply interpret it as a +``FixedSizeBinary(16)`` and pass along the ``custom_metadata`` in +subsequent Arrow protocol messages. Extension types may or may not use the ``'ARROW:extension:metadata'`` field. Let's consider some example diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 328840c43dd..c14a2e65fa2 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -117,13 +117,13 @@ Defining extension types ("user-defined types") ----------------------------------------------- Arrow affords a notion of extension types which allow users to annotate data -types with additional semantics. This allows downstream consumers both to +types with additional semantics. This allows developers both to specify custom serialization and deserialization routines (for example, to :ref:`Python scalars ` and :ref:`pandas `) and to more easily interpret data. -In the Arrow :doc:`metadata specification`, -this is accomplished by annotating any of the built-in Arrow data types +In Arrow, :ref:`extension types ` +are specified by annotating any of the built-in Arrow data types (the "storage type") with a custom type name and, optionally, a byte array that can be used to provide additional metadata (referred to as "parameters" in this documentation). These appear as the @@ -231,9 +231,10 @@ and then reading it back yields the proper type:: RationalType(StructType(struct)) Further, note that while we registered the concrete type -``RationalType(pa.int32())``, ``RationalType(integer_type)`` has the same -extension name (``"my_package.rational"``) for all integer types. As such, -the above code will also allow users to (de)serialize these data types:: +``RationalType(pa.int32())``, the same extension name +(``"my_package.rational"``) is used by ``RationalType(integer_type)`` +for *all* Arrow integer types. As such, the above code also allows users to +(de)serialize these data types:: >>> big_rational_type = RationalType(pa.int64()) >>> storage_array = pa.array( @@ -287,7 +288,7 @@ of the given frequency since 1970. return self._freq def __arrow_ext_serialize__(self): - return "freq={self.freq}".encode() + return "freq={}".format(self.freq).encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): From 4c66e4d71627acad27c93f5fea188f3764d8a434 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:27:56 +0000 Subject: [PATCH 11/18] missed one formatting --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index da20830ff74..4c758c52943 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1597,7 +1597,7 @@ structure. These extension keys are: This extension metadata can annotate any of the built-in Arrow logical types. For example, Arrow specifies a canonical extension type that -represents a UUID as a FixedSizeBinary(16). Arrow implementations are +represents a UUID as a ``FixedSizeBinary(16)``. Arrow implementations are not required to support canonical extensions, so an implementation that does not support this UUID type will simply interpret it as a ``FixedSizeBinary(16)`` and pass along the ``custom_metadata`` in From 253156baac193a9f3b791fe4459da624538606b5 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:30:59 +0000 Subject: [PATCH 12/18] import pyarrow in doctests examples --- python/pyarrow/types.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 33dc638124b..604867d4734 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1620,6 +1620,7 @@ cdef class ExtensionType(BaseExtensionType): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pa.types.is_integer(data_type): From da6cdf29dbe5acae3614c54160fa99390373066f Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 18:49:11 +0000 Subject: [PATCH 13/18] import pyarrow in more doctests examples --- python/pyarrow/types.pxi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 604867d4734..64a984731c1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2061,6 +2061,7 @@ def register_extension_type(ext_type): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pa.types.is_integer(data_type): @@ -2119,6 +2120,7 @@ def unregister_extension_type(type_name): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pa.types.is_integer(data_type): From 8871efa3f1139e6440e72dc212f5760aad409aaa Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sat, 14 Sep 2024 13:38:11 -0400 Subject: [PATCH 14/18] fix spacing and version issues --- python/pyarrow/types.pxi | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 64a984731c1..dd5b0ac323f 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1672,7 +1672,16 @@ cdef class ExtensionType(BaseExtensionType): >>> rational_array -- is_valid: all not null - ... + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] Or do the same with creating an ExtensionArray: @@ -1680,7 +1689,16 @@ cdef class ExtensionType(BaseExtensionType): >>> rational_array -- is_valid: all not null - ... + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] Unregister the extension type: @@ -4328,8 +4346,12 @@ def float16(): 15872, 32256 ] - >>> a.to_pylist() - [np.float16(1.5), np.float16(nan)] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [, ] """ return primitive_type(_Type_HALF_FLOAT) From ac65cfe81ade0aa73b84260243ce1ada67d8fc22 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Mon, 16 Sep 2024 19:05:07 -0400 Subject: [PATCH 15/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index c14a2e65fa2..da3de124624 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -124,8 +124,8 @@ to :ref:`Python scalars ` and In Arrow, :ref:`extension types ` are specified by annotating any of the built-in Arrow data types -(the "storage type") with a custom type name and, optionally, a byte -array that can be used to provide additional metadata (referred to as +(the "storage type") with a custom type name and, optionally, a +bytestring that can be used to provide additional metadata (referred to as "parameters" in this documentation). These appear as the ``ARROW:extension:name`` and ``ARROW:extension:metadata`` keys in the Field's ``custom_metadata``. From 0d373ef3f1502bfcf164f6e555e6ee25c0744a54 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Mon, 16 Sep 2024 22:09:11 -0400 Subject: [PATCH 16/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index da3de124624..4f3a7a1a6b0 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -197,10 +197,10 @@ This can now be used to create arrays and tables holding the extension type:: -- is_valid: all not null -- child 0 type: int32 - [ + [ 10, 20 - ] + ] -- child 1 type: int32 [ 17, From 933c5bbffb6509db4dea9108828314ce3cad008a Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Mon, 16 Sep 2024 22:09:18 -0400 Subject: [PATCH 17/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 4f3a7a1a6b0..adf69eb1d6e 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -202,7 +202,7 @@ This can now be used to create arrays and tables holding the extension type:: 20 ] -- child 1 type: int32 - [ + [ 17, 13 ] From 0502ff821f84c10de69550a960f63d6663803b76 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Mon, 16 Sep 2024 22:09:25 -0400 Subject: [PATCH 18/18] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index adf69eb1d6e..264cc8dca77 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -205,7 +205,7 @@ This can now be used to create arrays and tables holding the extension type:: [ 17, 13 - ] + ] This array can be included in RecordBatches, sent over IPC and received in another Python process. The receiving process must explicitly register the