diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416e..6b0ac8c23c7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,6 +375,7 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -1225,6 +1226,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1267,7 +1269,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9065e286a22..76ad9c7d650 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 5cb4bc77af2..065ea3f1ddb 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 3fd39a11ff5..842a78e1a4f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,7 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" +#include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -33,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc new file mode 100644 index 00000000000..43b917a17f8 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/extension_type.h" +#include "arrow/util/logging.h" + +#include "arrow/extension/uuid.h" + +namespace arrow::extension { + +bool UuidType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.uuid", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> UuidType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (!serialized.empty()) { + return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); + } + if (!storage_type->Equals(*fixed_size_binary(16))) { + return Status::Invalid("Invalid storage type for UuidType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + +std::string UuidType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::shared_ptr uuid() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h new file mode 100644 index 00000000000..42bb21cf0b2 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief UuidArray stores array of UUIDs. Underlying storage type is +/// FixedSizeBinary(16). +class ARROW_EXPORT UuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief UuidType is a canonical arrow extension type for UUIDs. +/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this +/// does not interpret the bytes in any way. Specific UUID version is not +/// required or guaranteed. +class ARROW_EXPORT UuidType : public ExtensionType { + public: + /// \brief Construct a UuidType. + UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "arrow.uuid"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + /// Create a UuidArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return ""; } + + /// \brief Create a UuidType instance + static Result> Make() { return std::make_shared(); } +}; + +/// \brief Return a UuidType instance. +ARROW_EXPORT std::shared_ptr uuid(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc new file mode 100644 index 00000000000..3bbb6eeb4ae --- /dev/null +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/uuid.h" + +#include "arrow/testing/matchers.h" + +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" + +#include "arrow/testing/extension_type.h" + +namespace arrow { + +using arrow::ipc::test::RoundtripBatch; + +TEST(TestUuuidExtensionType, ExtensionTypeTest) { + auto type = uuid(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto& ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_binary(16), serialized)); + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); +} + +TEST(TestUuuidExtensionType, RoundtripBatch) { + auto ext_type = extension::uuid(); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])"); + auto ext_arr = ExtensionType::WrapArray(ext_type, arr); + + // Pass extension array, expect getting back extension array + std::shared_ptr read_batch; + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", ""}}); + ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), + /*nullable=*/true, /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 83c7ebed4f3..fc220f73a6b 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,6 +32,7 @@ #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/opaque.h" #endif +#include "arrow/extension/uuid.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -147,14 +148,13 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8()}; + std::vector> ext_types{extension::bool8(), extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); #endif - // Register canonical extension types for (const auto& ext_type : ext_types) { ARROW_CHECK_OK( g_registry->RegisterType(checked_pointer_cast(ext_type))); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f104c984a64..f49ffc5cba5 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -30,6 +30,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -41,6 +42,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ASSERT_EQ(deserialized->byte_width(), 16); } -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, IpcRoundtrip) { auto ext_arr = ExampleUuid(); auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 9b56928c688..0e84ea6124d 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 87c02e2d87a..fb4f6bd8ead 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -27,8 +27,10 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/ipc/test_common.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -242,11 +244,11 @@ Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -1088,9 +1090,9 @@ Status MakeUuid(std::shared_ptr* out) { auto f1 = field("f1", uuid_type, /*nullable=*/false); auto schema = ::arrow::schema({f0, f1}); - auto a0 = std::make_shared( + auto a0 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])")); - auto a1 = std::make_shared( + auto a1 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK", "JIHGFEDBA9876543"])")); @@ -1176,12 +1178,13 @@ enable_if_t::value, void> FillRandomData( Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed) { - const auto& element_type = internal::checked_cast(*type); + const auto& element_type = arrow::internal::checked_cast(*type); std::vector strides; if (row_major_p) { - RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK( + arrow::internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; @@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index db8613cbb1e..9b7e7f13e3a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 104a5697b57..e9ec13e98b4 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -43,7 +43,6 @@ namespace arrow { using compute::Cast; using compute::CastOptions; - using internal::checked_cast; using internal::checked_pointer_cast; @@ -2038,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test { void SetUp() { type_ = uuid(); storage_type_ = fixed_size_binary(16); - uuid_type_ = checked_cast(type_.get()); + uuid_type_ = checked_cast(type_.get()); } protected: @@ -2049,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test { } std::shared_ptr type_, storage_type_; - const UuidType* uuid_type_{nullptr}; + const ExampleUuidType* uuid_type_{nullptr}; const std::string_view uuid_string1_{UUID_STRING1}; const std::string_view uuid_string2_{UUID_STRING2}; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 6515631f202..a4526e31c2b 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,14 +27,14 @@ namespace arrow { -class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray { +class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ARROW_TESTING_EXPORT UuidType : public ExtensionType { +class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType { public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} + ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {} std::string extension_name() const override { return "uuid"; } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 95de16c715f..ae2e53b30a3 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,9 +49,13 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -847,17 +851,17 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types -bool UuidType::ExtensionEquals(const ExtensionType& other) const { +bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } -std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { +std::shared_ptr ExampleUuidType::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } -Result> UuidType::Deserialize( +Result> ExampleUuidType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { if (serialized != "uuid-serialized") { return Status::Invalid("Type identifier did not match: '", serialized, "'"); @@ -866,7 +870,7 @@ Result> UuidType::Deserialize( return Status::Invalid("Invalid storage type for UuidType: ", storage_type->ToString()); } - return std::make_shared(); + return std::make_shared(); } bool SmallintType::ExtensionEquals(const ExtensionType& other) const { @@ -982,7 +986,7 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } -std::shared_ptr uuid() { return std::make_shared(); } +std::shared_ptr uuid() { return std::make_shared(); } std::shared_ptr smallint() { return std::make_shared(); } diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 47310c905a9..38338bdc210 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('arrow.uuid', '', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5658f949cee..1106f8aaffd 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,8 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +.. _uuid_extension: + UUID ==== diff --git a/docs/source/status.rst b/docs/source/status.rst index 5e2c2cc19c8..b685d4bbf8a 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -121,7 +121,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | JSON | | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | | | ✓ | | | | | | +| UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | 8-bit Boolean | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 807bcdc3150..d31c93119b7 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,9 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixed_shape_tensor, - opaque, - bool8, + bool8, fixed_shape_tensor, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,8 +182,9 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, OpaqueType, - Bool8Type, PyExtensionType, UnknownExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,8 +217,9 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - Bool8Array, scalar, NA, _NULL as NULL, Scalar, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,8 +235,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 77d6c9c06d2..1587de0e6b7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4338,6 +4338,12 @@ cdef class ExtensionArray(Array): return result +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. + """ + + cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6f510cfc0c0..c2346750a19 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,6 +2865,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: + cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray): + pass + + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index a7c3b496a00..5c3d981c3ad 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type +cdef class UuidType(BaseExtensionType): + cdef: + const CUuidType* uuid_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 19a26bd6c68..d3e2ff2e99d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -120,14 +120,17 @@ cdef api object pyarrow_wrap_data_type( elif type.get().id() == _Type_EXTENSION: ext_type = type.get() cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) + extension_name = ext_type.extension_name() if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": + elif extension_name == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) - elif ext_type.extension_name() == b"arrow.opaque": + elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.bool8": - out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.uuid": + out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72ae2aee5f8..68f77832c43 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -17,6 +17,7 @@ import collections from cython cimport binding +from uuid import UUID cdef class Scalar(_Weakrefable): @@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class UuidScalar(ExtensionScalar): + """ + Concrete class for Uuid extension scalar. + """ + + def as_py(self): + return None if self.value is None else UUID(bytes=self.value.as_py()) + + cdef class FixedShapeTensorScalar(ExtensionScalar): """ Concrete class for fixed shape tensor extension scalar. diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index 6941769e4ef..7c58bae3342 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/datum.h" -#include "arrow/extension_type.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/python/gdb.h" #include "arrow/record_batch.h" @@ -37,6 +37,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidType; using ipc::internal::json::ArrayFromJSON; using ipc::internal::json::ChunkedArrayFromJSON; using ipc::internal::json::ScalarFromJSON; @@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail { std::string ToString() const override { return "This is a detail"; } }; -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } -}; - std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, std::string_view json, int64_t offset = 0, int64_t length = -1) { diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx index c1bf9aae1ec..309b574dc02 100644 --- a/python/pyarrow/tests/extensions.pyx +++ b/python/pyarrow/tests/extensions.pyx @@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil: class UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "example-uuid"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == this->extension_name(); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0d50c467e96..aacbd2cb6e7 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -95,18 +95,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidScalarType(pa.ExtensionScalar): +class ExampleUuidScalarType(pa.ExtensionScalar): def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class UuidType(pa.ExtensionType): +class ExampleUuidType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType') + + def __reduce__(self): + return ExampleUuidType, () def __arrow_ext_scalar_class__(self): - return UuidScalarType + return ExampleUuidScalarType def __arrow_ext_serialize__(self): return b'' @@ -116,10 +119,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidType2(pa.ExtensionType): +class ExampleUuidType2(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2') def __arrow_ext_serialize__(self): return b'' @@ -250,8 +253,8 @@ def ipc_read_batch(buf): def test_ext_type_basics(): - ty = UuidType() - assert ty.extension_name == "pyarrow.tests.UuidType" + ty = ExampleUuidType() + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" def test_ext_type_str(): @@ -267,16 +270,16 @@ def test_ext_type_repr(): def test_ext_type_lifetime(): - ty = UuidType() + ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None def test_ext_type_storage_type(): - ty = UuidType() + ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) - assert ty.__class__ is UuidType + assert ty.__class__ is ExampleUuidType ty = ParamExtType(5) assert ty.storage_type == pa.binary(5) assert ty.__class__ is ParamExtType @@ -284,7 +287,7 @@ def test_ext_type_storage_type(): def test_ext_type_byte_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.byte_width == 16 ty = ParamExtType(5) assert ty.byte_width == 5 @@ -297,7 +300,7 @@ def test_ext_type_byte_width(): def test_ext_type_bit_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.bit_width == 128 ty = ParamExtType(5) assert ty.bit_width == 40 @@ -309,7 +312,7 @@ def test_ext_type_bit_width(): def test_ext_type_as_py(): - ty = UuidType() + ty = ExampleUuidType() expected = uuid4() scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes) assert scalar.as_py() == expected @@ -342,12 +345,22 @@ def test_ext_type_as_py(): def test_uuid_type_pickle(pickle_module): for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): - ty = UuidType() + ty = ExampleUuidType() ser = pickle_module.dumps(ty, protocol=proto) del ty ty = pickle_module.loads(ser) wr = weakref.ref(ty) - assert ty.extension_name == "pyarrow.tests.UuidType" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" + del ty + assert wr() is None + + for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): + ty = pa.uuid() + ser = pickle_module.dumps(ty, protocol=proto) + del ty + ty = pickle_module.loads(ser) + wr = weakref.ref(ty) + assert ty.extension_name == "arrow.uuid" del ty assert wr() is None @@ -358,8 +371,8 @@ def test_ext_type_equality(): c = ParamExtType(6) assert a != b assert b == c - d = UuidType() - e = UuidType() + d = ExampleUuidType() + e = ExampleUuidType() assert a != d assert d == e @@ -403,7 +416,7 @@ def test_ext_array_equality(): storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage3 = pa.array([], type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) a = pa.ExtensionArray.from_storage(ty1, storage1) @@ -451,9 +464,9 @@ def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] storage = pa.array(data, type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) - ty3 = UuidType2() + ty3 = ExampleUuidType2() a = pa.ExtensionArray.from_storage(ty1, storage) b = pa.ExtensionArray.from_storage(ty2, storage) @@ -462,9 +475,9 @@ def test_ext_scalar_from_array(): scalars_a = list(a) assert len(scalars_a) == 4 - assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert isinstance(a[0], UuidScalarType) - assert isinstance(scalars_a[0], UuidScalarType) + assert ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType + assert isinstance(a[0], ExampleUuidScalarType) + assert isinstance(scalars_a[0], ExampleUuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -505,7 +518,7 @@ def test_ext_scalar_from_array(): def test_ext_scalar_from_storage(): - ty = UuidType() + ty = ExampleUuidType() s = pa.ExtensionScalar.from_storage(ty, None) assert isinstance(s, pa.ExtensionScalar) @@ -706,14 +719,14 @@ def test_cast_between_extension_types(): tiny_int_arr.cast(pa.int64()).cast(IntegerType()) # Between the same extension types is okay - array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType()) - out = array.cast(UuidType()) - assert out.type == UuidType() + array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType()) + out = array.cast(ExampleUuidType()) + assert out.type == ExampleUuidType() # Will still fail casting between extensions who share storage type, # can only cast between exactly the same extension types. with pytest.raises(TypeError, match='Casting from *'): - array.cast(UuidType2()) + array.cast(ExampleUuidType2()) def test_cast_to_extension_with_extension_storage(): @@ -744,10 +757,10 @@ def test_cast_nested_extension_types(data, type_factory): def test_casting_dict_array_to_extension_type(): storage = pa.array([b"0123456789abcdef"], type=pa.binary(16)) - arr = pa.ExtensionArray.from_storage(UuidType(), storage) + arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage) dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()), arr) - out = dict_arr.cast(UuidType()) + out = dict_arr.cast(ExampleUuidType()) assert isinstance(out, pa.ExtensionArray) assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'), UUID('30313233-3435-3637-3839-616263646566')] @@ -1347,7 +1360,7 @@ def test_cpp_extension_in_python(tmpdir): mod = __import__('extensions') uuid_type = mod._make_uuid_type() - assert uuid_type.extension_name == "uuid" + assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) array = mod._make_uuid_array() @@ -1356,6 +1369,31 @@ def test_cpp_extension_in_python(tmpdir): assert array[0].as_py() == b'abcdefghijklmno0' assert array[1].as_py() == b'0onmlkjihgfedcba' + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == uuid_type + assert reconstructed_array == array + + +def test_uuid_extension(): + data = [b"0123456789abcdef", b"0123456789abcdef", + b"zyxwvutsrqponmlk", None] + + uuid_type = pa.uuid() + assert uuid_type.extension_name == "arrow.uuid" + assert uuid_type.storage_type == pa.binary(16) + assert uuid_type.__class__ is pa.UuidType + + storage = pa.array(data, pa.binary(16)) + array = pa.ExtensionArray.from_storage(uuid_type, storage) + assert array.type == uuid_type + + assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data] + assert array[0].as_py() == UUID(bytes=data[0]) + assert array[3].as_py() is None + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) batch = ipc_read_batch(buf) @@ -1363,6 +1401,9 @@ def test_cpp_extension_in_python(tmpdir): assert reconstructed_array.type == uuid_type assert reconstructed_array == array + assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar + assert isinstance(array[0], pa.UuidScalar) + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 0d12d710dcf..2ac2f55754f 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow): check_stack_repr( gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow): check_heap_repr( gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow): check_stack_repr( gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension", ' + ('arrow::ExtensionScalar of type "extension", ' 'value arrow::FixedSizeBinaryScalar of size 16, ' 'value "0123456789abcdef"')) check_stack_repr( gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension", null value') + 'arrow::ExtensionScalar of type "extension", null value') def test_scalars_heap(gdb_arrow): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c26..f83ecc3aa43 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1765,6 +1765,25 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.uuid_ext_type = type.get() + + def __arrow_ext_class__(self): + return UuidArray + + def __reduce__(self): + return uuid, () + + def __arrow_ext_scalar_class__(self): + return UuidScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5208,6 +5227,21 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def uuid(): + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + + cdef UuidType out = UuidType.__new__(UuidType) + c_uuid_ext_type = GetResultValue(CUuidType.Make()) + out.init(c_uuid_ext_type) + return out + + def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional