Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ set(ARROW_SRCS
device.cc
extension_type.cc
extension/bool8.cc
extension/uuid.cc
pretty_print.cc
record_batch.cc
result.cc
Expand Down Expand Up @@ -1225,6 +1226,7 @@ add_subdirectory(testing)
add_subdirectory(array)
add_subdirectory(c)
add_subdirectory(compute)
add_subdirectory(extension)
add_subdirectory(io)
add_subdirectory(tensor)
add_subdirectory(util)
Expand Down Expand Up @@ -1267,7 +1269,6 @@ endif()

if(ARROW_JSON)
add_subdirectory(json)
add_subdirectory(extension)
endif()

if(ARROW_ORC)
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/acero/hash_join_node_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/compute/light_array_internal.h"
#include "arrow/compute/row/row_encoder_internal.h"
#include "arrow/extension/uuid.h"
#include "arrow/testing/extension_type.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

set(CANONICAL_EXTENSION_TESTS bool8_test.cc)
set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)

if(ARROW_JSON)
list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc)
Expand Down
17 changes: 2 additions & 15 deletions cpp/src/arrow/extension/fixed_shape_tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "arrow/array/array_primitive.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/writer.h"
#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/tensor.h"
#include "arrow/testing/gtest_util.h"
Expand All @@ -33,6 +33,7 @@
namespace arrow {

using FixedShapeTensorType = extension::FixedShapeTensorType;
using arrow::ipc::test::RoundtripBatch;
using extension::fixed_shape_tensor;
using extension::FixedShapeTensorArray;

Expand Down Expand Up @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test {
std::string serialized_;
};

auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));

ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());

io::BufferReader reader(complete_ipc_stream);
std::shared_ptr<RecordBatchReader> batch_reader;
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
ASSERT_OK(batch_reader->ReadNext(out));
};

TEST_F(TestExtensionType, CheckDummyRegistration) {
// We need a registered dummy type at runtime to allow for IPC deserialization
auto registered_type = GetExtensionType("arrow.fixed_shape_tensor");
Expand Down
58 changes: 58 additions & 0 deletions cpp/src/arrow/extension/uuid.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <sstream>

#include "arrow/extension_type.h"
#include "arrow/util/logging.h"

#include "arrow/extension/uuid.h"

namespace arrow::extension {

// Equality between extension types is decided purely by extension name here:
// `other` matches when it reports the same name as this type.
bool UuidType::ExtensionEquals(const ExtensionType& other) const {
  const std::string own_name = this->extension_name();
  const std::string other_name = other.extension_name();
  return other_name == own_name;
}

// Wraps `data` in a UuidArray and returns it as a generic Array handle.
//
// The two DCHECK_EQ statements assert the expected shape of the input:
// `data->type` must be an extension type, and its extension name must be
// "arrow.uuid". No other validation of `data` is performed in this function.
std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
// The cast relies on the preceding check that the type id is EXTENSION.
DCHECK_EQ("arrow.uuid",
static_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<UuidArray>(data);
}

// Rebuilds a UuidType from a storage type plus a serialized metadata string.
//
// Returns Status::Invalid when `serialized` is non-empty, or when
// `storage_type` is not equal to fixed_size_binary(16). The metadata check
// runs first, so it determines the error when both conditions fail.
Result<std::shared_ptr<DataType>> UuidType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
  const bool has_metadata = !serialized.empty();
  if (has_metadata) {
    return Status::Invalid("Unexpected serialized metadata: '", serialized, "'");
  }

  const auto expected_storage = fixed_size_binary(16);
  if (storage_type->Equals(*expected_storage)) {
    return std::make_shared<UuidType>();
  }
  return Status::Invalid("Invalid storage type for UuidType: ",
                         storage_type->ToString());
}

// Renders the type as "extension<NAME>", where NAME is extension_name().
// `show_metadata` is accepted for interface compatibility but is not consulted.
std::string UuidType::ToString(bool show_metadata) const {
  std::string repr = "extension<";
  repr += this->extension_name();
  repr += ">";
  return repr;
}

// Factory function: allocates a new UuidType and hands it back through the
// generic DataType pointer type.
std::shared_ptr<DataType> uuid() {
  return std::make_shared<UuidType>();
}

} // namespace arrow::extension
61 changes: 61 additions & 0 deletions cpp/src/arrow/extension/uuid.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/extension_type.h"

namespace arrow::extension {

/// \brief UuidArray stores an array of UUIDs. The underlying storage type is
/// FixedSizeBinary(16).
class ARROW_EXPORT UuidArray : public ExtensionArray {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add docstrings for these classes, even if they end up very concise?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.

public:
using ExtensionArray::ExtensionArray;
};

/// \brief UuidType is a canonical arrow extension type for UUIDs.
///
/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this
/// type does not interpret the bytes in any way. A specific UUID version is
/// neither required nor guaranteed.
///
/// The type has no parameters: its serialized form is the empty string and its
/// extension name is always "arrow.uuid".
class ARROW_EXPORT UuidType : public ExtensionType {
public:
/// \brief Construct a UuidType.
///
/// The storage type is fixed to fixed_size_binary(16).
UuidType() : ExtensionType(fixed_size_binary(16)) {}

/// \brief Return the extension name, "arrow.uuid".
std::string extension_name() const override { return "arrow.uuid"; }

/// \brief Return a string representation of the form "extension<NAME>",
/// where NAME is the extension name.
///
/// \param[in] show_metadata not used by this type's implementation
std::string ToString(bool show_metadata = false) const override;

/// \brief Compare with another extension type.
///
/// \param[in] other the extension type to compare against
/// \return true if `other` has the same extension name as this type
bool ExtensionEquals(const ExtensionType& other) const override;

/// Create a UuidArray from ArrayData
///
/// \param[in] data array data whose type is expected to be this extension type
/// \return the data wrapped in a UuidArray
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

/// \brief Reconstruct a UuidType from a storage type and serialized metadata.
///
/// \param[in] storage_type the storage type; must equal fixed_size_binary(16)
/// \param[in] serialized the serialized metadata; must be empty
/// \return a new UuidType, or Status::Invalid if either requirement is not met
Result<std::shared_ptr<DataType>> Deserialize(
std::shared_ptr<DataType> storage_type,
const std::string& serialized) const override;

/// \brief Return the serialized metadata, which is always the empty string.
std::string Serialize() const override { return ""; }

/// \brief Create a UuidType instance
///
/// \return a newly allocated UuidType wrapped in a Result
static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
};

/// \brief Return a UuidType instance.
ARROW_EXPORT std::shared_ptr<DataType> uuid();

} // namespace arrow::extension
72 changes: 72 additions & 0 deletions cpp/src/arrow/extension/uuid_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension/uuid.h"

#include "arrow/testing/matchers.h"

#include "arrow/io/memory.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/test_common.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/key_value_metadata.h"

#include "arrow/testing/extension_type.h"

namespace arrow {

using arrow::ipc::test::RoundtripBatch;

// Checks that the canonical UUID extension type survives a
// Serialize()/Deserialize() round trip and stays distinct from its storage type.
TEST(TestUuuidExtensionType, ExtensionTypeTest) {
  // Qualify the factory explicitly. Inside namespace arrow an unqualified
  // uuid() does not name extension::uuid(), so without the qualifier this test
  // would not exercise the canonical extension::UuidType.
  auto type = extension::uuid();
  ASSERT_EQ(type->id(), Type::EXTENSION);

  const auto& ext_type = static_cast<const ExtensionType&>(*type);
  std::string serialized = ext_type.Serialize();

  // Deserializing the type's own serialization over the expected storage type
  // must produce an equal extension type.
  ASSERT_OK_AND_ASSIGN(auto deserialized,
                       ext_type.Deserialize(fixed_size_binary(16), serialized));
  ASSERT_TRUE(deserialized->Equals(*type));
  // The extension type must not compare equal to the bare storage type.
  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));
}

// Round-trips record batches holding UUID data through RoundtripBatch, once
// with the extension array itself and once with the storage array plus
// extension metadata on the field. Both results are compared against the
// extension-typed batch.
TEST(TestUuuidExtensionType, RoundtripBatch) {
  auto uuid_type = extension::uuid();
  auto concrete_type = internal::checked_pointer_cast<extension::UuidType>(uuid_type);
  auto storage_arr =
      ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])");
  auto uuid_arr = ExtensionType::WrapArray(uuid_type, storage_arr);

  // Case 1: the field carries the extension type and the column is the
  // extension array.
  auto typed_field = field(/*name=*/"f0", /*type=*/uuid_type);
  auto typed_batch =
      RecordBatch::Make(schema({typed_field}), uuid_arr->length(), {uuid_arr});
  std::shared_ptr<RecordBatch> typed_result;
  RoundtripBatch(typed_batch, &typed_result);
  CompareBatch(*typed_batch, *typed_result, /*compare_metadata=*/true);

  // Case 2: the field carries the storage type plus extension metadata and the
  // column is the plain storage array; the result is still compared with the
  // extension-typed batch.
  auto annotations =
      key_value_metadata({{"ARROW:extension:name", concrete_type->extension_name()},
                          {"ARROW:extension:metadata", ""}});
  auto storage_field = field(/*name=*/"f0", /*type=*/concrete_type->storage_type(),
                             /*nullable=*/true, /*metadata=*/annotations);
  auto storage_batch = RecordBatch::Make(schema({storage_field}),
                                         storage_arr->length(), {storage_arr});
  std::shared_ptr<RecordBatch> storage_result;
  RoundtripBatch(storage_batch, &storage_result);
  CompareBatch(*typed_batch, *storage_result, /*compare_metadata=*/true);
}

} // namespace arrow
4 changes: 2 additions & 2 deletions cpp/src/arrow/extension_type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/extension/opaque.h"
#endif
#include "arrow/extension/uuid.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
Expand Down Expand Up @@ -147,14 +148,13 @@ static void CreateGlobalRegistry() {
// Register canonical extension types

g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8()};
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), extension::uuid()};

#ifdef ARROW_JSON
ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
ext_types.push_back(extension::opaque(null(), "", ""));
#endif

// Register canonical extension types
for (const auto& ext_type : ext_types) {
ARROW_CHECK_OK(
g_registry->RegisterType(checked_pointer_cast<ExtensionType>(ext_type)));
Expand Down
19 changes: 4 additions & 15 deletions cpp/src/arrow/extension_type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/options.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/test_common.h"
#include "arrow/ipc/writer.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
Expand All @@ -41,6 +42,8 @@

namespace arrow {

using arrow::ipc::test::RoundtripBatch;

class Parametric1Array : public ExtensionArray {
public:
using ExtensionArray::ExtensionArray;
Expand Down Expand Up @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType {

class TestExtensionType : public ::testing::Test {
public:
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<UuidType>())); }
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<ExampleUuidType>())); }

void TearDown() {
if (GetExtensionType("uuid")) {
Expand Down Expand Up @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) {
ASSERT_EQ(deserialized->byte_width(), 16);
}

auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));

ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());

io::BufferReader reader(complete_ipc_stream);
std::shared_ptr<RecordBatchReader> batch_reader;
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
ASSERT_OK(batch_reader->ReadNext(out));
};

TEST_F(TestExtensionType, IpcRoundtrip) {
auto ext_arr = ExampleUuid();
auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/integration/json_integration_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) {

auto storage_array =
ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])");
AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array));
AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array));

AssertArraysEqual(*batch->column(1), NullArray(2));
}
Expand Down
Loading