Skip to content

Commit e04fafb

Browse files
aihuaxupitrou
authored andcommitted
GH-47838: [C++][Parquet] Set Variant specification version to 1 to align with the variant spec (#47835)
### Rationale for this change According to the [Variant specification](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md), the specification_version field must be set to 1 to indicate Variant encoding version 1. Currently, this field defaults to 0, which violates the specification. Parquet readers that strictly enforce specification version validation will fail to read files containing Variant types. <img width="624" height="185" alt="image" src="https://github.com/user-attachments/assets/b0f1deb9-0301-4b94-a472-17fd9cc0df5d" /> ### What changes are included in this PR? The change includes defaulting the specification version to 1. ### Are these changes tested? The change is covered by unit test. ### Are there any user-facing changes? The Parquet files produced the variant logical type annotation `VARIANT(1)`. ``` Schema: message schema { optional group V (VARIANT(1)) = 1 { required binary metadata; required binary value; } } ``` * GitHub Issue: #47838 Lead-authored-by: Aihua <aihua.xu@snowflake.com> Co-authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 07e4d5a commit e04fafb

File tree

3 files changed

+92
-11
lines changed

3 files changed

+92
-11
lines changed

cpp/src/parquet/schema_test.cc

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1580,7 +1580,8 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) {
15801580
LogicalType::EdgeInterpolationAlgorithm::KARNEY),
15811581
"Geography(crs=srid:1234, algorithm=karney)",
15821582
R"({"Type": "Geography", "crs": "srid:1234", "algorithm": "karney"})"},
1583-
{LogicalType::Variant(), "Variant", R"({"Type": "Variant"})"},
1583+
{LogicalType::Variant(), "Variant(1)", R"({"Type": "Variant", "SpecVersion": 1})"},
1584+
{LogicalType::Variant(2), "Variant(2)", R"({"Type": "Variant", "SpecVersion": 2})"},
15841585
{LogicalType::None(), "None", R"({"Type": "None"})"},
15851586
};
15861587

@@ -2353,6 +2354,37 @@ TEST(TestLogicalTypeSerialization, Roundtrips) {
23532354
// Group nodes ...
23542355
ConfirmGroupNodeRoundtrip("map", LogicalType::Map());
23552356
ConfirmGroupNodeRoundtrip("list", LogicalType::List());
2357+
ConfirmGroupNodeRoundtrip("variant", LogicalType::Variant());
2358+
}
2359+
2360+
TEST(TestLogicalTypeSerialization, VariantSpecificationVersion) {
2361+
// Confirm that Variant logical type sets specification_version to expected value in
2362+
// thrift serialization
2363+
constexpr int8_t spec_version = 2;
2364+
auto metadata = PrimitiveNode::Make("metadata", Repetition::REQUIRED, Type::BYTE_ARRAY);
2365+
auto value = PrimitiveNode::Make("value", Repetition::REQUIRED, Type::BYTE_ARRAY);
2366+
NodePtr variant_node =
2367+
GroupNode::Make("variant", Repetition::REQUIRED, {metadata, value},
2368+
LogicalType::Variant(spec_version));
2369+
2370+
// Verify variant logical type
2371+
auto logical_type = variant_node->logical_type();
2372+
ASSERT_TRUE(logical_type->is_variant());
2373+
const auto& variant_type = checked_cast<const VariantLogicalType&>(*logical_type);
2374+
ASSERT_EQ(variant_type.spec_version(), spec_version);
2375+
2376+
// Verify thrift serialization
2377+
std::vector<format::SchemaElement> elements;
2378+
ToParquet(reinterpret_cast<GroupNode*>(variant_node.get()), &elements);
2379+
2380+
// Verify that logicalType is set and is VARIANT
2381+
ASSERT_EQ(elements[0].name, "variant");
2382+
ASSERT_TRUE(elements[0].__isset.logicalType);
2383+
ASSERT_TRUE(elements[0].logicalType.__isset.VARIANT);
2384+
2385+
// Verify that specification_version is set properly
2386+
ASSERT_TRUE(elements[0].logicalType.VARIANT.__isset.specification_version);
2387+
ASSERT_EQ(elements[0].logicalType.VARIANT.specification_version, spec_version);
23562388
}
23572389

23582390
} // namespace schema

cpp/src/parquet/types.cc

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,12 @@ std::shared_ptr<const LogicalType> LogicalType::FromThrift(
591591

592592
return GeographyLogicalType::Make(std::move(crs), algorithm);
593593
} else if (type.__isset.VARIANT) {
594-
return VariantLogicalType::Make();
594+
int8_t spec_version = kVariantSpecVersion;
595+
if (type.VARIANT.__isset.specification_version) {
596+
spec_version = type.VARIANT.specification_version;
597+
}
598+
599+
return VariantLogicalType::Make(spec_version);
595600
} else {
596601
// Sentinel type for one we do not recognize
597602
return UndefinedLogicalType::Make();
@@ -659,8 +664,8 @@ std::shared_ptr<const LogicalType> LogicalType::Geography(
659664
return GeographyLogicalType::Make(std::move(crs), algorithm);
660665
}
661666

662-
std::shared_ptr<const LogicalType> LogicalType::Variant() {
663-
return VariantLogicalType::Make();
667+
std::shared_ptr<const LogicalType> LogicalType::Variant(int8_t spec_version) {
668+
return VariantLogicalType::Make(spec_version);
664669
}
665670

666671
std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
@@ -1958,16 +1963,53 @@ class LogicalType::Impl::Variant final : public LogicalType::Impl::Incompatible,
19581963
public:
19591964
friend class VariantLogicalType;
19601965

1961-
OVERRIDE_TOSTRING(Variant)
1962-
OVERRIDE_TOTHRIFT(VariantType, VARIANT)
1966+
std::string ToString() const override;
1967+
std::string ToJSON() const override;
1968+
format::LogicalType ToThrift() const override;
1969+
1970+
int8_t spec_version() const { return spec_version_; }
19631971

19641972
private:
1965-
Variant()
1973+
explicit Variant(const int8_t spec_version)
19661974
: LogicalType::Impl(LogicalType::Type::VARIANT, SortOrder::UNKNOWN),
1967-
LogicalType::Impl::Inapplicable() {}
1975+
LogicalType::Impl::Inapplicable() {
1976+
this->spec_version_ = spec_version;
1977+
}
1978+
1979+
int8_t spec_version_;
19681980
};
19691981

1970-
GENERATE_MAKE(Variant)
1982+
int8_t VariantLogicalType::spec_version() const {
1983+
return (dynamic_cast<const LogicalType::Impl::Variant&>(*impl_)).spec_version();
1984+
}
1985+
1986+
std::string LogicalType::Impl::Variant::ToString() const {
1987+
std::stringstream type;
1988+
type << "Variant(" << static_cast<int>(spec_version_) << ")";
1989+
return type.str();
1990+
}
1991+
1992+
std::string LogicalType::Impl::Variant::ToJSON() const {
1993+
std::stringstream json;
1994+
json << R"({"Type": "Variant", "SpecVersion": )" << static_cast<int>(spec_version_)
1995+
<< "}";
1996+
1997+
return json.str();
1998+
}
1999+
2000+
format::LogicalType LogicalType::Impl::Variant::ToThrift() const {
2001+
format::LogicalType type;
2002+
format::VariantType variant_type;
2003+
variant_type.__set_specification_version(spec_version_);
2004+
type.__set_VARIANT(variant_type);
2005+
return type;
2006+
}
2007+
2008+
std::shared_ptr<const LogicalType> VariantLogicalType::Make(const int8_t spec_version) {
2009+
auto logical_type = std::shared_ptr<VariantLogicalType>(new VariantLogicalType());
2010+
logical_type->impl_.reset(new LogicalType::Impl::Variant(spec_version));
2011+
return logical_type;
2012+
}
19712013

19722014
class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
19732015
public LogicalType::Impl::UniversalApplicable {

cpp/src/parquet/types.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ class PARQUET_EXPORT LogicalType {
178178
KARNEY = 5
179179
};
180180

181+
/// \brief The latest supported Variant specification version by this library
182+
static constexpr int8_t kVariantSpecVersion = 1;
183+
181184
/// \brief If possible, return a logical type equivalent to the given legacy
182185
/// converted type (and decimal metadata if applicable).
183186
static std::shared_ptr<const LogicalType> FromConvertedType(
@@ -224,7 +227,8 @@ class PARQUET_EXPORT LogicalType {
224227
static std::shared_ptr<const LogicalType> BSON();
225228
static std::shared_ptr<const LogicalType> UUID();
226229
static std::shared_ptr<const LogicalType> Float16();
227-
static std::shared_ptr<const LogicalType> Variant();
230+
static std::shared_ptr<const LogicalType> Variant(
231+
int8_t specVersion = kVariantSpecVersion);
228232

229233
static std::shared_ptr<const LogicalType> Geometry(std::string crs = "");
230234

@@ -495,7 +499,10 @@ class PARQUET_EXPORT GeographyLogicalType : public LogicalType {
495499
/// \brief Allowed for group nodes only.
496500
class PARQUET_EXPORT VariantLogicalType : public LogicalType {
497501
public:
498-
static std::shared_ptr<const LogicalType> Make();
502+
static std::shared_ptr<const LogicalType> Make(
503+
int8_t specVersion = kVariantSpecVersion);
504+
505+
int8_t spec_version() const;
499506

500507
private:
501508
VariantLogicalType() = default;

0 commit comments

Comments
 (0)