Skip to content

Commit 1ededfe

Browse files
[Variant] Introduce new type over &str for ShortString (#7718)
# Which issue does this PR close? - Closes #7700 This commit introduces `ShortString`, a newtype that wraps around `&str` that enforces a maximum length constraint. This also allows us to perform validation once and removes a superfluous validation check in `append_value`. The now-superflous validation check was needed since users could construct `Variant::ShortString`s directly, without doing input validation. This means you can have a short string variant which actually contains a string that is no longer than 63 bytes. But since we enforce this check upon construction, we can directly match against `Variant::String` and `Variant::ShortString` arms with their respective appending functions (`append_string` and `append_short_string`).
1 parent 7b374b9 commit 1ededfe

File tree

4 files changed

+122
-35
lines changed

4 files changed

+122
-35
lines changed

parquet-variant/src/builder.rs

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
18-
use crate::Variant;
18+
use crate::{ShortString, Variant};
1919
use std::collections::HashMap;
2020

2121
const BASIC_TYPE_BITS: u8 = 2;
22-
const MAX_SHORT_STRING_SIZE: usize = 0x3F;
2322
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
2423

2524
fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 {
@@ -114,11 +113,11 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usi
114113
/// };
115114
/// assert_eq!(
116115
/// variant_object.field_by_name("first_name").unwrap(),
117-
/// Some(Variant::ShortString("Jiaying"))
116+
/// Some(Variant::from("Jiaying"))
118117
/// );
119118
/// assert_eq!(
120119
/// variant_object.field_by_name("last_name").unwrap(),
121-
/// Some(Variant::ShortString("Li"))
120+
/// Some(Variant::from("Li"))
122121
/// );
123122
/// ```
124123
///
@@ -281,17 +280,18 @@ impl VariantBuilder {
281280
self.buffer.extend_from_slice(value);
282281
}
283282

283+
fn append_short_string(&mut self, value: ShortString) {
284+
let inner = value.0;
285+
self.buffer.push(short_string_header(inner.len()));
286+
self.buffer.extend_from_slice(inner.as_bytes());
287+
}
288+
284289
fn append_string(&mut self, value: &str) {
285-
if value.len() <= MAX_SHORT_STRING_SIZE {
286-
self.buffer.push(short_string_header(value.len()));
287-
self.buffer.extend_from_slice(value.as_bytes());
288-
} else {
289-
self.buffer
290-
.push(primitive_header(VariantPrimitiveType::String));
291-
self.buffer
292-
.extend_from_slice(&(value.len() as u32).to_le_bytes());
293-
self.buffer.extend_from_slice(value.as_bytes());
294-
}
290+
self.buffer
291+
.push(primitive_header(VariantPrimitiveType::String));
292+
self.buffer
293+
.extend_from_slice(&(value.len() as u32).to_le_bytes());
294+
self.buffer.extend_from_slice(value.as_bytes());
295295
}
296296

297297
/// Add key to dictionary, return its ID
@@ -390,7 +390,8 @@ impl VariantBuilder {
390390
Variant::Float(v) => self.append_float(v),
391391
Variant::Double(v) => self.append_double(v),
392392
Variant::Binary(v) => self.append_binary(v),
393-
Variant::String(s) | Variant::ShortString(s) => self.append_string(s),
393+
Variant::String(s) => self.append_string(s),
394+
Variant::ShortString(s) => self.append_short_string(s),
394395
Variant::Object(_) | Variant::List(_) => {
395396
unreachable!("Object and List variants cannot be created through Into<Variant>")
396397
}
@@ -639,7 +640,7 @@ mod tests {
639640
builder.append_value("hello");
640641
let (metadata, value) = builder.finish();
641642
let variant = Variant::try_new(&metadata, &value).unwrap();
642-
assert_eq!(variant, Variant::ShortString("hello"));
643+
assert_eq!(variant, Variant::ShortString(ShortString("hello")));
643644
}
644645

645646
{
@@ -688,7 +689,7 @@ mod tests {
688689
assert_eq!(val1, Variant::Int8(2));
689690

690691
let val2 = list.get(2).unwrap();
691-
assert_eq!(val2, Variant::ShortString("test"));
692+
assert_eq!(val2, Variant::ShortString(ShortString("test")));
692693
}
693694
_ => panic!("Expected an array variant, got: {:?}", variant),
694695
}

parquet-variant/src/decoder.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
18+
use crate::ShortString;
1819

1920
use arrow_schema::ArrowError;
2021
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
@@ -273,10 +274,10 @@ pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
273274
}
274275

275276
/// Decodes a short string from the value section of a variant.
276-
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> {
277+
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<ShortString, ArrowError> {
277278
let len = (metadata >> 2) as usize;
278279
let string = string_from_slice(data, 0..len)?;
279-
Ok(string)
280+
ShortString::try_new(string)
280281
}
281282

282283
#[cfg(test)]
@@ -420,7 +421,7 @@ mod tests {
420421
fn test_short_string() -> Result<(), ArrowError> {
421422
let data = [b'H', b'e', b'l', b'l', b'o', b'o'];
422423
let result = decode_short_string(1 | 5 << 2, &data)?;
423-
assert_eq!(result, "Hello");
424+
assert_eq!(result.0, "Hello");
424425
Ok(())
425426
}
426427

parquet-variant/src/variant.rs

Lines changed: 87 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::ops::Deref;
2+
13
// Licensed to the Apache Software Foundation (ASF) under one
24
// or more contributor license agreements. See the NOTICE file
35
// distributed with this work for additional information
@@ -29,6 +31,65 @@ mod list;
2931
mod metadata;
3032
mod object;
3133

34+
const MAX_SHORT_STRING_BYTES: usize = 0x3F;
35+
36+
/// A Variant [`ShortString`]
37+
///
38+
/// This implementation is a zero cost wrapper over `&str` that ensures
39+
/// the length of the underlying string is a valid Variant short string (63 bytes or less)
40+
#[derive(Debug, Clone, Copy, PartialEq)]
41+
pub struct ShortString<'a>(pub(crate) &'a str);
42+
43+
impl<'a> ShortString<'a> {
44+
/// Attempts to interpret `value` as a variant short string value.
45+
///
46+
/// # Validation
47+
///
48+
/// This constructor verifies that `value` is shorter than or equal to `MAX_SHORT_STRING_BYTES`
49+
pub fn try_new(value: &'a str) -> Result<Self, ArrowError> {
50+
if value.len() > MAX_SHORT_STRING_BYTES {
51+
return Err(ArrowError::InvalidArgumentError(format!(
52+
"value is larger than {MAX_SHORT_STRING_BYTES} bytes"
53+
)));
54+
}
55+
56+
Ok(Self(value))
57+
}
58+
59+
/// Returns the underlying Variant short string as a &str
60+
pub fn as_str(&self) -> &'a str {
61+
self.0
62+
}
63+
}
64+
65+
impl<'a> From<ShortString<'a>> for &'a str {
66+
fn from(value: ShortString<'a>) -> Self {
67+
value.0
68+
}
69+
}
70+
71+
impl<'a> TryFrom<&'a str> for ShortString<'a> {
72+
type Error = ArrowError;
73+
74+
fn try_from(value: &'a str) -> Result<Self, Self::Error> {
75+
Self::try_new(value)
76+
}
77+
}
78+
79+
impl<'a> AsRef<str> for ShortString<'a> {
80+
fn as_ref(&self) -> &str {
81+
self.0
82+
}
83+
}
84+
85+
impl<'a> Deref for ShortString<'a> {
86+
type Target = str;
87+
88+
fn deref(&self) -> &Self::Target {
89+
self.0
90+
}
91+
}
92+
3293
/// Represents a [Parquet Variant]
3394
///
3495
/// The lifetimes `'m` and `'v` are for metadata and value buffers, respectively.
@@ -85,7 +146,7 @@ mod object;
85146
///
86147
/// ## Creating `Variant` from Rust Types
87148
/// ```
88-
/// # use parquet_variant::Variant;
149+
/// use parquet_variant::Variant;
89150
/// // variants can be directly constructed
90151
/// let variant = Variant::Int32(123);
91152
/// // or constructed via `From` impls
@@ -98,7 +159,7 @@ mod object;
98159
/// let value = [0x09, 0x48, 0x49];
99160
/// // parse the header metadata
100161
/// assert_eq!(
101-
/// Variant::ShortString("HI"),
162+
/// Variant::from("HI"),
102163
/// Variant::try_new(&metadata, &value).unwrap()
103164
/// );
104165
/// ```
@@ -152,7 +213,7 @@ pub enum Variant<'m, 'v> {
152213
/// Primitive (type_id=1): STRING
153214
String(&'v str),
154215
/// Short String (type_id=2): STRING
155-
ShortString(&'v str),
216+
ShortString(ShortString<'v>),
156217
// need both metadata & value
157218
/// Object (type_id=3): N/A
158219
Object(VariantObject<'m, 'v>),
@@ -165,12 +226,12 @@ impl<'m, 'v> Variant<'m, 'v> {
165226
///
166227
/// # Example
167228
/// ```
168-
/// # use parquet_variant::{Variant, VariantMetadata};
229+
/// use parquet_variant::{Variant, VariantMetadata};
169230
/// let metadata = [0x01, 0x00, 0x00];
170231
/// let value = [0x09, 0x48, 0x49];
171232
/// // parse the header metadata
172233
/// assert_eq!(
173-
/// Variant::ShortString("HI"),
234+
/// Variant::from("HI"),
174235
/// Variant::try_new(&metadata, &value).unwrap()
175236
/// );
176237
/// ```
@@ -189,7 +250,7 @@ impl<'m, 'v> Variant<'m, 'v> {
189250
/// // parse the header metadata first
190251
/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
191252
/// assert_eq!(
192-
/// Variant::ShortString("HI"),
253+
/// Variant::from("HI"),
193254
/// Variant::try_new_with_metadata(metadata, &value).unwrap()
194255
/// );
195256
/// ```
@@ -432,7 +493,7 @@ impl<'m, 'v> Variant<'m, 'v> {
432493
///
433494
/// // you can extract a string from string variants
434495
/// let s = "hello!";
435-
/// let v1 = Variant::ShortString(s);
496+
/// let v1 = Variant::from(s);
436497
/// assert_eq!(v1.as_string(), Some(s));
437498
///
438499
/// // but not from other variants
@@ -441,7 +502,7 @@ impl<'m, 'v> Variant<'m, 'v> {
441502
/// ```
442503
pub fn as_string(&'v self) -> Option<&'v str> {
443504
match self {
444-
Variant::String(s) | Variant::ShortString(s) => Some(s),
505+
Variant::String(s) | Variant::ShortString(ShortString(s)) => Some(s),
445506
_ => None,
446507
}
447508
}
@@ -861,10 +922,25 @@ impl<'v> From<&'v [u8]> for Variant<'_, 'v> {
861922

862923
impl<'v> From<&'v str> for Variant<'_, 'v> {
863924
fn from(value: &'v str) -> Self {
864-
if value.len() < 64 {
865-
Variant::ShortString(value)
866-
} else {
925+
if value.len() > MAX_SHORT_STRING_BYTES {
867926
Variant::String(value)
927+
} else {
928+
Variant::ShortString(ShortString(value))
868929
}
869930
}
870931
}
932+
933+
#[cfg(test)]
934+
mod tests {
935+
use super::*;
936+
937+
#[test]
938+
fn test_construct_short_string() {
939+
let short_string = ShortString::try_new("norm").expect("should fit in short string");
940+
assert_eq!(short_string.as_str(), "norm");
941+
942+
let long_string = "a".repeat(MAX_SHORT_STRING_BYTES + 1);
943+
let res = ShortString::try_new(&long_string);
944+
assert!(res.is_err());
945+
}
946+
}

parquet-variant/tests/variant_interop.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use std::fs;
2424
use std::path::{Path, PathBuf};
2525

2626
use chrono::NaiveDate;
27-
use parquet_variant::{Variant, VariantBuilder};
27+
use parquet_variant::{ShortString, Variant, VariantBuilder};
2828

2929
fn cases_dir() -> PathBuf {
3030
Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -76,7 +76,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> {
7676
("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")),
7777
("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())),
7878
("primitive_timestampntz", Variant::TimestampNtzMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())),
79-
("short_string", Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)")),
79+
("short_string", Variant::ShortString(ShortString::try_new("Less than 64 bytes (❤\u{fe0f} with utf8)").unwrap())),
8080
]
8181
}
8282
#[test]
@@ -130,11 +130,20 @@ fn variant_object_primitive() {
130130
),
131131
("int_field", Variant::Int8(1)),
132132
("null_field", Variant::Null),
133-
("string_field", Variant::ShortString("Apache Parquet")),
133+
(
134+
"string_field",
135+
Variant::ShortString(
136+
ShortString::try_new("Apache Parquet")
137+
.expect("value should fit inside a short string"),
138+
),
139+
),
134140
(
135141
// apparently spark wrote this as a string (not a timestamp)
136142
"timestamp_field",
137-
Variant::ShortString("2025-04-16T12:34:56.78"),
143+
Variant::ShortString(
144+
ShortString::try_new("2025-04-16T12:34:56.78")
145+
.expect("value should fit inside a short string"),
146+
),
138147
),
139148
];
140149
let actual_fields: Vec<_> = variant_object.iter().collect();

0 commit comments

Comments
 (0)