From 65c5590bb4f1f6ed57c1c9c2dc4cc55f898bb6cb Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 2 Jul 2025 16:14:28 +0100 Subject: [PATCH 1/5] feat[encodings/dict]: allow different nullability of codes and values in dict Signed-off-by: Joe Isaacs --- encodings/dict/src/array.rs | 20 ++------------------ encodings/dict/src/serde.rs | 12 ++++++++++-- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index b7cb2a15119..ffbe606dd51 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -1,7 +1,7 @@ use std::fmt::Debug; use arrow_buffer::BooleanBuffer; -use vortex_array::compute::{cast, take}; +use vortex_array::compute::take; use vortex_array::stats::{ArrayStats, StatsSetRef}; use vortex_array::vtable::{ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityVTable}; use vortex_array::{ @@ -46,27 +46,11 @@ pub struct DictArray { pub struct DictEncoding; impl DictArray { - pub fn try_new(mut codes: ArrayRef, values: ArrayRef) -> VortexResult { + pub fn try_new(codes: ArrayRef, values: ArrayRef) -> VortexResult { if !codes.dtype().is_unsigned_int() { vortex_bail!(MismatchedTypes: "unsigned int", codes.dtype()); } - let dtype = values.dtype(); - if dtype.is_nullable() { - // If the values are nullable, we force codes to be nullable as well. - codes = cast(&codes, &codes.dtype().as_nullable())?; - } else { - // If the values are non-nullable, we assert the codes are non-nullable as well. - if codes.dtype().is_nullable() { - vortex_bail!("Cannot have nullable codes for non-nullable dict array"); - } - } - assert_eq!( - codes.dtype().nullability(), - values.dtype().nullability(), - "Mismatched nullability between codes and values" - ); - Ok(Self { codes, values, diff --git a/encodings/dict/src/serde.rs b/encodings/dict/src/serde.rs index d74b987987d..c7b178f9061 100644 --- a/encodings/dict/src/serde.rs +++ b/encodings/dict/src/serde.rs @@ -4,7 +4,7 @@ use vortex_array::{ Array, ArrayBufferVisitor, ArrayChildVisitor, Canonical, DeserializeMetadata, ProstMetadata, }; use vortex_buffer::ByteBuffer; -use vortex_dtype::{DType, PType}; +use vortex_dtype::{DType, Nullability, PType}; use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err}; use crate::builders::dict_encode; @@ -16,6 +16,8 @@ pub struct DictMetadata { values_len: u32, #[prost(enumeration = "PType", tag = "2")] codes_ptype: i32, + #[prost(optional, bool, tag = "3")] + is_nullable_codes: Option, } impl SerdeVTable for DictVTable { @@ -28,6 +30,7 @@ impl SerdeVTable for DictVTable { values_len: u32::try_from(array.values().len()).map_err(|_| { vortex_err!("Diction values cannot exceed u32 in length for serialization") })?, + is_nullable_codes: Some(array.codes().dtype().is_nullable()), }))) } @@ -45,7 +48,11 @@ impl SerdeVTable for DictVTable { children.len() ) } - let codes_dtype = DType::Primitive(metadata.codes_ptype(), dtype.nullability()); + let codes_nullable: Nullability = metadata + .is_nullable_codes + .unwrap_or_else(|| dtype.is_nullable()) + .into(); + let codes_dtype = DType::Primitive(metadata.codes_ptype(), codes_nullable); let codes = children.get(0, &codes_dtype, len)?; let values = children.get(1, dtype, metadata.values_len as usize)?; @@ -89,6 +96,7 @@ mod test { ProstMetadata(DictMetadata { codes_ptype: PType::U64 as i32, values_len: u32::MAX, + is_nullable_codes: None, }), ); } From bf220f188c9203328555efb5105f76c0d1bdefa2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 2 Jul 2025 16:17:15 +0100 Subject: [PATCH 2/5] u Signed-off-by: Joe Isaacs --- encodings/dict/src/serde.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/encodings/dict/src/serde.rs b/encodings/dict/src/serde.rs index c7b178f9061..b2ca22d7e30 100644 --- a/encodings/dict/src/serde.rs +++ b/encodings/dict/src/serde.rs @@ -16,6 +16,7 @@ pub struct DictMetadata { values_len: u32, #[prost(enumeration = "PType", tag = "2")] codes_ptype: i32, + // nullable codes are optional since they were added after stabilisation #[prost(optional, bool, tag = "3")] is_nullable_codes: Option, } @@ -50,6 +51,8 @@ impl SerdeVTable for DictVTable { } let codes_nullable: Nullability = metadata .is_nullable_codes + // The old behaviour of (without `is_nullable_codes` metadata) used the nullability + // of the values (and whole array). .unwrap_or_else(|| dtype.is_nullable()) .into(); let codes_dtype = DType::Primitive(metadata.codes_ptype(), codes_nullable); From f84f1ddb4b5efe7a5f21bf9c3237b0862b921a14 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 1 Sep 2025 12:57:49 +0100 Subject: [PATCH 3/5] fix Signed-off-by: Joe Isaacs --- encodings/dict/src/array.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index ffbe606dd51..5872f325594 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -231,6 +231,24 @@ mod test { assert_eq!(indices, [2, 4]); } + #[test] + fn nullable_codes_and_non_null_values() { + let dict = DictArray::try_new( + PrimitiveArray::new( + buffer![0u32, 1, 2, 2, 1], + Validity::from(BooleanBuffer::from(vec![true, false, true, false, true])), + ) + .into_array(), + PrimitiveArray::new(buffer![3, 6, 9], Validity::NonNullable).into_array(), + ) + .unwrap(); + let mask = dict.validity_mask().unwrap(); + let AllOr::Some(indices) = mask.indices() else { + vortex_panic!("Expected indices from mask") + }; + assert_eq!(indices, [0, 2, 4]); + } + fn make_dict_primitive_chunks( len: usize, unique_values: usize, From b8496c80ea69e8ab3f1d79d10c09a0e7df654c96 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 1 Sep 2025 13:04:00 +0100 Subject: [PATCH 4/5] fix Signed-off-by: Joe Isaacs --- encodings/dict/src/array.rs | 4 ++-- encodings/dict/src/serde.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 75bb91dd0d9..6937578ce8b 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -11,7 +11,7 @@ use vortex_array::{ Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, ToCanonical, vtable, }; use vortex_dtype::{DType, match_each_integer_ptype}; -use vortex_error::{VortexExpect as _, VortexResult, vortex_bail, vortex_ensure}; +use vortex_error::{VortexExpect as _, VortexResult, vortex_bail}; use vortex_mask::{AllOr, Mask}; vtable!(Dict); @@ -83,7 +83,7 @@ impl DictArray { /// of the `values` array. Otherwise, this constructor returns an error. /// /// It is an error to provide a nullable `codes` with non-nullable `values`. - pub fn try_new(mut codes: ArrayRef, values: ArrayRef) -> VortexResult { + pub fn try_new(codes: ArrayRef, values: ArrayRef) -> VortexResult { if !codes.dtype().is_unsigned_int() { vortex_bail!(MismatchedTypes: "unsigned int", codes.dtype()); } diff --git a/encodings/dict/src/serde.rs b/encodings/dict/src/serde.rs index 5dc36929f47..eb33131fcfa 100644 --- a/encodings/dict/src/serde.rs +++ b/encodings/dict/src/serde.rs @@ -8,7 +8,7 @@ use vortex_array::{ }; use vortex_buffer::ByteBuffer; use vortex_dtype::{DType, Nullability, PType}; -use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err}; +use vortex_error::{VortexResult, vortex_bail, vortex_err}; use crate::builders::dict_encode; use crate::{DictArray, DictEncoding, DictVTable}; From 4e905a24a5eb3173dcda0c061cdd66bb0fb36ba3 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 1 Sep 2025 14:11:00 +0100 Subject: [PATCH 5/5] fix Signed-off-by: Joe Isaacs --- encodings/dict/src/array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 6937578ce8b..718216bb8ee 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -283,7 +283,7 @@ mod test { PrimitiveArray::new(buffer![3, 6, 9], Validity::NonNullable).into_array(), ) .unwrap(); - let mask = dict.validity_mask().unwrap(); + let mask = dict.validity_mask(); let AllOr::Some(indices) = mask.indices() else { vortex_panic!("Expected indices from mask") };