From 0258b2f9d94d8546d46b8825f6dc5af965ed2da0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 4 May 2022 13:19:57 -0700 Subject: [PATCH 1/5] Impl Eq for Field --- arrow/src/datatypes/field.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index 7ad790268ff6..47a6ede300cf 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -27,7 +27,7 @@ use super::DataType; /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Debug, Clone, Hash, PartialOrd, Ord)] pub struct Field { name: String, data_type: DataType, @@ -39,6 +39,17 @@ pub struct Field { metadata: Option>, } +impl PartialEq for Field { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.data_type == other.data_type + && self.nullable == other.nullable + && self.metadata == other.metadata + } +} + +impl Eq for Field {} + impl Field { /// Creates a new field pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { From b55f4854a6066286e322a11fe67cd9de0e8f0377 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 4 May 2022 14:10:52 -0700 Subject: [PATCH 2/5] Impl Hash --- arrow/src/datatypes/field.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index 47a6ede300cf..90d9a86cf29e 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -16,6 +16,7 @@ // under the License. use std::collections::BTreeMap; +use std::hash::{Hash, Hasher}; use serde_derive::{Deserialize, Serialize}; use serde_json::{json, Value}; @@ -27,7 +28,7 @@ use super::DataType; /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, Hash, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialOrd, Ord)] pub struct Field { name: String, data_type: DataType, @@ -50,6 +51,15 @@ impl PartialEq for Field { impl Eq for Field {} +impl Hash for Field { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.data_type.hash(state); + self.nullable.hash(state); + self.metadata.hash(state); + } +} + impl Field { /// Creates a new field pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { From 7a08b938b20547e5a893faa9fc785b3acd4e4226 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 5 May 2022 16:53:05 -0700 Subject: [PATCH 3/5] Impl PartialOrd for Field --- arrow/src/datatypes/field.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index 90d9a86cf29e..4223fa01e677 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use std::cmp::Ordering; use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; @@ -28,7 +29,7 @@ use super::DataType; /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Debug, Clone, Ord)] pub struct Field { name: String, data_type: DataType, @@ -51,6 +52,24 @@ impl PartialEq for Field { impl Eq for Field {} +impl PartialOrd for Field { + fn partial_cmp(&self, other: &Self) -> Option { + let mut ord = self.name.partial_cmp(other.name()); + if let Some(Ordering::Equal) = ord { + ord = self.data_type.partial_cmp(other.data_type()); + + if let Some(Ordering::Equal) = ord { + ord = self.nullable.partial_cmp(&other.nullable); + + if let Some(Ordering::Equal) = ord { + ord = self.metadata.partial_cmp(&other.metadata); + } + } + } + ord + } +} + impl Hash for Field { fn hash(&self, state: &mut H) { self.name.hash(state); From 1484d5697f9b53001481f32dfcd21389818f412e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 5 May 2022 17:25:37 -0700 Subject: [PATCH 4/5] Impl Ord instead --- arrow/src/datatypes/field.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index 4223fa01e677..d428a293a61f 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -29,7 +29,7 @@ use super::DataType; /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, Ord)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Field { name: String, data_type: DataType, @@ -54,15 +54,21 @@ impl Eq for Field {} impl PartialOrd for Field { fn partial_cmp(&self, other: &Self) -> Option { - let mut ord = self.name.partial_cmp(other.name()); - if let Some(Ordering::Equal) = ord { - ord = self.data_type.partial_cmp(other.data_type()); + Some(self.cmp(other)) + } +} + +impl Ord for Field { + fn cmp(&self, other: &Self) -> Ordering { + let mut ord = self.name.cmp(other.name()); + if let Ordering::Equal = ord { + ord = self.data_type.cmp(other.data_type()); - if let Some(Ordering::Equal) = ord { - ord = self.nullable.partial_cmp(&other.nullable); + if let Ordering::Equal = ord { + ord = self.nullable.cmp(&other.nullable); - if let Some(Ordering::Equal) = ord { - ord = self.metadata.partial_cmp(&other.metadata); + if let Ordering::Equal = ord { + ord = self.metadata.cmp(&other.metadata); } } } From f6918128c9e9978a15ce63014f4c718e27355c82 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 May 2022 10:06:34 -0700 Subject: [PATCH 5/5] Add comment and test. --- arrow/src/datatypes/field.rs | 64 ++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index d428a293a61f..ded7fc67b6a8 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -41,6 +41,11 @@ pub struct Field { metadata: Option>, } +// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered` +// into comparison. However, these properties are only used in IPC context +// for matching dictionary encoded data. They are not necessary to be same +// to consider schema equality. For example, in C++ `Field` implementation, +// it doesn't contain these dictionary properties too. impl PartialEq for Field { fn eq(&self, other: &Self) -> bool { self.name == other.name @@ -60,19 +65,11 @@ impl PartialOrd for Field { impl Ord for Field { fn cmp(&self, other: &Self) -> Ordering { - let mut ord = self.name.cmp(other.name()); - if let Ordering::Equal = ord { - ord = self.data_type.cmp(other.data_type()); - - if let Ordering::Equal = ord { - ord = self.nullable.cmp(&other.nullable); - - if let Ordering::Equal = ord { - ord = self.metadata.cmp(&other.metadata); - } - } - } - ord + self.name + .cmp(other.name()) + .then(self.data_type.cmp(other.data_type())) + .then(self.nullable.cmp(&other.nullable)) + .then(self.metadata.cmp(&other.metadata)) } } @@ -669,6 +666,8 @@ impl std::fmt::Display for Field { #[cfg(test)] mod test { use super::{DataType, Field}; + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; #[test] fn test_fields_with_dict_id() { @@ -722,4 +721,43 @@ mod test { assert_eq!(dict2, *field); } } + + fn get_field_hash(field: &Field) -> u64 { + let mut s = DefaultHasher::new(); + field.hash(&mut s); + s.finish() + } + + #[test] + fn test_field_comparison_case() { + // dictionary-encoding properties not used for field comparison + let dict1 = Field::new_dict( + "dict1", + DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), + false, + 10, + false, + ); + let dict2 = Field::new_dict( + "dict1", + DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), + false, + 20, + false, + ); + + assert_eq!(dict1, dict2); + assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); + + let dict1 = Field::new_dict( + "dict0", + DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), + false, + 10, + false, + ); + + assert_ne!(dict1, dict2); + assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2)); + } }