diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 285a4f035e24..128a4f820665 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -20,8 +20,7 @@ use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, Gen use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; -use hashbrown::hash_map::RawEntryMut; -use hashbrown::HashMap; +use hashbrown::HashTable; use std::any::Any; use std::sync::Arc; @@ -37,12 +36,7 @@ where T: ByteArrayType, { state: ahash::RandomState, - /// Used to provide a lookup from string value to key type - /// - /// Note: usize's hash implementation is not used, instead the raw entry - /// API is used to store keys w.r.t the hash of the strings themselves - /// - dedup: HashMap, + dedup: HashTable, keys_builder: PrimitiveBuilder, values_builder: GenericByteBuilder, @@ -69,7 +63,7 @@ where let values_builder = GenericByteBuilder::::new(); Self { state: Default::default(), - dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()), + dedup: HashTable::with_capacity(keys_builder.capacity()), keys_builder, values_builder, } @@ -123,7 +117,7 @@ where let state = ahash::RandomState::default(); let dict_len = dictionary_values.len(); - let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ()); + let mut dedup = HashTable::with_capacity(dict_len); let values_len = dictionary_values.value_data().len(); let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); @@ -137,15 +131,13 @@ where let value_bytes: &[u8] = value.as_ref(); let hash = state.hash_one(value_bytes); - let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| { - value_bytes == get_bytes(&values_builder, *idx) - }); - - if let RawEntryMut::Vacant(v) = entry { - v.insert_with_hasher(hash, idx, (), |idx| { - state.hash_one(get_bytes(&values_builder, *idx)) - }); - } + dedup + .entry( + hash, + |idx: &usize| value_bytes == get_bytes(&values_builder, *idx), + |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)), + ) + .or_insert(idx); values_builder.append_value(value); } @@ -216,24 +208,21 @@ where let storage = &mut self.values_builder; let hash = state.hash_one(value_bytes); - let entry = self + let idx = *self .dedup - .raw_entry_mut() - .from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx)); - - let key = match entry { - RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()), - RawEntryMut::Vacant(entry) => { + .entry( + hash, + |idx| value_bytes == get_bytes(storage, *idx), + |idx| state.hash_one(get_bytes(storage, *idx)), + ) + .or_insert_with(|| { let idx = storage.len(); storage.append_value(value); + idx + }) + .get(); - entry.insert_with_hasher(hash, idx, (), |idx| { - state.hash_one(get_bytes(storage, *idx)) - }); - - K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)? - } - }; + let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?; self.keys_builder.append_value(key); Ok(key) diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs index a804419b5da7..489d4d58122c 100644 --- a/parquet/src/util/interner.rs +++ b/parquet/src/util/interner.rs @@ -16,8 +16,7 @@ // under the License. use crate::data_type::AsBytes; -use hashbrown::hash_map::RawEntryMut; -use hashbrown::HashMap; +use hashbrown::HashTable; const DEFAULT_DEDUP_CAPACITY: usize = 4096; @@ -44,11 +43,7 @@ pub struct Interner { state: ahash::RandomState, /// Used to provide a lookup from value to unique value - /// - /// Note: `S::Key`'s hash implementation is not used, instead the raw entry - /// API is used to store keys w.r.t the hash of the strings themselves - /// - dedup: HashMap, + dedup: HashTable, storage: S, } @@ -58,7 +53,7 @@ impl Interner { pub fn new(storage: S) -> Self { Self { state: Default::default(), - dedup: HashMap::with_capacity_and_hasher(DEFAULT_DEDUP_CAPACITY, ()), + dedup: HashTable::with_capacity(DEFAULT_DEDUP_CAPACITY), storage, } } @@ -67,23 +62,15 @@ impl Interner { pub fn intern(&mut self, value: &S::Value) -> S::Key { let hash = self.state.hash_one(value.as_bytes()); - let entry = self + *self .dedup - .raw_entry_mut() - .from_hash(hash, |index| value == self.storage.get(*index)); - - match entry { - RawEntryMut::Occupied(entry) => *entry.into_key(), - RawEntryMut::Vacant(entry) => { - let key = self.storage.push(value); - - *entry - .insert_with_hasher(hash, key, (), |key| { - self.state.hash_one(self.storage.get(*key).as_bytes()) - }) - .0 - } - } + .entry( + hash, + |index| value == self.storage.get(*index), + |key| self.state.hash_one(self.storage.get(*key).as_bytes()), + ) + .or_insert_with(|| self.storage.push(value)) + .get() } /// Return estimate of the memory used, in bytes