Skip to content

Commit

Permalink
Use HashTable instead of raw_entry_mut (#6537)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Oct 10, 2024
1 parent 5508978 commit 2bd7890
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 57 deletions.
55 changes: 22 additions & 33 deletions arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, Gen
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use hashbrown::HashTable;
use std::any::Any;
use std::sync::Arc;

Expand All @@ -37,12 +36,7 @@ where
T: ByteArrayType,
{
state: ahash::RandomState,
/// Used to provide a lookup from string value to key type
///
/// Note: usize's hash implementation is not used, instead the raw entry
/// API is used to store keys w.r.t the hash of the strings themselves
///
dedup: HashMap<usize, (), ()>,
dedup: HashTable<usize>,

keys_builder: PrimitiveBuilder<K>,
values_builder: GenericByteBuilder<T>,
Expand All @@ -69,7 +63,7 @@ where
let values_builder = GenericByteBuilder::<T>::new();
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
dedup: HashTable::with_capacity(keys_builder.capacity()),
keys_builder,
values_builder,
}
Expand Down Expand Up @@ -123,7 +117,7 @@ where
let state = ahash::RandomState::default();
let dict_len = dictionary_values.len();

let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
let mut dedup = HashTable::with_capacity(dict_len);

let values_len = dictionary_values.value_data().len();
let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
Expand All @@ -137,15 +131,13 @@ where
let value_bytes: &[u8] = value.as_ref();
let hash = state.hash_one(value_bytes);

let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
value_bytes == get_bytes(&values_builder, *idx)
});

if let RawEntryMut::Vacant(v) = entry {
v.insert_with_hasher(hash, idx, (), |idx| {
state.hash_one(get_bytes(&values_builder, *idx))
});
}
dedup
.entry(
hash,
|idx: &usize| value_bytes == get_bytes(&values_builder, *idx),
|idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)),
)
.or_insert(idx);

values_builder.append_value(value);
}
Expand Down Expand Up @@ -216,24 +208,21 @@ where
let storage = &mut self.values_builder;
let hash = state.hash_one(value_bytes);

let entry = self
let idx = *self
.dedup
.raw_entry_mut()
.from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));

let key = match entry {
RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
RawEntryMut::Vacant(entry) => {
.entry(
hash,
|idx| value_bytes == get_bytes(storage, *idx),
|idx| state.hash_one(get_bytes(storage, *idx)),
)
.or_insert_with(|| {
let idx = storage.len();
storage.append_value(value);
idx
})
.get();

entry.insert_with_hasher(hash, idx, (), |idx| {
state.hash_one(get_bytes(storage, *idx))
});

K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?
}
};
let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
self.keys_builder.append_value(key);

Ok(key)
Expand Down
35 changes: 11 additions & 24 deletions parquet/src/util/interner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
// under the License.

use crate::data_type::AsBytes;
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use hashbrown::HashTable;

const DEFAULT_DEDUP_CAPACITY: usize = 4096;

Expand All @@ -44,11 +43,7 @@ pub struct Interner<S: Storage> {
state: ahash::RandomState,

/// Used to provide a lookup from value to unique value
///
/// Note: `S::Key`'s hash implementation is not used, instead the raw entry
/// API is used to store keys w.r.t the hash of the strings themselves
///
dedup: HashMap<S::Key, (), ()>,
dedup: HashTable<S::Key>,

storage: S,
}
Expand All @@ -58,7 +53,7 @@ impl<S: Storage> Interner<S> {
pub fn new(storage: S) -> Self {
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(DEFAULT_DEDUP_CAPACITY, ()),
dedup: HashTable::with_capacity(DEFAULT_DEDUP_CAPACITY),
storage,
}
}
Expand All @@ -67,23 +62,15 @@ impl<S: Storage> Interner<S> {
pub fn intern(&mut self, value: &S::Value) -> S::Key {
let hash = self.state.hash_one(value.as_bytes());

let entry = self
*self
.dedup
.raw_entry_mut()
.from_hash(hash, |index| value == self.storage.get(*index));

match entry {
RawEntryMut::Occupied(entry) => *entry.into_key(),
RawEntryMut::Vacant(entry) => {
let key = self.storage.push(value);

*entry
.insert_with_hasher(hash, key, (), |key| {
self.state.hash_one(self.storage.get(*key).as_bytes())
})
.0
}
}
.entry(
hash,
|index| value == self.storage.get(*index),
|key| self.state.hash_one(self.storage.get(*key).as_bytes()),
)
.or_insert_with(|| self.storage.push(value))
.get()
}

/// Return estimate of the memory used, in bytes
Expand Down

0 comments on commit 2bd7890

Please sign in to comment.