Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add icu_provider::fxhash_32 #4028

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions provider/core/src/fxhash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

/// Hashes a complete byte slice with FxHash and returns the 32-bit result.
///
/// The function is `const`, so ICU4X components can rely on it for a stable hash
/// that is evaluated at compile time.
///
/// # Examples
///
/// ```
/// use zerovec::zeroslice;
/// use zerovec::ule::AsULE;
///
/// let nums = zeroslice!(u32; <u32 as AsULE>::ULE::from_unsigned; [1, 2, 3, 4, 5]);
/// let hashed = icu_provider::fxhash_32(nums.as_bytes());
///
/// assert_eq!(hashed, 0xF7495CFD);
/// ```
pub const fn fxhash_32(bytes: &[u8]) -> u32 {
    // The public entry point hashes every byte: nothing is skipped at either end.
    const NOTHING_SKIPPED: usize = 0;
    fxhash_32_trim(bytes, NOTHING_SKIPPED, NOTHING_SKIPPED)
}

/// Const function to compute the FxHash of a byte array.
///
/// FxHash is a speedy hash algorithm used within rustc. The algorithm is satisfactory for our
/// use case since the strings being hashed originate from a trusted source (the ICU4X
/// components), and the hashes are computed at compile time, so we can check for collisions.
///
/// We could have considered a SHA or other cryptographic hash function. However, we are using
/// FxHash because:
///
/// 1. There is precedent for this algorithm in Rust
/// 2. The algorithm is easy to implement as a const function
/// 3. The amount of code is small enough that we can reasonably keep the algorithm in-tree
/// 4. FxHash is designed to output 32-bit or 64-bit values, whereas SHA outputs more bits,
/// such that truncation would be required in order to fit into a u32, partially reducing
/// the benefit of a cryptographically secure algorithm
// The indexing operations in this function have been reviewed in detail and won't panic.
#[allow(clippy::indexing_slicing)]
pub(crate) const fn fxhash_32_trim(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this used to be called _manual_slice and that's also what the const fns in locid are called.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should also document this function's behavior

I think the comments about why we use FxHash should go on the module or on the other function. This function should have short documentation saying something like "runs fxhash_32 with a manual slice", and should then document what it means for it to be a manual slice.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. It's not a manual slice; it takes "skip start" and "skip end" instead of "start" and "limit". This function has been around for a while, and this PR does not seek to change its call sites.
  2. This function is not public; I just moved it. I made a new public API for it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, can you still move the docs?

bytes: &[u8],
ignore_leading: usize,
ignore_trailing: usize,
) -> u32 {
// This code is adapted from https://github.com/rust-lang/rustc-hash,
// whose license text is reproduced below.
//
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

if ignore_leading + ignore_trailing >= bytes.len() {
return 0;
}

#[inline]
const fn hash_word_32(mut hash: u32, word: u32) -> u32 {
const ROTATE: u32 = 5;
const SEED32: u32 = 0x9e_37_79_b9;
hash = hash.rotate_left(ROTATE);
hash ^= word;
hash = hash.wrapping_mul(SEED32);
hash
}

let mut cursor = ignore_leading;
let end = bytes.len() - ignore_trailing;
let mut hash = 0;

while end - cursor >= 4 {
let word = u32::from_le_bytes([
bytes[cursor],
bytes[cursor + 1],
bytes[cursor + 2],
bytes[cursor + 3],
]);
hash = hash_word_32(hash, word);
cursor += 4;
}

if end - cursor >= 2 {
let word = u16::from_le_bytes([bytes[cursor], bytes[cursor + 1]]);
hash = hash_word_32(hash, word as u32);
cursor += 2;
}

if end - cursor >= 1 {
hash = hash_word_32(hash, bytes[cursor] as u32);
}

hash
}

#[test]
fn test_hash_word_32() {
    // (input, ignore_leading, ignore_trailing): in every case the skipped ranges
    // cover the whole input (or more), so no bytes are hashed and the result is 0.
    let nothing_left = [
        ("", 0, 0),
        ("a", 1, 0),
        ("a", 0, 1),
        ("a", 0, 10),
        ("a", 10, 0),
        ("a", 1, 1),
    ];
    for &(input, leading, trailing) in nothing_left.iter() {
        assert_eq!(0, fxhash_32_trim(input.as_bytes(), leading, trailing));
    }

    // Reference hashes for untrimmed inputs of length 1 through 9, which together
    // exercise the 4-byte, 2-byte and 1-byte steps of the algorithm.
    let reference: [(u32, &str); 9] = [
        (0xF3051F19, "a"),
        (0x2F9DF119, "ab"),
        (0xCB1D9396, "abc"),
        (0x8628F119, "abcd"),
        (0xBEBDB56D, "abcde"),
        (0x1CE8476D, "abcdef"),
        (0xC0F176A4, "abcdefg"),
        (0x09AB476D, "abcdefgh"),
        (0xB72F5D88, "abcdefghi"),
    ];
    for &(expected, input) in reference.iter() {
        assert_eq!(expected, fxhash_32_trim(input.as_bytes(), 0, 0));
    }
}
95 changes: 2 additions & 93 deletions provider/core/src/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use crate::error::{DataError, DataErrorKind};

use crate::fallback::{LocaleFallbackConfig, LocaleFallbackPriority, LocaleFallbackSupplement};
use crate::fxhash::fxhash_32_trim;
use alloc::borrow::Cow;
use core::fmt;
use core::fmt::Write;
Expand Down Expand Up @@ -50,7 +51,7 @@ pub struct DataKeyHash([u8; 4]);

impl DataKeyHash {
const fn compute_from_path(path: DataKeyPath) -> Self {
let hash = fxhash_32(
let hash = fxhash_32_trim(
path.tagged.as_bytes(),
leading_tag!().len(),
trailing_tag!().len(),
Expand All @@ -64,79 +65,6 @@ impl DataKeyHash {
}
}

/// Const function to compute the 32-bit FxHash of a byte array, skipping a given
/// number of bytes at each end.
///
/// Only the bytes after the first `ignore_leading` and before the last
/// `ignore_trailing` bytes of `bytes` contribute to the hash. If the two counts
/// together cover the whole slice (or more than the whole slice), nothing is left
/// to hash and the function returns `0`.
///
/// The remaining bytes are consumed as little-endian 4-byte words, followed by at
/// most one little-endian 2-byte word and at most one single byte.
///
/// FxHash is a speedy hash algorithm used within rustc. The algorithm is satisfactory for our
/// use case since the strings being hashed originate from a trusted source (the ICU4X
/// components), and the hashes are computed at compile time, so we can check for collisions.
///
/// We could have considered a SHA or other cryptographic hash function. However, we are using
/// FxHash because:
///
/// 1. There is precedent for this algorithm in Rust
/// 2. The algorithm is easy to implement as a const function
/// 3. The amount of code is small enough that we can reasonably keep the algorithm in-tree
/// 4. FxHash is designed to output 32-bit or 64-bit values, whereas SHA outputs more bits,
///    such that truncation would be required in order to fit into a u32, partially reducing
///    the benefit of a cryptographically secure algorithm
// The indexing operations in this function have been reviewed in detail and won't panic.
#[allow(clippy::indexing_slicing)]
const fn fxhash_32(bytes: &[u8], ignore_leading: usize, ignore_trailing: usize) -> u32 {
    // This code is adapted from https://github.com/rust-lang/rustc-hash,
    // whose license text is reproduced below.
    //
    // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
    // file at the top-level directory of this distribution and at
    // http://rust-lang.org/COPYRIGHT.
    //
    // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
    // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
    // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
    // option. This file may not be copied, modified, or distributed
    // except according to those terms.

    // Saturating addition: a plain `+` overflows when the two skip counts are huge,
    // which panics in debug builds and, when wrapping, lets the guard pass and the
    // arithmetic below underflow. Saturating keeps "skip everything" a clean `0`.
    if ignore_leading.saturating_add(ignore_trailing) >= bytes.len() {
        return 0;
    }

    /// Folds one word into the running hash: rotate, xor, multiply.
    #[inline]
    const fn hash_word_32(mut hash: u32, word: u32) -> u32 {
        const ROTATE: u32 = 5;
        const SEED32: u32 = 0x9e_37_79_b9;
        hash = hash.rotate_left(ROTATE);
        hash ^= word;
        hash = hash.wrapping_mul(SEED32);
        hash
    }

    // The guard above guarantees `ignore_leading < end <= bytes.len()`, so neither the
    // subtractions nor the indexing below can go out of range.
    let mut cursor = ignore_leading;
    let end = bytes.len() - ignore_trailing;
    let mut hash = 0;

    // `while` rather than an iterator chain: iterators are not usable in a `const fn`.
    while end - cursor >= 4 {
        let word = u32::from_le_bytes([
            bytes[cursor],
            bytes[cursor + 1],
            bytes[cursor + 2],
            bytes[cursor + 3],
        ]);
        hash = hash_word_32(hash, word);
        cursor += 4;
    }

    if end - cursor >= 2 {
        let word = u16::from_le_bytes([bytes[cursor], bytes[cursor + 1]]);
        // Lossless widening; `as` is used because `u32::from` is not callable in const context.
        hash = hash_word_32(hash, word as u32);
        cursor += 2;
    }

    if end - cursor >= 1 {
        // Lossless widening of the final odd byte.
        hash = hash_word_32(hash, bytes[cursor] as u32);
    }

    hash
}

impl<'a> zerovec::maps::ZeroMapKV<'a> for DataKeyHash {
type Container = zerovec::ZeroVec<'a, DataKeyHash>;
type Slice = zerovec::ZeroSlice<DataKeyHash>;
Expand Down Expand Up @@ -672,25 +600,6 @@ fn test_key_to_string() {
}
}

#[test]
fn test_hash_word_32() {
    // Inputs whose skipped ranges cover everything (or more) must hash to zero.
    let hash_trimmed = |input: &str, leading: usize, trailing: usize| -> u32 {
        fxhash_32(input.as_bytes(), leading, trailing)
    };
    assert_eq!(0, hash_trimmed("", 0, 0));
    assert_eq!(0, hash_trimmed("a", 1, 0));
    assert_eq!(0, hash_trimmed("a", 0, 1));
    assert_eq!(0, hash_trimmed("a", 0, 10));
    assert_eq!(0, hash_trimmed("a", 10, 0));
    assert_eq!(0, hash_trimmed("a", 1, 1));

    // Reference values for untrimmed inputs, one per length from 1 to 9.
    let hash_whole = |input: &str| -> u32 { fxhash_32(input.as_bytes(), 0, 0) };
    assert_eq!(0xF3051F19, hash_whole("a"));
    assert_eq!(0x2F9DF119, hash_whole("ab"));
    assert_eq!(0xCB1D9396, hash_whole("abc"));
    assert_eq!(0x8628F119, hash_whole("abcd"));
    assert_eq!(0xBEBDB56D, hash_whole("abcde"));
    assert_eq!(0x1CE8476D, hash_whole("abcdef"));
    assert_eq!(0xC0F176A4, hash_whole("abcdefg"));
    assert_eq!(0x09AB476D, hash_whole("abcdefgh"));
    assert_eq!(0xB72F5D88, hash_whole("abcdefghi"));
}

#[test]
fn test_key_hash() {
struct KeyTestCase {
Expand Down
2 changes: 2 additions & 0 deletions provider/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ mod data_provider;
mod error;
#[doc(hidden)]
pub mod fallback;
mod fxhash;
mod key;
mod request;
mod response;
Expand All @@ -157,6 +158,7 @@ pub use crate::data_provider::DataProvider;
pub use crate::data_provider::DynamicDataProvider;
pub use crate::error::DataError;
pub use crate::error::DataErrorKind;
pub use crate::fxhash::fxhash_32;
pub use crate::key::DataKey;
pub use crate::key::DataKeyHash;
pub use crate::key::DataKeyMetadata;
Expand Down