Skip to content

Commit

Permalink
Merge pull request #181 from dynatrace-oss/improved-consistent-weight…
Browse files Browse the repository at this point in the history
…ed-sampling

Improved consistent weighted sampling
  • Loading branch information
oertl authored Nov 24, 2023
2 parents 331804a + 6cdb6ea commit b3e8d1b
Show file tree
Hide file tree
Showing 12 changed files with 551 additions and 107 deletions.
23 changes: 16 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ To add a dependency on hash4j using Maven, use the following:
<dependency>
<groupId>com.dynatrace.hash4j</groupId>
<artifactId>hash4j</artifactId>
<version>0.13.0</version>
<version>0.14.0</version>
</dependency>
```
To add a dependency using Gradle:
```gradle
implementation 'com.dynatrace.hash4j:hash4j:0.13.0'
implementation 'com.dynatrace.hash4j:hash4j:0.14.0'
```

## Hash algorithms
Expand Down Expand Up @@ -193,11 +193,20 @@ HashValue128 hash = FileHashing.imohash1_0_2().hashFileTo128Bits(file);
See also [FileHashingDemo.java](src/test/java/com/dynatrace/hash4j/file/FileHashingDemo.java).

## Consistent hashing
This library contains an implementation of [JumpHash](https://arxiv.org/abs/1406.2294)
that can be used to achieve distributed agreement when assigning hash values to a given number of buckets.
The hash values are distributed uniformly over the buckets.
The algorithm also minimizes the number of reassignments needed for balancing when the number of buckets changes.

This library contains various algorithms for the distributed agreement on the assignment of hash values to a given number of buckets.
In the naive approach, the hash values are assigned to the buckets with the modulo operation according to
`bucketIdx = abs(hash) % numBuckets`.
If the number of buckets is changed, the bucket index will change for most hash values.
With a consistent hash algorithm, the above expression can be replaced by
`bucketIdx = consistentBucketHasher.getBucket(hash, numBuckets)`
to minimize the number of reassignments while still ensuring a fair distribution across all buckets.

The following consistent hashing algorithms are available:
* [JumpHash](https://arxiv.org/abs/1406.2294): This algorithm has a calculation time that scales logarithmically with the number of buckets
* [Improved Consistent Weighted Sampling](https://doi.org/10.1109/ICDM.2010.80): This algorithm is based on improved
consistent weighted sampling with a constant computation time independent of the number of buckets. This algorithm is faster than
JumpHash for large numbers of buckets.

### Usage
```java
// create a consistent bucket hasher
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ java {
}

group = 'com.dynatrace.hash4j'
version = '0.13.0'
version = '0.14.0'

spotless {
ratchetFrom 'origin/main'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright 2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.consistent;

import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
import java.util.SplittableRandom;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

public class ConsistentJumpBucketHasherPerformanceTest {

private static final ConsistentBucketHasher CONSISTENT_BUCKET_HASHER =
ConsistentHashing.jumpHash(PseudoRandomGeneratorProvider.splitMix64_V1());

@State(Scope.Thread)
public static class TestState {

@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
int numBuckets;

SplittableRandom random;

@Setup
public void init() {
random = new SplittableRandom(0x87c5950e6677341eL);
}
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
public void getBucket(TestState testState, Blackhole blackhole) {
int bucket =
CONSISTENT_BUCKET_HASHER.getBucket(testState.random.nextLong(), testState.numBuckets);
blackhole.consume(bucket);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright 2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.consistent;

import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
import java.util.SplittableRandom;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

public class ImprovedConsistentWeightedSamplingPerformanceTest {

private static final ConsistentBucketHasher CONSISTENT_BUCKET_HASHER =
ConsistentHashing.improvedConsistentWeightedSampling(
PseudoRandomGeneratorProvider.splitMix64_V1());

@State(Scope.Thread)
public static class TestState {

@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
int numBuckets;

SplittableRandom random;

@Setup
public void init() {
random = new SplittableRandom(0x87c5950e6677341eL);
}
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
public void getBucket(TestState testState, Blackhole blackhole) {
int bucket =
CONSISTENT_BUCKET_HASHER.getBucket(testState.random.nextLong(), testState.numBuckets);
blackhole.consume(bucket);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright 2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.consistent;

import java.util.SplittableRandom;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

public class ModuloPerformanceTest {

@State(Scope.Thread)
public static class TestState {

@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
int numBuckets;

SplittableRandom random;

@Setup
public void init() {
random = new SplittableRandom(0x87c5950e6677341eL);
}
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
public void getBucket(TestState testState, Blackhole blackhole) {
int bucket = (int) ((testState.random.nextLong() & 0x7FFFFFFFFFFFFFFFL) % testState.numBuckets);
blackhole.consume(bucket);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.consistent;

import java.util.SplittableRandom;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

public class RandomNumberPerformanceTest {

@State(Scope.Thread)
public static class TestState {

SplittableRandom random;

@Setup
public void init() {
random = new SplittableRandom(0x87c5950e6677341eL);
}
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
public void getBucket(TestState testState, Blackhole blackhole) {
blackhole.consume(testState.random.nextLong());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,32 @@ private ConsistentHashing() {}
* consistent hash algorithm." arXiv preprint <a
* href="https://arxiv.org/abs/1406.2294">arXiv:1406.2294</a> (2014).
*
* <p>The average computation time depends logarithmically on the number of buckets.
*
* @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider}
* @return a {@link ConsistentBucketHasher}
*/
public static ConsistentBucketHasher jumpHash(
PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
return new ConsistentJumpBucketHasher(pseudoRandomGeneratorProvider);
}

/**
* Returns a {@link ConsistentBucketHasher}.
*
* <p>This algorithm is based on the method described in Sergey Ioffe, "Improved Consistent
* Sampling, Weighted Minhash and L1 Sketching," 2010, doi: <a
* href="https://doi.org/10.1109/ICDM.2010.80">10.1109/ICDM.2010.80.</a> which is applied to a
* one-dimensional input vector whose value is equal to the number of buckets.
*
* <p>The computation time is constant independent of the number of buckets. This method is faster
* than {@link #jumpHash(PseudoRandomGeneratorProvider)} for large number of buckets.
*
* @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider}
* @return a {@link ConsistentBucketHasher}
*/
public static ConsistentBucketHasher improvedConsistentWeightedSampling(
PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
return new ImprovedConsistentWeightedSampling(pseudoRandomGeneratorProvider);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class ConsistentJumpBucketHasher implements ConsistentBucketHasher {
// see
// https://github.com/google/guava/blob/0a17f4a429323589396c38d8ce75ca058faa6c64/guava/src/com/google/common/hash/Hashing.java#L559
@Override
public int getBucket(long hash, int numBuckets) {
public strictfp int getBucket(long hash, int numBuckets) {
checkArgument(numBuckets > 0, "buckets must be positive");
pseudoRandomGenerator.reset(hash);

Expand All @@ -64,11 +64,10 @@ public int getBucket(long hash, int numBuckets) {
// Jump from bucket to bucket until we go out of range
while (true) {
next = (int) ((candidate + 1) / pseudoRandomGenerator.nextDouble());
if (next > candidate && next < numBuckets) {
candidate = next;
} else {
return candidate;
}
if (next >= numBuckets || next <= candidate)
return candidate; // second condition protects against infinite loops caused by bad random
// values such as NaN or values outside of [0,1)
candidate = next;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright 2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.consistent;

import static com.dynatrace.hash4j.util.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;

import com.dynatrace.hash4j.random.PseudoRandomGenerator;
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;

/**
* Consistent hashing algorithm based on a simplified version of the algorithm described in Sergey
* Ioffe, <a href="https://ieeexplore.ieee.org/abstract/document/5693978">"Improved Consistent
* Sampling, Weighted Minhash and L1 Sketching,"</a> 2010 IEEE International Conference on Data
* Mining, Sydney, NSW, Australia, 2010, pp. 246-255, doi: 10.1109/ICDM.2010.80.
*/
class ImprovedConsistentWeightedSampling implements ConsistentBucketHasher {

private final PseudoRandomGenerator pseudoRandomGenerator;

ImprovedConsistentWeightedSampling(PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
requireNonNull(pseudoRandomGeneratorProvider);
this.pseudoRandomGenerator = pseudoRandomGeneratorProvider.create();
}

@Override
public strictfp int getBucket(long hash, int numBuckets) {
checkArgument(numBuckets > 0, "buckets must be positive");
pseudoRandomGenerator.reset(hash);
double r = pseudoRandomGenerator.nextExponential() + pseudoRandomGenerator.nextExponential();
double b = pseudoRandomGenerator.nextDouble();
double t = StrictMath.floor(StrictMath.log(numBuckets) / r + b);
double y = StrictMath.exp(r * (t - b));
// y should always be in the range [0, numBuckets),
// but could be larger due to numerical inaccuracies,
// therefore limit result after rounding down to numBuckets - 1
return Math.min((int) y, numBuckets - 1);
}
}
Loading

0 comments on commit b3e8d1b

Please sign in to comment.