Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions common/sketch/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.roaringbitmap</groupId>
<artifactId>RoaringBitmap</artifactId>
</dependency>
</dependencies>

<build>
Expand Down
116 changes: 0 additions & 116 deletions common/sketch/src/main/java/org/apache/spark/util/sketch/BitArray.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public enum Version {
* <li>The words/longs (numWords * 64 bit)</li>
* </ul>
*/
V1(1);
V1(2);

private final int versionNumber;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ class BloomFilterImpl extends BloomFilter implements Serializable {

private int numHashFunctions;

private BitArray bits;
private RoaringBitmapArray bits;

BloomFilterImpl(int numHashFunctions, long numBits) {
this(new BitArray(numBits), numHashFunctions);
this(new RoaringBitmapArray(numBits), numHashFunctions);
}

private BloomFilterImpl(BitArray bits, int numHashFunctions) {
private BloomFilterImpl(RoaringBitmapArray bits, int numHashFunctions) {
this.bits = bits;
this.numHashFunctions = numHashFunctions;
}
Expand All @@ -48,7 +48,7 @@ public boolean equals(Object other) {

BloomFilterImpl that = (BloomFilterImpl) other;

return this.numHashFunctions == that.numHashFunctions && this.bits.equals(that.bits);
return (this.numHashFunctions == that.numHashFunctions) && this.bits.equals(that.bits);
}

@Override
Expand Down Expand Up @@ -84,18 +84,19 @@ public boolean putString(String item) {

@Override
public boolean putBinary(byte[] item) {
int h1 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, 0);
int h2 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, h1);
// Strategy is taken from Guava's BloomFilterStrategies.MURMUR128_MITZ_64
long[] hashes = new long[2];
Murmur3_128.hashBytes(item, 0, hashes);
long h1 = hashes[0];
long h2 = hashes[1];

long bitSize = bits.bitSize();
boolean bitsChanged = false;
long combinedHash = h1;
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = h1 + (i * h2);
// Flip all the bits if it's negative (guaranteed positive number)
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
bitsChanged |= bits.set(combinedHash % bitSize);
// Make combinedHash positive and indexable
bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
combinedHash += h2;
}
return bitsChanged;
}
Expand All @@ -107,61 +108,59 @@ public boolean mightContainString(String item) {

@Override
public boolean mightContainBinary(byte[] item) {
int h1 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, 0);
int h2 = Murmur3_x86_32.hashUnsafeBytes(item, Platform.BYTE_ARRAY_OFFSET, item.length, h1);
// Strategy is taken from Guava's BloomFilterStrategies.MURMUR128_MITZ_64
long[] hashes = new long[2];
Murmur3_128.hashBytes(item, 0, hashes);

long h1 = hashes[0];
long h2 = hashes[1];

long bitSize = bits.bitSize();
long combinedHash = h1;
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = h1 + (i * h2);
// Flip all the bits if it's negative (guaranteed positive number)
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
if (!bits.get(combinedHash % bitSize)) {
// Make combinedHash positive and indexable
if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
return false;
}
combinedHash += h2;
}
return true;
}

@Override
public boolean putLong(long item) {
// Here we first hash the input long element into 2 int hash values, h1 and h2, then produce n
// hash values by `h1 + i * h2` with 1 <= i <= numHashFunctions.
// Note that `CountMinSketch` uses a different strategy: it hashes the input long element with
// every i to produce n hash values.
// TODO: the strategy of `CountMinSketch` looks more advanced; should we follow it here?
int h1 = Murmur3_x86_32.hashLong(item, 0);
int h2 = Murmur3_x86_32.hashLong(item, h1);
// Strategy is taken from Guava's BloomFilterStrategies.MURMUR128_MITZ_64
long[] hashes = new long[2];
Murmur3_128.hashLong(item, 0, hashes);
long h1 = hashes[0];
long h2 = hashes[1];

long bitSize = bits.bitSize();
boolean bitsChanged = false;
long combinedHash = h1;
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = h1 + (i * h2);
// Flip all the bits if it's negative (guaranteed positive number)
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
bitsChanged |= bits.set(combinedHash % bitSize);
// Make combinedHash positive and indexable
bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
combinedHash += h2;
}
return bitsChanged;
}

@Override
public boolean mightContainLong(long item) {
int h1 = Murmur3_x86_32.hashLong(item, 0);
int h2 = Murmur3_x86_32.hashLong(item, h1);
// Strategy is taken from Guava's BloomFilterStrategies.MURMUR128_MITZ_64
long[] hashes = new long[2];
Murmur3_128.hashLong(item, 0, hashes);
long h1 = hashes[0];
long h2 = hashes[1];

long bitSize = bits.bitSize();
long combinedHash = h1;
for (int i = 1; i <= numHashFunctions; i++) {
int combinedHash = h1 + (i * h2);
// Flip all the bits if it's negative (guaranteed positive number)
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
if (!bits.get(combinedHash % bitSize)) {
if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
return false;
}
combinedHash += h2;
}
return true;
}
Expand Down Expand Up @@ -238,7 +237,7 @@ private void readFrom0(InputStream in) throws IOException {
}

this.numHashFunctions = dis.readInt();
this.bits = BitArray.readFrom(dis);
this.bits = RoaringBitmapArray.readFrom(dis);
}

public static BloomFilterImpl readFrom(InputStream in) throws IOException {
Expand Down
Loading