-
Notifications
You must be signed in to change notification settings - Fork 3
/
SimHash.java
67 lines (60 loc) · 1.27 KB
/
SimHash.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/**
 * Static helpers for building 64-bit SimHash fingerprints of text and for
 * comparing two fingerprints by Hamming distance.
 *
 * <p>All methods are static and operate only on their arguments.
 */
public class SimHash {

    /** Width of a fingerprint in bits. */
    private static final int HASH_BITS = Long.SIZE;

    /**
     * Token separators: one or more of whitespace, '(', ')', '-' or '/'.
     * Compiled once here instead of on every {@link #simHash64(String)} call.
     */
    private static final java.util.regex.Pattern TOKEN_SEPARATORS =
            java.util.regex.Pattern.compile("[\\s()\\-\\/]+");

    /** Starting value of the 64-bit FNV hash. */
    private static final long FNV_OFFSET_BASIS = 0xcbf29ce484222325L;

    /** Multiplier applied after each character is folded into the FNV hash. */
    private static final long FNV_PRIME = 0x100000001b3L;

    /**
     * Generates a 64-bit SimHash fingerprint for a string.
     *
     * <p>The input is split into tokens on runs of whitespace, parentheses,
     * hyphens and slashes; empty tokens are ignored. Every token is hashed with
     * {@link #fvnHash64(String)} and casts one vote per bit position: +1 when
     * the bit is set in the token hash, -1 when it is clear. A fingerprint bit
     * is 1 only when its vote total is strictly positive, so ties resolve to 0.
     *
     * <p>Bit {@code i} of the token hashes (counting from the least significant
     * bit) determines bit {@code 63 - i} of the fingerprint. For a single token
     * the fingerprint is therefore the bit-reversed token hash.
     *
     * @param s the text to fingerprint; must not be null
     * @return the fingerprint, or 0 when {@code s} contains no tokens
     * @throws NullPointerException if {@code s} is null
     */
    public static long simHash64(String s) {
        if (s == null) {
            throw new NullPointerException("s");
        }

        int[] votes = new int[HASH_BITS];
        for (String word : TOKEN_SEPARATORS.split(s)) {
            if (word.isEmpty()) {
                // A separator at the very start of the input yields an empty leading token.
                continue;
            }
            long hash = fvnHash64(word);
            for (int i = 0; i < HASH_BITS; i++) {
                votes[i] += ((hash >>> i) & 1L) == 1L ? 1 : -1;
            }
        }

        // votes[0] is shifted in first, so it ends up in the most significant bit.
        long result = 0L;
        for (int i = 0; i < HASH_BITS; i++) {
            result <<= 1;
            if (votes[i] > 0) {
                result |= 1L;
            }
        }
        return result;
    }

    /**
     * Counts the bit positions in which two 64-bit values differ.
     *
     * @param a first value
     * @param b second value
     * @return the number of differing bits, between 0 and 64 inclusive
     */
    public static int hammingDistance(long a, long b) {
        return Long.bitCount(a ^ b);
    }

    /**
     * Generates a 64-bit FNV-1a style hash for a string ("fvn" in the method
     * name is a misspelling of FNV, kept so existing callers keep working).
     *
     * <p>Each UTF-16 code unit returned by {@link String#charAt(int)} is XORed
     * into the hash as a whole, and the hash is then multiplied by the FNV
     * prime, wrapping on overflow. Characters are not split into bytes first.
     *
     * @param s the string to hash; must not be null
     * @return the hash; the FNV offset basis {@code 0xcbf29ce484222325L} for an
     *         empty string
     * @throws NullPointerException if {@code s} is null
     */
    public static long fvnHash64(String s) {
        long hash = FNV_OFFSET_BASIS;
        for (int i = 0; i < s.length(); i++) {
            hash ^= s.charAt(i);
            hash *= FNV_PRIME;
        }
        return hash;
    }
}