Scalable MinHash computation
- Node.js 8 or higher
npm i @5app/lsh
- Customise the base class for your dataset
const Lsh = require('@5app/lsh')
// Default permutation parameters, used when the constructor is called without arguments.
const B = 10; // default for `bands`
const R = 5; // default for `height`
/**
 * Dataset-specific subclass of Lsh.
 * Every method below is a stub: fill each one in for your own data source.
 * The `...custom` rest parameter collects any extra properties passed in the
 * argument object (presumably the `custom` value given to `run` - confirm
 * against the base class).
 */
class MyDataLsh extends Lsh {
constructor (bands = B, height = R) { // default permutation params, forwarded to Lsh
super(bands, height)
}
async getColumnIdSlice ({ cursorId, size, ...custom }) {
// return {size} ids, starting from cursorId
// (presumably column ids, going by the method name)
}
async getRowIdSlice ({ cursorId, size, ...custom }) {
// return {size} ids, starting from cursorId
// (presumably row ids, going by the method name)
}
async getRowCount ({...custom }) {
// return the total number of rows
}
async getShingles ({ columnIds, rowIds, ...custom }) {
// return the shingles for the specified columns and rows
}
async store ({ index, buckets, data, ...custom }) {
// store a batch of minhashes and bucket info
// the data object can be used to keep values in memory
}
async finalise ({ blocks, columns, rows, stamp, data }) {
// ... finalise the lsh storage
// return the report object
}
static get limit () {
// return the permutation limit
}
static signature(value, index) {
// return the stringified value
}
static ignore (bucketId) {
// return whether this bucket is null
}
static format (bucketId, index) {
// return the formatted bucketId to append to the minhash
}
}
module.exports = MyDataLsh
- Compute and compare your minhashes
// NOTE: the `await` expressions below need an async context. In a CommonJS
// file (one using `require`, as here) wrap this code in an `async` function.
const MyDataLsh = require('./myDataLsh')
// compare and getItemMinHash come from a local module, not from this snippet
const { compare, getItemMinHash } = require('./myMethods')
const myDataLsh = new MyDataLsh(10, 10) // bands = 10, height = 10
// ...
// compute and store your items' minhashes
const size = 25 // size of blocks to be computed
// `custom` is not defined in this snippet: supply your own value
// (presumably what the subclass methods receive as `...custom` - confirm)
const report = await myDataLsh.run(custom, size)
// ...
// compare two items' minhashes (both are fetched in parallel)
const [ minHashA, minHashB ] = await Promise.all([
getItemMinHash(itemA.id),
getItemMinHash(itemB.id)
])
const similarity = compare(minHashA, minHashB)
// ...
npm test
We use SemVer for versioning. For the versions available, see the tags on this repository.