Skip to content
This repository has been archived by the owner on Apr 29, 2020. It is now read-only.

Commit

Permalink
feat: adds js implementation of rabin chunker for windows and browser (
Browse files Browse the repository at this point in the history
  • Loading branch information
achingbrain authored May 24, 2019
1 parent c849359 commit 542b3e4
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 79 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"hamt-sharding": "~0.0.2",
"ipfs-unixfs": "~0.1.16",
"ipld-dag-pb": "~0.17.2",
"long": "^4.0.0",
"multicodec": "~0.5.1",
"multihashing-async": "~0.7.0",
"superstruct": "~0.6.1"
Expand Down
232 changes: 208 additions & 24 deletions src/chunker/rabin.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
'use strict'

const errCode = require('err-code')

let createRabin
const Long = require('long')
const BufferList = require('bl')
let rabin

module.exports = async function * rabinChunker (source, options) {
if (!createRabin) {
if (!rabin) {
try {
createRabin = require('rabin')

if (typeof createRabin !== 'function') {
throw errCode(new Error(`createRabin was not a function`), 'ERR_UNSUPPORTED')
}
} catch (err) {
throw errCode(new Error(`Rabin chunker not available, it may have failed to install or not be supported on this platform`), 'ERR_UNSUPPORTED')
rabin = nativeRabin()
} catch (_) {
// fallback to js implementation
rabin = jsRabin()
}
}

Expand All @@ -30,30 +28,216 @@ module.exports = async function * rabinChunker (source, options) {
}

const sizepow = Math.floor(Math.log2(avg))
const rabin = createRabin({

for await (const chunk of rabin(source, {
min: min,
max: max,
bits: sizepow,
window: options.window,
polynomial: options.polynomial
})
})) {
yield chunk
}
}

const nativeRabin = () => {
const createRabin = require('rabin')

if (typeof rabin !== 'function') {
throw errCode(new Error(`rabin was not a function`), 'ERR_UNSUPPORTED')
}

return async function * (source, options) {
const rabin = createRabin(options)

// TODO: rewrite rabin using node streams v3
for await (const chunk of source) {
rabin.buffers.append(chunk)
rabin.pending.push(chunk)

const sizes = []

rabin.rabin.fingerprint(rabin.pending, sizes)
rabin.pending = []

for (let i = 0; i < sizes.length; i++) {
const size = sizes[i]
const buf = rabin.buffers.slice(0, size)
rabin.buffers.consume(size)

yield buf
}
}

if (rabin.buffers.length) {
yield rabin.buffers.slice(0)
}
}
}

/**
 * Creates a chunker backed by a pure JavaScript rabin fingerprinting
 * implementation, for use where the native module is unavailable.
 *
 * @returns {Function} async generator function `(source, options)` yielding
 *   one buffer per chunk
 */
const jsRabin = () => {
  // see https://github.com/datproject/rabin/blob/c0378395dc0a125ab21ac176ec504f9995b34e62/src/rabin.cc
  class Rabin {
    /**
     * @param {object} options
     * @param {number} [options.window=64] - number of bytes in the sliding window
     * @param {number} options.polynomial - modulus used by the hash.
     *   NOTE(review): this is handled as a JS number; values above
     *   Number.MAX_SAFE_INTEGER cannot be represented exactly - confirm the
     *   caller's default is not affected.
     * @param {number} [options.bits=12] - how many low bits of the digest must
     *   be zero for a chunk boundary
     * @param {number} [options.min=8192] - minimum chunk length in bytes
     * @param {number} [options.max=32768] - maximum chunk length in bytes
     */
    constructor (options) {
      this.window = new Array(options.window || 64).fill(Long.fromInt(0))
      this.wpos = 0
      this.count = 0
      this.digest = Long.fromInt(0)
      this.chunkLength = 0
      this.polynomial = options.polynomial
      this.polynomialDegree = 53
      this.polynomialShift = this.polynomialDegree - 8
      this.averageBits = options.bits || 12
      this.minSize = options.min || 8 * 1024
      this.maxSize = options.max || 32 * 1024
      this.mask = Long.fromInt(1).shiftLeft(this.averageBits).subtract(1)
      this.modTable = []
      this.outTable = []

      this.calculateTables()
    }

    /**
     * Precomputes the two 256-entry lookup tables: `outTable` (XORed into the
     * digest when a byte leaves the window) and `modTable` (XORed into the
     * digest when a byte is appended).
     */
    calculateTables () {
      for (let i = 0; i < 256; i++) {
        let hash = Long.fromInt(0, true)

        hash = this.appendByte(hash, i)

        // byte `i` followed by (window length - 1) zero bytes
        for (let j = 0; j < this.window.length - 1; j++) {
          hash = this.appendByte(hash, 0)
        }

        this.outTable[i] = hash
      }

      const k = this.deg(this.polynomial)

      for (let i = 0; i < 256; i++) {
        const b = Long.fromInt(i, true)

        this.modTable[i] = b.shiftLeft(k)
          .modulo(this.polynomial)
          .or(b.shiftLeft(k))
      }
    }

    /**
     * Finds the index of the highest set bit of `p` by scanning a mask down
     * from bit 63.
     *
     * @param {number|Long} p
     * @returns {Long} the bit index, or -1 when no bit matched
     */
    deg (p) {
      let mask = Long.fromString('0x8000000000000000', true, 16)

      for (let i = 0; i < 64; i++) {
        if (mask.and(p).greaterThan(0)) {
          return Long.fromInt(63 - i)
        }

        mask = mask.shiftRight(1)
      }

      return Long.fromInt(-1)
    }

    /**
     * Shifts `hash` left by one byte, ORs in `b` and reduces the result modulo
     * the polynomial.
     *
     * @param {Long} hash
     * @param {number} b - byte value
     * @returns {Long}
     */
    appendByte (hash, b) {
      hash = hash.shiftLeft(8)
      hash = hash.or(b)

      return hash.modulo(this.polynomial)
    }

    /**
     * Feeds the buffers through the hash and collects the length of every
     * chunk completed while doing so. Bytes after the last boundary are not
     * reported; they stay counted in `this.count` and become part of the next
     * reported length.
     *
     * @param {Array} bufs - byte buffers to process, in order
     * @returns {number[]} lengths of the completed chunks
     */
    getFingerprints (bufs) {
      const lengths = []

      for (let i = 0; i < bufs.length; i++) {
        let buf = bufs[i]

        while (true) {
          const remaining = this.nextChunk(buf)

          if (remaining < 0) {
            break
          }

          buf = buf.slice(remaining)

          lengths.push(this.chunkLength)
        }
      }

      return lengths
    }

    /**
     * Consumes bytes from `buf` until a chunk boundary is reached.
     *
     * @param {Uint8Array} buf
     * @returns {number} offset in `buf` just past the boundary, or -1 when
     *   `buf` ran out first
     */
    nextChunk (buf) {
      for (let i = 0; i < buf.length; i++) {
        const val = Long.fromInt(buf[i])

        this.slide(val)

        this.count++

        // boundary: masked digest bits are all zero once the minimum size is
        // reached, or the maximum size forces a cut
        if ((this.count >= this.minSize && this.digest.and(this.mask).equals(0)) || this.count >= this.maxSize) {
          this.chunkLength = this.count

          this.reset()

          return i + 1
        }
      }

      return -1
    }

    /**
     * Replaces the oldest byte in the window with `value` and updates the
     * digest accordingly.
     *
     * @param {Long} value
     */
    slide (value) {
      const out = this.window[this.wpos].toInt() & 255
      this.window[this.wpos] = value
      this.digest = this.digest.xor(this.outTable[out])
      this.wpos = (this.wpos + 1) % this.window.length

      this.append(value)
    }

    /**
     * Clears the window, counters and digest, then slides in a single 1 byte
     * as the starting state for the next chunk.
     */
    reset () {
      this.window = this.window.map(() => Long.fromInt(0))
      this.wpos = 0
      this.count = 0
      this.digest = Long.fromInt(0)

      this.slide(Long.fromInt(1))
    }

    /**
     * Appends `value` to the digest and XORs in the `modTable` entry selected
     * by the digest bits starting at `polynomialShift`.
     *
     * @param {Long} value
     */
    append (value) {
      const index = this.digest.shiftRight(this.polynomialShift).toInt() & 255
      this.digest = this.digest.shiftLeft(8)
      this.digest = this.digest.or(value)

      const entry = this.modTable[index]

      if (entry) {
        this.digest = this.digest.xor(entry)
      }
    }
  }

  return async function * (source, options) {
    const r = new Rabin(options)
    const buffers = new BufferList()

    for await (const chunk of source) {
      buffers.append(chunk)

      // Review note (@mikeal, Jul 16, 2019): an earlier version collected
      // chunks in a `pending` array that was cleared on every iteration, which
      // was unnecessary because only one chunk is ever processed at a time.
      // He also asked whether getFingerprints returns sizes through the end
      // of the bytes it is given, warning that otherwise tail data could be
      // dropped and the wrong data sliced out of `buffers`. It does not
      // report the tail, but the Rabin instance keeps counting across calls,
      // so the next reported size includes the tail still held in `buffers`
      // and the slices below stay aligned.
      const sizes = r.getFingerprints([chunk])

      for (const size of sizes) {
        const buf = buffers.slice(0, size)
        buffers.consume(size)

        yield buf
      }
    }

    // whatever follows the last boundary is emitted as the final chunk
    if (buffers.length) {
      yield buffers.slice(0)
    }
  }
}
4 changes: 2 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ const ChunkerOptions = struct({
maxChunkSize: 'number?',
avgChunkSize: 'number?',
window: 'number?',
polynomial: 'string?'
polynomial: 'number?'
}, {
maxChunkSize: 262144,
avgChunkSize: 262144,
window: 16,
polynomial: '0x3DF305DFB2A805'
polynomial: 17437180132763653 // https://github.com/ipfs/go-ipfs-chunker/blob/d0125832512163708c0804a3cda060e21acddae4/rabin.go#L11
})

const BuilderOptions = struct({
Expand Down
6 changes: 0 additions & 6 deletions test/chunker-fixed-size.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,6 @@ const rawFile = loadFixture((isNode ? __dirname : 'test') + '/fixtures/1MiB.txt'
describe('chunker: fixed size', function () {
this.timeout(30000)

before(function () {
if (!isNode) {
this.skip()
}
})

it('chunks non flat buffers', async () => {
const b1 = Buffer.alloc(2 * 256)
const b2 = Buffer.alloc(1 * 256)
Expand Down
25 changes: 0 additions & 25 deletions test/chunker-rabin-browser.spec.js

This file was deleted.

Loading

0 comments on commit 542b3e4

Please sign in to comment.