diff --git a/package.json b/package.json index fe8e9d3..87a1336 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "dependencies": { "@ipld/dag-pb": "^2.1.15", "@web-std/stream": "1.0.1", + "rabin-rs": "^2.0.0", "rabin-wasm": "^0.1.5", "actor": "^2.2.1", "protobufjs": "^6.11.2" diff --git a/src/file.js b/src/file.js index 8f75595..93e2196 100644 --- a/src/file.js +++ b/src/file.js @@ -24,6 +24,16 @@ export const defaults = () => ({ createCID: CID.createV1, }) +/** + * + * @param {Partial} config + * @returns {API.FileWriterConfig} + */ +export const configure = config => ({ + ...defaults(), + ...config, +}) + export const UnixFSLeaf = { code: UnixFS.code, name: UnixFS.name, diff --git a/src/file/api.ts b/src/file/api.ts index 83c0195..ad287ad 100644 --- a/src/file/api.ts +++ b/src/file/api.ts @@ -13,7 +13,7 @@ export * from "../writer/api.js" import * as ChunkerService from "./chunker.js" import * as LayoutService from "./layout.js" -export type { Chunker, Layout, MultihashHasher, MultihashDigest } +export type { Chunker, Layout, MultihashHasher, MultihashDigest, Block } export interface FileWriterService extends FileWriterConfig { diff --git a/src/file/chunker.js b/src/file/chunker.js index 49edbe7..e9488b7 100644 --- a/src/file/chunker.js +++ b/src/file/chunker.js @@ -1,6 +1,7 @@ /* eslint-disable no-nested-ternary */ /* eslint-disable no-unused-vars */ import * as Chunker from "./chunker/api.js" +import * as BufferQueue from "./chunker/buffer.js" import { unreachable, EMPTY, EMPTY_BUFFER } from "../writer/util.js" export * from "./chunker/api.js" @@ -12,15 +13,17 @@ export * from "./chunker/api.js" * @typedef {{ * status: 'none' * config: Config - * buffer: Uint8Array + * buffer: Chunker.Buffer + * byteOffset: number * }} EmptyState * Represents empty state where no chunks have been found yet. * * @typedef {{ * status: 'single' + * byteOffset: number * config: Config - * buffer: Uint8Array - * chunk: Uint8Array + * buffer: Chunker.Buffer + * chunk: Chunker.Buffer * }} SingleChunkState * Represents state where single chunk have been found. In this * state it is not yet clear which file layout can be used, because @@ -28,8 +31,9 @@ export * from "./chunker/api.js" * * @typedef {{ * status: 'multiple' + * byteOffset: number * config: Config - * buffer: Uint8Array + * buffer: Chunker.Buffer * }} MultiChunkState * Represents state where more than one chunks have been found. * @@ -37,7 +41,7 @@ export * from "./chunker/api.js" * * @typedef {{ * state: State - * chunks: Uint8Array[] + * chunks: Chunker.Buffer[] * }} Update */ @@ -47,8 +51,9 @@ export * from "./chunker/api.js" */ export const open = config => ({ config, + byteOffset: 0, status: "none", - buffer: EMPTY_BUFFER, + buffer: BufferQueue.empty(), }) /** @@ -68,39 +73,53 @@ export const chunks = update => update.chunks */ export const append = (state, bytes) => { const { config } = state - const { buffer, chunks } = split(concat(state.buffer, bytes), config.chunker) + const byteOffset = state.byteOffset + bytes.byteLength + const { buffer, chunks } = split( + state.buffer.push(bytes), + // concat(state.buffer, bytes), + config.chunker, + state.byteOffset, + false + ) + switch (state.status) { case "none": switch (chunks.length) { case 0: return { - state: { ...state, buffer }, + state: { ...state, byteOffset, buffer }, chunks: EMPTY, } case 1: return { - state: { ...state, status: "single", buffer, chunk: chunks[0] }, + state: { + ...state, + status: "single", + byteOffset, + buffer, + chunk: chunks[0], + }, chunks: EMPTY, } default: return { - state: { ...state, status: "multiple", buffer }, + state: { ...state, status: "multiple", byteOffset, buffer }, chunks, } } case "single": if (chunks.length === 0) { - return { state: { ...state, buffer }, chunks: EMPTY } + return { state: { ...state, buffer, byteOffset }, chunks: EMPTY } } else { const { chunk, ...rest } = state return { - state: { ...rest, status: "multiple", buffer }, + state: { ...rest, status: "multiple", byteOffset, buffer }, chunks: [chunk, ...chunks], } } case "multiple": return { - state: { ...state, buffer }, + state: { ...state, byteOffset, buffer }, chunks, } default: @@ -110,30 +129,28 @@ export const append = (state, bytes) => { /** * @param {State} state - * @returns {{single: true, chunk:Uint8Array}|{single: false, chunks: Uint8Array[]}} + * @returns {{single: true, chunk:Chunker.Buffer}|{single: false, chunks: Chunker.Buffer[]}} */ export const close = state => { - const { buffer } = state + const { buffer, config } = state + // flush remaining bytes in the buffer + const { chunks } = split(buffer, config.chunker, state.byteOffset, true) + switch (state.status) { - case "none": - return { - single: true, - chunk: buffer, - } + case "none": { + return chunks.length === 1 + ? { single: true, chunk: chunks[0] } + : { single: false, chunks } + } case "single": { - if (buffer.byteLength > 0) { - return { - single: false, - chunks: [state.chunk, buffer], - } - } else { - return { single: true, chunk: state.chunk } - } + return chunks.length === 0 + ? { single: true, chunk: state.chunk } + : { single: false, chunks: [state.chunk, ...chunks] } } case "multiple": { return { single: false, - chunks: buffer.byteLength > 0 ? [buffer] : EMPTY, + chunks, } } default: @@ -143,22 +160,37 @@ export const close = state => { /** * @param {Chunker.Chunker} chunker - * @param {Uint8Array} input - * @returns {{buffer:Uint8Array, chunks:Uint8Array[]}} + * @param {Chunker.Buffer} input + * @param {number} byteOffset + * @param {boolean} end + * @returns {{buffer:Chunker.Buffer, chunks:Chunker.Buffer[]}} */ -export const split = (input, chunker) => { +export const split = (input, chunker, byteOffset, end) => { let buffer = input - /** @type {Uint8Array[]} */ + /** @type {Chunker.Buffer[]} */ const chunks = [] - const sizes = chunker.cut(chunker.context, buffer) + const sizes = + chunker.type === "Stateful" + ? chunker.cut(chunker.context, buffer.subarray(byteOffset), end) + : chunker.cut(chunker.context, buffer, end) + let offset = 0 for (const size of sizes) { - const chunk = buffer.subarray(offset, offset + size) - chunks.push(chunk) - offset += size + // We may be splitting empty buffer in which case there will be no chunks + // in it so we make sure that we do not emit empty buffer. + if (size > 0) { + const chunk = buffer.subarray(offset, offset + size) + chunks.push(chunk) + offset += size + } } - buffer = buffer.subarray(offset) + buffer = + offset === buffer.byteLength + ? BufferQueue.empty() + : offset === 0 + ? buffer + : buffer.subarray(offset) return { buffer, chunks } } diff --git a/src/file/chunker/api.ts b/src/file/chunker/api.ts index 633e7b7..d3c0cff 100644 --- a/src/file/chunker/api.ts +++ b/src/file/chunker/api.ts @@ -1,7 +1,17 @@ +export interface Buffer { + readonly length: number + readonly byteLength: number + slice(start?: number, end?: number): Buffer + subarray(start?: number, end?: number): Buffer + push(bytes: Uint8Array): Buffer + get(offset: number): number | undefined + copyTo(target: Uint8Array, offset: number): Uint8Array +} + /** * Chunker API can be used to slice up the file content according * to specific logic. It is designed with following properties in mind: - * + * * 1. **Stateless** - Chunker does not retain any state across the calls. This * implies that calling `cut` function on the same bytes would produce same * array of sizes. Do note however, that when chunker is used to chunk stream @@ -31,9 +41,10 @@ export interface ChunkerAPI { * buffer contains no valid chunks. * * **Note:** Consumer of the chunker is responsible for dealing with remaining - * bytes in the buffer when end of the stream is reached. + * bytes in the buffer, that is unless `end` is true signalling chunker that + * end of the stream is reached. */ - cut(context:T, buffer:Uint8Array):number[] + cut(context: T, buffer: Buffer, end?: boolean): Iterable } /** @@ -43,13 +54,12 @@ export interface ChunkerAPI { * previously seen bytes. */ export interface StatefulChunker extends ChunkerAPI { - type: 'Stateful' + type: "Stateful" } -export interface StatelessChunker> extends ChunkerAPI { - type: 'Stateless' +export interface StatelessChunker> + extends ChunkerAPI { + type: "Stateless" } -export type Chunker = - | StatefulChunker - | StatelessChunker +export type Chunker = StatefulChunker | StatelessChunker diff --git a/src/file/chunker/buffer.js b/src/file/chunker/buffer.js new file mode 100644 index 0000000..90345d8 --- /dev/null +++ b/src/file/chunker/buffer.js @@ -0,0 +1,199 @@ +import { Indexed } from "./indexed.js" + +/** + * @extends {Indexed} + */ +class BufferView extends Indexed { + /** + * @param {Uint8Array[]} parts + * @param {number} byteOffset + * @param {number} byteLength + */ + constructor(parts = [], byteOffset = 0, byteLength = 0) { + super() + this.parts = parts + this.byteLength = byteLength + this.byteOffset = byteOffset + } + + get length() { + return this.byteLength + } + *[Symbol.iterator]() { + for (const part of this.parts) { + yield* part + } + } + + /** + * @param {number} start + * @param {number} end + */ + slice(start, end) { + return slice(this, start, end) + } + + /** + * @param {number} start + * @param {number} end + */ + subarray(start, end) { + return slice(this, start, end) + } + + /** + * + * @param {Uint8Array} bytes + */ + push(bytes) { + return push(this, bytes) + } + + /** + * @param {number} n + */ + get(n) { + return get(this, n) + } + + /** + * + * @param {Uint8Array} target + * @param {number} offset + */ + copyTo(target, offset) { + for (const part of this.parts) { + target.set(part, offset) + offset += part.byteLength + } + return target + } +} + +/** @typedef {BufferView} View */ + +/** + * + * @param {BufferView} buffer + * @param {Uint8Array} part + * @returns {BufferView} + */ + +export const push = (buffer, part) => { + if (part.byteLength > 0) { + // We mutate array here but previous buffer is still a view over + // the same data. + buffer.parts.push(part) + return new BufferView( + buffer.parts, + buffer.byteOffset, + buffer.byteLength + part.byteLength + ) + } else { + return buffer + } +} + +/** + * @param {BufferView} buffer + * @param {number} n + */ +export const get = (buffer, n) => { + let offset = 0 + if (n > buffer.byteLength) { + return undefined + } + + for (const part of buffer.parts) { + if (n < offset + part.byteLength) { + return part[n - offset] + } else { + offset += part.byteLength + } + } + return undefined +} + +/** + * @param {BufferView} buffer + * @param {number} n + * @returns {readonly [number, number]} + */ + +const cursor = ({ parts, length }, n) => { + if (n === 0) { + return HEAD + } + + const count = parts.length + let offest = 0 + let index = 0 + while (index < count) { + const part = parts[index] + const nextOffset = offest + part.length + if (n < nextOffset || index === count - 1) { + break + } + offest = nextOffset + index++ + } + + return [index, n - offest] +} + +const HEAD = /** @type {[number, number]} */ (Object.freeze([0, 0])) + +/** + * + * @param {BufferView} buffer + * @param {number} [startOffset] + * @param {number} [endOffset] + * @returns {BufferView} + */ +export const slice = ( + buffer, + startOffset = buffer.byteOffset, + endOffset = buffer.byteLength +) => { + const parts = [] + const start = startOffset < 0 ? buffer.byteLength - startOffset : startOffset + const end = endOffset < 0 ? buffer.byteLength - endOffset : endOffset + + // Empty range + if (start > end || start > buffer.byteLength || end <= 0) { + return new BufferView() + } + + let byteLength = 0 + let offset = 0 + for (const part of buffer.parts) { + const nextOffset = offset + part.byteLength + // Have not found a start yet + if (byteLength === 0) { + if (end <= nextOffset) { + const slice = part.subarray(start - offset, end - offset) + return new BufferView([slice], 0, slice.byteLength) + } else if (start < nextOffset) { + const slice = start === offset ? part : part.subarray(start - offset) + byteLength = slice.byteLength + parts.push(slice) + } + } + // If end offest is in this range + else if (end <= nextOffset) { + const slice = end === nextOffset ? part : part.subarray(0, end - offset) + byteLength += slice.byteLength + parts.push(slice) + return new BufferView(parts, 0, byteLength) + } else { + parts.push(part) + byteLength += part.byteLength + } + + offset = nextOffset + } + + throw new Error("This code should be unreachable") +} + +export const empty = () => new BufferView() diff --git a/src/file/chunker/fixed.js b/src/file/chunker/fixed.js index 6c34eb5..7cc8003 100644 --- a/src/file/chunker/fixed.js +++ b/src/file/chunker/fixed.js @@ -25,11 +25,17 @@ export const withMaxChunkSize = maxChunkSize => ({ /** * @param {FixedSize} maxChunkSize - * @param {Uint8Array} buffer + * @param {API.Buffer} buffer + * @param {boolean} end * @returns {number[]} */ -export const cut = ({ maxChunkSize }, { byteLength }) => { +export const cut = ({ maxChunkSize }, { byteLength }, end) => { // number of fixed size chunks that would fit const n = (byteLength / maxChunkSize) | 0 - return new Array(n).fill(maxChunkSize) + const chunks = new Array(n).fill(maxChunkSize) + const lastChunkSize = end ? byteLength - n * maxChunkSize : 0 + if (lastChunkSize > 0) { + chunks.push(lastChunkSize) + } + return chunks } diff --git a/src/file/chunker/indexed.js b/src/file/chunker/indexed.js new file mode 100644 index 0000000..cdb44c1 --- /dev/null +++ b/src/file/chunker/indexed.js @@ -0,0 +1,20 @@ +function Indexed() {} + +Object.defineProperties(Indexed, { + prototype: { + value: new Proxy(Object.prototype, { + /** + * @param {object} target + * @param {PropertyKey} property + * @param {{get(key:PropertyKey): any}} receiver + */ + get(target, property, receiver) { + return typeof property === "symbol" + ? Reflect.get(target, property, receiver) + : receiver.get(property) + }, + }), + }, +}) + +export { Indexed } diff --git a/src/file/chunker/indexed.ts b/src/file/chunker/indexed.ts new file mode 100644 index 0000000..5d7f7ed --- /dev/null +++ b/src/file/chunker/indexed.ts @@ -0,0 +1,3 @@ +export declare class Indexed { + readonly [n: number]: T +} diff --git a/src/file/chunker/rabin.js b/src/file/chunker/rabin.js index 2274943..5c1b913 100644 --- a/src/file/chunker/rabin.js +++ b/src/file/chunker/rabin.js @@ -1,7 +1,10 @@ - // eslint-disable-next-line no-unused-vars -import * as API from './api.js' -import * as RabinLib from './rabin/rabin-wasm.js' +import * as API from "./api.js" +import * as RabinLib from "./rabin/rabin-wasm.js" + +const AVARAGE = 262144 +const WINDOW = 16 +const POLYNOM = 17437180132763653 /** * @typedef {object} Rabin @@ -13,24 +16,24 @@ import * as RabinLib from './rabin/rabin-wasm.js' * * @typedef {object} RabinConfig * @property {number} avg - * @property {number} [min] - * @property {number} [max] + * @property {number} min + * @property {number} max * @property {number} window * @property {number} polynomial */ /** - * @param {RabinConfig} config + * @param {Partial} config * @returns {Promise>} */ -export const withConfig = async ({ - avg, +export const create = async ({ + avg = AVARAGE, min = avg / 3, - max = avg + (avg / 2), - window, - polynomial -}) => ({ - type: 'Stateful', + max = avg + avg / 2, + window = WINDOW, + polynomial = POLYNOM, +} = {}) => ({ + type: "Stateful", context: await RabinLib.create( Math.floor(Math.log2(avg)), min, @@ -38,7 +41,7 @@ export const withConfig = async ({ window, polynomial ), - cut + cut, }) /** diff --git a/src/file/chunker/rabin.new.js b/src/file/chunker/rabin.new.js new file mode 100644 index 0000000..28272e2 --- /dev/null +++ b/src/file/chunker/rabin.new.js @@ -0,0 +1,85 @@ +import * as API from "./api.js" +import * as Rabin from "rabin-rs" + +const AVARAGE = 262144 +const WINDOW = 16 + +/** + * @typedef {object} Config + * @property {number} avg + * @property {number} min + * @property {number} max + * @property {number} window + * + * @typedef {Rabin.Rabin} Context + */ + +/** + * + * @returns {Config} + */ +export const defaults = () => configure({ avg: AVARAGE }) + +/** + * @param {number} avg + * @returns {RabinConfig} + */ + +/** + * + * @param {Partial & {avg: number}} config + * @returns + */ +export const configure = ({ + avg, + min = (avg / 3) | 0, + max = (avg + avg / 2) | 0, + window = WINDOW, +}) => ({ + avg, + min, + max, + window, +}) + +/** + * @param {Config} config + * @returns {Promise>} + */ +export const create = async (config = defaults()) => ({ + type: "Stateless", + context: await Rabin.create( + Math.floor(Math.log2(config.avg)), + config.min, + config.max, + config.window + ), + cut, +}) + +/** + * @param {BigInt} polynom + * @param {Config} config + * @returns {Promise>} + */ +export const createWithPolynom = async (polynom, config = defaults()) => ({ + type: "Stateless", + context: Object.assign( + await Rabin.createWithPolynom( + polynom, + Math.floor(Math.log2(config.avg)), + config.min, + config.max, + config.window + ), + config + ), + cut, +}) + +/** + * @param {Context} rabin + * @param {Uint8Array} buffer + * @param {boolean} end + */ +export const cut = (rabin, buffer, end) => Rabin.cut(rabin, buffer, end) diff --git a/src/file/layout/api.ts b/src/file/layout/api.ts index 80c345d..b5395ca 100644 --- a/src/file/layout/api.ts +++ b/src/file/layout/api.ts @@ -8,6 +8,7 @@ import type { MultihashHasher, Phantom, } from "../../unixfs.js" +import * as Chunker from "../chunker/api.js" export interface Layout< Options = unknown, @@ -46,7 +47,7 @@ export interface Layout< * DAG node sticking it into `state.nodes`. Importer than will take nodes * that were added and encode them into blocks. */ - write(state: State, leaves: Uint8Array[]): WriteResult + write(state: State, leaves: Chunker.Buffer[]): WriteResult /** * After importer passed all the leaves to builders `write` it will call @@ -76,7 +77,7 @@ export interface Branch { export interface Leaf { id: NodeID - content: Uint8Array + content: Chunker.Buffer children?: void } diff --git a/src/file/layout/balanced.js b/src/file/layout/balanced.js index 1a71392..aa264f8 100644 --- a/src/file/layout/balanced.js +++ b/src/file/layout/balanced.js @@ -1,4 +1,5 @@ import * as Layout from "./api.js" +import * as Chunker from "./../chunker/api.js" /** * Type representing a state of the balanced tree. First row hold leaves coming @@ -138,7 +139,7 @@ export const open = ({ width }) => ({ /** * * @param {Balanced} state - * @param {Uint8Array[]} slices + * @param {Chunker.Buffer[]} slices * @returns {Layout.WriteResult} */ export const write = (state, slices) => { diff --git a/src/file/writer.js b/src/file/writer.js index f0ac63a..2d346ed 100644 --- a/src/file/writer.js +++ b/src/file/writer.js @@ -125,6 +125,7 @@ export const write = (state, bytes) => { // Chunk up provided bytes const chunker = Chunker.append(state.chunker, bytes) const chunks = Chunker.chunks(chunker) + // Pass chunks to layout engine to produce nodes const { nodes, leaves, layout } = state.config.fileLayout.write( state.layout, @@ -260,7 +261,7 @@ const encodeLeaves = (leaves, config) => * @returns {Task.Task} */ const encodeLeaf = function* ({ hasher, createCID }, { id, content }, encoder) { - const bytes = encoder.encode(content) + const bytes = encoder.encode(asUint8Array(content)) const hash = yield* Task.wait(hasher.digest(bytes)) const cid = createCID(encoder.code, hash) @@ -325,3 +326,14 @@ export const writeBlock = function* (blockQueue, block) { } blockQueue.enqueue(block) } + +/** + * + * @param {Uint8Array|Chunker.Buffer} buffer + * @returns + */ + +const asUint8Array = buffer => + buffer instanceof Uint8Array + ? buffer + : buffer.copyTo(new Uint8Array(buffer.byteLength), 0) diff --git a/test/convergence.spec.js b/test/convergence.spec.js index 2f88beb..cffd48f 100644 --- a/test/convergence.spec.js +++ b/test/convergence.spec.js @@ -2,6 +2,7 @@ import { assert } from "chai" import Matrix from "./dataset/convergence_rawdata.js" import * as FileImporter from "../src/file.js" import { parseConfig, unpackFile } from "./matrix.js" +import { CID, collect } from "./util.js" import * as FS from "fs" /** @@ -13,10 +14,11 @@ const createTest = input => * @this {{timeout(ms:number):void}} */ async function test() { - this.timeout(30000) + this.timeout(50000) const config = await parseConfig(input) const { writer, blocks } = FileImporter.createImporter({}, config) const file = await unpackFile(config.url) + collect(blocks) // @ts-expect-error - see https://github.com/DefinitelyTyped/DefinitelyTyped/pull/59057 const stream = /** @type {ReadableStream} */ (file.stream()) @@ -25,7 +27,7 @@ const createTest = input => const read = await reader.read() if (read.done) { const link = await writer.close() - assert.deepEqual(link.cid, config.cid) + assert.deepEqual(toV1(link.cid), config.cid) break } else { writer.write(read.value) @@ -33,14 +35,48 @@ const createTest = input => } } +/** + * @typedef {Matrix[number]} Config + * @param {Config} config + */ + +const isJSRabinTest = config => + config.chunker.startsWith("rabin") && config.impl === "js" + +/** + * @param {Config} config + */ +const isBuzzhashTest = config => config.chunker.startsWith("buzhash") + +/** + * @param {Config} config + */ +const isInlineCIDTest = config => config.inlining > 0 + +/** + * @param {Config} config + */ +const isDisabledTest = config => + isInlineCIDTest(config) || isBuzzhashTest(config) || isJSRabinTest(config) + describe("convergence tests", () => { - for (const input of Matrix.slice(0, 10)) { - const title = `${input.cmd} ${input.source}` + for (const config of Matrix) { + const title = `${ + config.impl === "go" ? "ipfs" : "jsipfs" + } ${config.cmd.trim()} ${config.source}` + const test = createTest(config) - if (input.inlining > 0) { - it.skip(title, createTest(input)) + if (isDisabledTest(config)) { + it.skip(title, test) } else { - it(title, createTest(input)) + it(title, test) } } }) + +/** + * @param {import('../src/unixfs').CID} cid + */ + +const toV1 = cid => + cid.version === 0 ? CID.createV1(cid.code, cid.multihash) : cid diff --git a/test/file.spec.js b/test/file.spec.js index 91f5bf8..32a5445 100644 --- a/test/file.spec.js +++ b/test/file.spec.js @@ -1,13 +1,13 @@ /* eslint-env mocha */ import { expect, assert } from "chai" -import { encodeUTF8, File, CID } from "./util.js" +import { encodeUTF8, File, CID, hashrecur, collect } from "./util.js" import * as unixfs from "../src/lib.js" import * as FileImporter from "../src/file.js" import * as Trickle from "../src/file/layout/trickle.js" import * as Balanced from "../src/file/layout/balanced.js" import * as FixedSize from "../src/file/chunker/fixed.js" -import * as Rabin from "../src/file/chunker/rabin.js" +import * as Rabin from "../src/file/chunker/rabin.new.js" import * as API from "../src/file/api.js" import * as RawLeaf from "multiformats/codecs/raw" import * as UnixFS from "../src/unixfs.js" @@ -15,7 +15,7 @@ import { sha256 } from "multiformats/hashes/sha2" import * as FS from "fs" const CHUNK_SIZE = 262144 -describe("test file importer", () => { +describe("test file", () => { it("basic file", async function () { this.timeout(30000) const content = encodeUTF8("this file does not have much content\n") @@ -93,11 +93,12 @@ describe("test file importer", () => { ) }) - it.only("--chunker=size-65535 --trickle=false --raw-leaves=false --cid-version=1", async () => { + it("--chunker=size-65535 --trickle=false --raw-leaves=false --cid-version=1", async () => { + const chunkSize = 65535 const { writer, blocks } = FileImporter.createImporter( {}, { - chunker: FixedSize.withMaxChunkSize(65535), + chunker: FixedSize.withMaxChunkSize(chunkSize), fileChunkEncoder: FileImporter.UnixFSLeaf, smallFileEncoder: FileImporter.UnixFSLeaf, fileLayout: Balanced, @@ -107,7 +108,7 @@ describe("test file importer", () => { } ) - const size = Math.round(65535 * 2.2) + const size = Math.round(chunkSize * 2.2) const FRAME = Math.round(size / 10) let offset = 0 let n = 0 @@ -126,4 +127,31 @@ describe("test file importer", () => { dagByteLength: 144372, }) }) + + it("chunks with rabin chunker", async function () { + this.timeout(30000) + const content = hashrecur({ + byteLength: CHUNK_SIZE * 2, + }) + const chunker = await Rabin.create() + + const { writer, ...importer } = FileImporter.createImporter( + {}, + FileImporter.configure({ chunker }) + ) + const collector = collect(importer.blocks) + + for await (const slice of content) { + writer.write(slice) + } + const link = await writer.close() + const blocks = await collector + + assert.deepEqual( + link.cid, + CID.parse("bafybeicj5kf4mohavbbh4j5izwy3k23cysewxfhgtmlaoxq6sewx2tsr7u") + ) + + assert.deepEqual((await blocks).length, 4) + }) }) diff --git a/test/file/chunker/buffer.spec.js b/test/file/chunker/buffer.spec.js new file mode 100644 index 0000000..6d9d939 --- /dev/null +++ b/test/file/chunker/buffer.spec.js @@ -0,0 +1,44 @@ +/* eslint-env mocha */ +import { expect, assert } from "chai" +import * as BufferQueue from "../../../src/file/chunker/buffer.js" + +describe("chunker buffer", () => { + it("concat two uint8arrays", () => { + const buffer = BufferQueue.empty() + .push(new Uint8Array(12).fill(1)) + .push(new Uint8Array(8).fill(2)) + + assert.equal(buffer.byteLength, 20) + assert.equal(buffer.length, 20) + + assert.equal(buffer[0], 1) + assert.equal(buffer[1], 1) + assert.equal(buffer[12], 2) + assert.equal(buffer[19], 2) + assert.equal(buffer[20], undefined) + }) + + it("slice", () => { + const buffer = BufferQueue.empty() + .push(new Uint8Array(12).fill(1)) + .push(new Uint8Array(8).fill(2)) + + const s0 = buffer.slice(0, 0) + assert.equal(s0.byteLength, 0) + + const s1 = buffer.slice(0, 3) + assert.deepEqual(s1.byteLength, 3) + assert.deepEqual([...s1], [1, 1, 1]) + + const s2 = buffer.slice(3, 13) + assert.equal(s2.byteLength, 10) + assert.deepEqual([...s2], [1, 1, 1, 1, 1, 1, 1, 1, 1, 2]) + + const s3 = [ + ...new Uint8Array(12).fill(1), + ...new Uint8Array(8).fill(2), + ].slice(3, 13) + + assert.deepEqual([...s3], [...s2]) + }) +}) diff --git a/test/matrix.js b/test/matrix.js index 9278c40..ee6b606 100644 --- a/test/matrix.js +++ b/test/matrix.js @@ -2,7 +2,7 @@ import { CID, File, fetch } from "./util.js" import * as Trickle from "../src/file/layout/trickle.js" import * as Balanced from "../src/file/layout/balanced.js" import * as FixedSize from "../src/file/chunker/fixed.js" -import * as Rabin from "../src/file/chunker/rabin.js" +import * as Rabin from "../src/file/chunker/rabin.new.js" import * as API from "../src/file/api.js" import { UnixFSLeaf } from "../src/file.js" import * as RawLeaf from "multiformats/codecs/raw" @@ -81,14 +81,15 @@ const parseChunker = input => { .split("-") .map(n => parseInt(n)) - return Rabin.withConfig({ - avg, - min, - max, - window: 64, - // @see https://github.com/hugomrdias/rabin-wasm/blob/f0cf7ce248a268cc65c389ece6882df25f92fc02/assembly/index.ts#L144 - polynomial: 0x3da3358b4dc173, - }) + return Rabin.create( + Rabin.configure({ + avg, + min, + max, + }) + ) + } else if (input === "rabin") { + return Rabin.create() } else { throw new Error(`Unknown chunker ${input}`) } diff --git a/test/util.js b/test/util.js index 73c33a2..3218c4f 100644 --- a/test/util.js +++ b/test/util.js @@ -1,6 +1,7 @@ import { File } from "@web-std/file" import { CID } from "multiformats" import { fetch } from "@web-std/fetch" +import { sha256 } from "multiformats/hashes/sha2" const utf8Encoder = new TextEncoder() @@ -9,4 +10,48 @@ const utf8Encoder = new TextEncoder() */ export const encodeUTF8 = input => utf8Encoder.encode(input) +/** + * Utility function to generate deterministic garbage by hashing seed input, + * then recursively hashing the product until total number of bytes yield + * reaches `byteLength` (by default it is inifinity). + * + * @param {object} [options] + * @param {Uint8Array} [options.seed] + * @param {number} [options.byteLength] + */ +export async function* hashrecur({ + seed = encodeUTF8("hello world"), + byteLength = Infinity, +} = {}) { + let value = seed + let byteOffset = 0 + while (true) { + value = await sha256.encode(value) + const size = byteLength - byteOffset + if (size < value.byteLength) { + yield value.slice(0, size) + break + } else { + byteOffset += value.byteLength + yield value + } + } +} + +/** + * @param {ReadableStream} blockQueue + */ +export const collect = async blockQueue => { + const blocks = [] + const reader = blockQueue.getReader() + while (true) { + const next = await reader.read() + if (next.done) { + return blocks + } else { + blocks.push(next.value) + } + } +} + export { File, CID, fetch } diff --git a/yarn.lock b/yarn.lock index 2b82ff9..1d1655b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -16,7 +16,7 @@ "@assemblyscript/loader@^0.9.4": version "0.9.4" - resolved "https://registry.npmjs.org/@assemblyscript/loader/-/loader-0.9.4.tgz" + resolved "https://registry.yarnpkg.com/@assemblyscript/loader/-/loader-0.9.4.tgz#a483c54c1253656bb33babd464e3154a173e1577" integrity sha512-HazVq9zwTVwGmqdwYzu7WyQ6FQVZ7SwET0KKQuKm55jD0IfUpZgN0OPIiZG3zV1iSrVYcN0bdwLRXI/VNCYsUA== "@babel/code-frame@^7.16.7": @@ -1979,7 +1979,7 @@ nanoid@^3.1.30: node-fetch@^2.6.1: version "2.6.7" - resolved "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.7.tgz" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad" integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ== dependencies: whatwg-url "^5.0.0" @@ -2353,9 +2353,14 @@ queue-microtask@^1.2.2: resolved "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz" integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A== +rabin-rs@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/rabin-rs/-/rabin-rs-2.0.0.tgz#524ab65c973a9988b8e13f4cca995eb6daf2ea22" + integrity sha512-O79/sGci9PtyADBdL8bs142FuoL2NrK2HjSDHseFYW8RiDfdqQy0T5J/RiczK91RKgVCDlNiYHFRXR4/QnUo+w== + rabin-wasm@^0.1.5: version "0.1.5" - resolved "https://registry.npmjs.org/rabin-wasm/-/rabin-wasm-0.1.5.tgz" + resolved "https://registry.yarnpkg.com/rabin-wasm/-/rabin-wasm-0.1.5.tgz#5b625ca007d6a2cbc1456c78ae71d550addbc9c9" integrity sha512-uWgQTo7pim1Rnj5TuWcCewRDTf0PEFTSlaUjWP4eY9EbLV9em08v89oCz/WO+wRxpYuO36XEHp4wgYQnAgOHzA== dependencies: "@assemblyscript/loader" "^0.9.4" @@ -2775,7 +2780,7 @@ totalist@^1.0.0: tr46@~0.0.3: version "0.0.3" - resolved "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz" + resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" integrity sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o= trouter@^2.0.1: @@ -2893,12 +2898,12 @@ web-streams-polyfill@^3.1.1: webidl-conversions@^3.0.0: version "3.0.1" - resolved "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz" + resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" integrity sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE= whatwg-url@^5.0.0: version "5.0.0" - resolved "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz" + resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" integrity sha1-lmRU6HZUYuN2RNNib2dCzotwll0= dependencies: tr46 "~0.0.3"