feat(core): support hard links to dedupe data
blacha committed Nov 11, 2021
1 parent d1a1f2a commit ba573a1
Showing 2 changed files with 22 additions and 14 deletions.
32 changes: 20 additions & 12 deletions packages/core/src/binary/binary.index.builder.ts
@@ -35,35 +35,43 @@ export const CotarIndexBuilder = {
     logger?: LogType,
   ): Promise<TarIndexResult> {
     if (typeof getBytes !== 'function') getBytes = TarReader.toFileReader(getBytes);
-    let fileCount = 0;
     let currentTime = Date.now();
     const files = [];
 
     // Loop over every file in the tar archive create a hash and validating there are no collisions
     const hashSeen = new Map();
     for await (const ctx of TarReader.iterate(getBytes)) {
-      if (ctx.header.type !== TarReader.Type.File) continue;
-      fileCount++;
-      const hash = CotarIndex.hash(ctx.header.path);
-      if (hashSeen.has(hash)) {
-        throw new Error('HashCollision:' + hashSeen.get(hash) + ' and ' + ctx.header.path);
+      if (ctx.header.type === TarReader.Type.HardLink) {
+        const hash = CotarIndex.hash(ctx.header.linkName);
+        const target = hashSeen.get(hash);
+        if (target == null) throw new Error('Link to unknown file: ' + ctx.header.linkName);
+        files.push({ ...target, hash: CotarIndex.hash(ctx.header.path) });
+      } else if (ctx.header.type === TarReader.Type.File) {
+        // console.log(ctx.header.path);
+        const hash = CotarIndex.hash(ctx.header.path);
+        const fileObj = { hash, path: ctx.header.path, offset: ctx.offset, size: ctx.header.size, index: -1 };
+        if (hashSeen.has(hash)) {
+          throw new Error('HashCollision:' + hashSeen.get(hash).path + ' and ' + ctx.header.path);
+        } else {
+          hashSeen.set(hash, fileObj);
+        }
+        files.push(fileObj);
       } else {
-        hashSeen.set(hash, ctx.header.path);
+        continue;
       }
-      files.push({ hash, path: ctx.header.path, offset: ctx.offset, size: ctx.header.size, index: -1 });
 
-      if (fileCount % 25_000 === 0 && logger != null) {
+      if (files.length % 25_000 === 0 && logger != null) {
         const duration = Date.now() - currentTime;
         currentTime = Date.now();
-        logger.debug({ current: fileCount, duration }, 'Cotar.Index:ReadTar');
+        logger.debug({ current: files.length, duration }, 'Cotar.Index:ReadTar');
       }
     }
     hashSeen.clear();
 
     const packingFactor = opts?.packingFactor ?? TarReader.PackingFactor;
-    const slotCount = Math.ceil(fileCount * packingFactor);
+    const slotCount = Math.ceil(files.length * packingFactor);
     const outputBuffer = Buffer.alloc(IndexSize + IndexRecordSize * slotCount);
-    logger?.debug({ slotCount, fileCount }, 'Cotar.index:Allocate');
+    logger?.debug({ slotCount, fileCount: files.length }, 'Cotar.index:Allocate');
 
     // Allocate the hash slots for the files
     currentTime = Date.now();
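The upshot of this change: when a tar archive stores duplicate blobs as hard links, each link costs only an extra index record, not a second copy of the bytes. The link's path is hashed like any other entry, but its record reuses the offset and size of the file it points at (so the target has to appear in the archive before any link to it). Below is a rough standalone sketch of that idea, under assumed names: Entry, IndexRecord, buildRecords and hashPath are illustrative only, not cotar's API; the real builder uses CotarIndex.hash and TarReader.iterate as in the diff above.

// Minimal sketch of the dedupe logic: hard links reuse the target file's byte range.
type Entry =
  | { type: 'file'; path: string; offset: number; size: number }
  | { type: 'hardlink'; path: string; linkName: string };

interface IndexRecord { hash: bigint; offset: number; size: number }

function hashPath(path: string): bigint {
  // Placeholder FNV-1a style hash for the sketch; the real builder uses CotarIndex.hash.
  let h = 0xcbf29ce484222325n;
  for (const c of path) h = ((h ^ BigInt(c.charCodeAt(0))) * 0x100000001b3n) & 0xffffffffffffffffn;
  return h;
}

function buildRecords(entries: Entry[]): IndexRecord[] {
  const seen = new Map<bigint, IndexRecord>();
  const records: IndexRecord[] = [];
  for (const e of entries) {
    if (e.type === 'file') {
      const rec = { hash: hashPath(e.path), offset: e.offset, size: e.size };
      if (seen.has(rec.hash)) throw new Error('HashCollision: ' + e.path);
      seen.set(rec.hash, rec);
      records.push(rec);
    } else {
      // Hard link: look up the already-seen target and copy its offset/size under a new hash.
      const target = seen.get(hashPath(e.linkName));
      if (target == null) throw new Error('Link to unknown file: ' + e.linkName);
      records.push({ ...target, hash: hashPath(e.path) });
    }
  }
  return records;
}

// Example: b.png is a hard link to a.png; both records point at the same 1234-byte range.
const records = buildRecords([
  { type: 'file', path: 'tiles/a.png', offset: 512, size: 1234 },
  { type: 'hardlink', path: 'tiles/b.png', linkName: 'tiles/a.png' },
]);

Running buildRecords over those two entries yields two records with different hashes but the same offset and size, which is how the index lets tiles/b.png resolve to bytes stored only once for tiles/a.png.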
4 changes: 2 additions & 2 deletions packages/core/src/tar.ts
@@ -32,7 +32,7 @@ export const TarHeader = bp.object('TarHeader', {
   skip2: bp.skip(20),
   // unk1: bp.skip(8),
   type: bp.u8,
-  // linkName: bp.string(100),
+  linkName: bp.string(100),
   // magic: bp.string(6),
   // version: bp.bytes(2),
   // uName: bp.string(32),
@@ -72,7 +72,7 @@ export const TarReader = {
       if (TarType[head.type] == null) {
         throw new Error('Unknown header @ ' + toHex(ctx.offset) + ' type:' + head.type);
       }
-      if (head.type === TarType.File) yield { header: head, offset: ctx.offset };
+      yield { header: head, offset: ctx.offset };
 
       ctx.offset += head.size;
     }
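The tar.ts changes are what make the link data available: TarReader.iterate now yields every entry type (the File-only filter moves up into the index builder), and the previously commented-out linkName field is parsed so a hard link's target path can be hashed. In the ustar header layout the 1-byte type flag sits at byte offset 156 ('0' regular file, '1' hard link, '2' symlink, '5' directory) and the NUL-padded 100-byte link name follows at offset 157. A small sketch of reading those two fields straight from a raw 512-byte header block, independent of the bp schema above; readLinkFields is a hypothetical helper, not part of cotar:

import { Buffer } from 'node:buffer';

// Sketch only: ustar field offsets — typeflag at byte 156, linkname at bytes 157..256 (NUL padded).
function readLinkFields(header: Buffer): { type: string; linkName: string } {
  const type = String.fromCharCode(header[156]); // '0' file, '1' hard link, '2' symlink, '5' directory
  const linkName = header.subarray(157, 257).toString('utf8').replace(/\0.*$/, '');
  return { type, linkName };
}

// Given a 512-byte header block read from the tar, a hard link entry reports type '1'
// and linkName set to the path of the file whose bytes it shares.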
