feat(core): create version 2 header switching to uint32 for block offset and file size
blacha committed Feb 17, 2022
1 parent 04cd0c8 commit d0ea49b
Showing 9 changed files with 117 additions and 62 deletions.
38 changes: 20 additions & 18 deletions README.md
@@ -27,19 +27,31 @@ This makes it very easy to add new files to a archive as more files can just be
![TarFileBackground](./static/TarFileBackground.png)

### Tar Index
TAR Index (.cotar.index) is a binary file containing the location and size of a file inside of a tar. with this index a tar file can be randomly read.
TAR Index (.index) is a binary file containing the location and size of every file inside a tar. With this index, a tar file can be read randomly.

![TarFileIndex](./static/TarFileIndex.png)
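As a rough sketch of how a lookup against the index works (assuming the v2 record layout described below; the real implementation lives in `packages/core/src/binary/binary.index.ts`):

```typescript
import fnv1a from '@sindresorhus/fnv1a';

const IndexHeaderSize = 8; // 3 byte magic 'COT', 1 byte version, 4 byte record count
const IndexV2RecordSize = 16; // 8 byte hash, 4 byte block offset, 4 byte size

/** Simplified sketch: hash the path, jump to `hash % slotCount`, then linear probe. */
function findFile(index: Buffer, slotCount: number, path: string): { offset: number; size: number } | null {
  const hash = fnv1a(path, { size: 64 });
  let slot = Number(hash % BigInt(slotCount));
  for (let i = 0; i < slotCount; i++) {
    const at = IndexHeaderSize + slot * IndexV2RecordSize;
    const slotHash = index.readBigUInt64LE(at);
    // Matching hash: read the block offset and size for this file
    if (slotHash === hash) return { offset: index.readUInt32LE(at + 8) * 512, size: index.readUInt32LE(at + 12) };
    // Empty slot: the file is not in the archive
    if (slotHash === BigInt(0)) return null;
    slot = (slot + 1) % slotCount;
  }
  return null;
}
```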


### Limits

- Internal file count is limited to `uint32`, which limits the tar to ~4 billion files
- Internal file sizes are limited to `uint32`, which stops the indexer from indexing files inside the tar that are over 4GB in size
- Internal file block offsets are limited to `uint32`; however, tars are block aligned at 512 byte intervals, so the final tar size is limited to ~2TB
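
A quick back-of-the-envelope check of where these limits come from (illustrative only):

```typescript
const maxUint32 = 2 ** 32 - 1; // 4,294,967,295
const tarBlockSize = 512; // tar data is aligned to 512 byte blocks

const maxFileCount = maxUint32; // ~4.29 billion index records
const maxFileSize = maxUint32; // ~4.3GB per internal file
const maxTarSize = maxUint32 * tarBlockSize; // ~2.2TB of addressable tar
```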


### V1 To V2

V1 used 24 bytes per index record: a uint64 each for the hash, file offset and file size. This was inefficient, as most files inside the archives are < 100KB in size.

V2 moves to a uint32 for offsets and changes from a raw byte offset in the file to a block offset. Tar files are block aligned at 512 byte intervals, so the raw offset is `const rawOffset = blockOffset * 512`; this limits the tar to ~2TB in size.

V2 also moves to a uint32 for file size, which limits internal files to 4GB in size.
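
A minimal sketch of packing a single v2 record (mirroring the writes in `packages/core/src/binary/binary.index.builder.ts`; the helper name is illustrative):

```typescript
import fnv1a from '@sindresorhus/fnv1a';

const IndexHeaderSize = 8; // 'COT' + version + uint32 record count
const IndexV2RecordSize = 16; // 8 byte hash + 4 byte block offset + 4 byte size

/** Illustrative helper: write one v2 index record into a pre-allocated buffer. */
function writeV2Record(buf: Buffer, slot: number, path: string, byteOffset: number, size: number): void {
  const at = IndexHeaderSize + slot * IndexV2RecordSize;
  buf.writeBigUInt64LE(fnv1a(path, { size: 64 }), at); // uint64 path hash
  buf.writeUInt32LE(byteOffset / 512, at + 8); // uint32 block offset (tar entries are 512 byte aligned)
  buf.writeUInt32LE(size, at + 12); // uint32 file size
}
```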

### Performance

Performance regression is monitored with [hyperfine-action](https://github.com/blacha/hyperfine-action), with results hosted on GitHub Pages at [benchmarks.html](https://linz.github.io/cotar/benchmarks.html).

#### Questions:
**Offset size `uint32` vs `uint64`**
Large files need large offsets: 64 bit offsets give huge file support but need quite a few more bytes per record to store than `uint32`; for smaller tars a `uint32` or even a `uint16` may be enough.
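
For a sense of scale (illustrative arithmetic only), the extra 8 bytes per record of uint64 offsets and sizes add up quickly on large archives:

```typescript
// Index size for 2,000,000 files, ignoring the packing factor and the 8 byte header/footer.
const fileCount = 2_000_000;
const v1IndexBytes = fileCount * 24; // 48MB: uint64 hash + uint64 offset + uint64 size
const v2IndexBytes = fileCount * 16; // 32MB: uint64 hash + uint32 block offset + uint32 size
```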

**Hash size**
The type of the hash could be changed, as well as the number of bits of the hash that are used, based on how unique the file hashes are; a uint64 hash is mostly wasted on a tar file containing 100 files.
Conversely, a tar file containing 2,000,000 files needs a hash much larger than 16 bits.
@@ -49,20 +61,6 @@
Any hash type could be used, `farmhash` or even `sha256`, with the bits then sliced down to the number needed for the hash index.
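
As a hedged sketch of what slicing the hash down could look like (the helper below is hypothetical and not part of the current index format):

```typescript
import fnv1a from '@sindresorhus/fnv1a';

/** Hypothetical: compute a 64 bit fnv1a hash and keep only the low `bits` bits. */
function truncatedHash(path: string, bits: number): bigint {
  const full = fnv1a(path, { size: 64 });
  return full & ((1n << BigInt(bits)) - 1n);
}

truncatedHash('tiles/1/1/1.pbf.gz', 32); // a 4 byte hash per record instead of 8
```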


**Configuring the record size**
Based on the offset size, hash size and type, these could be configured in the index's header/footer by storing the number of bytes used for offset/hash/size as variables in the header.
This would slightly increase the index size, but the main issue is that it adds to the complexity of reading the file.

For example, the next generation header could look like:
```
Magic: "COT"
version: 0x02
count: 0x72365123 // uint32 for record count (Limited to ~4 billion files)
offset: 0x04 // 4 bytes for offset (uint32)
size: 0x02 // 2 byte for size (uint16)
hash: 0x08 // 8 bytes for hash (uint64)
```

## Future investigation

1. Zip files
@@ -79,4 +77,8 @@ See: https://github.com/tapalcatl/tapalcatl-2-spec

Having a single tar file greatly simplifies the distribution of the files. It would be quite simple to tar both the index (.tar.index) and the data tar into another tar to combine them into a single distribution.

> 2022-01 done
3. Use AWS S3's response-encoding to decompress internal gzipped content on the fly


2 changes: 1 addition & 1 deletion packages/core/src/__benchmark__/tar.validate.ts
@@ -10,7 +10,7 @@ async function main(): Promise<void> {
const res = await CotarIndexBuilder.create(fd);

const source = new SourceMemory('Memory', res.buffer);
const cotarIndex = new CotarIndexBinary(source, res.count);
const cotarIndex = new CotarIndexBinary(source, { version: 2, count: res.count, magic: 'COT' });

for (let i = 0; i < 50; i++) {
await TarReader.validate(fd, cotarIndex);
33 changes: 24 additions & 9 deletions packages/core/src/binary/__test__/binary.header.test.ts
@@ -4,11 +4,19 @@ import { CotarMetadataParser } from '../binary.index.js';

const Example = {
v1: {
buf: Buffer.from('Q09UAdIClkk=', 'base64'),
buf: Buffer.from('Q09UAtIClkk=', 'base64'),
header: {
magic: 'COT',
count: 1234567890,
version: 1,
version: 2,
},
},
v2: {
buf: Buffer.from('Q09UAtIClkk=', 'base64'),
header: {
magic: 'COT',
count: 1234567890,
version: 2,
},
},
};
@@ -18,9 +26,9 @@ o.spec('CotarBinaryHeaderFooter', () => {
const header = Buffer.alloc(8);
const count = 1234567890;
header.write('COT', 0);
header.writeUInt8(1, 3);
header.writeUInt8(2, 3);
header.writeUInt32LE(count, 4);
o(header.toString('base64')).equals(Example.v1.buf.toString('base64'));
o(header.toString('base64')).equals(Example.v2.buf.toString('base64'));
});

o('should parse v1 header', () => {
@@ -29,19 +37,26 @@
o(header.value).deepEquals(Example.v1.header);
});

o('should parse v2 header', () => {
const header = CotarMetadataParser.read(Example.v2.buf);
o(header.offset).equals(8);
o(header.value).deepEquals(Example.v2.header);
});

o('should write a header and a footer', () => {
const buf = Buffer.alloc(32);
writeHeaderFooter(buf, Example.v1.header.count);
writeHeaderFooter(buf, Example.v2.header.count);

const buf64 = buf.toString('base64');
console.log(buf64);
// Should start and end with the same data
o(buf64.startsWith('Q09UAdIClkk')).equals(true);
o(buf64.endsWith('Q09UAdIClkk=')).equals(true);
o(buf64.startsWith('Q09UAtIClkk')).equals(true);
o(buf64.endsWith('Q09UAtIClkk=')).equals(true);

const headStart = CotarMetadataParser.read(buf);
const headEnd = CotarMetadataParser.read(buf, buf.length - 8);

o(headStart.value).deepEquals(Example.v1.header);
o(headEnd.value).deepEquals(Example.v1.header);
o(headStart.value).deepEquals(Example.v2.header);
o(headEnd.value).deepEquals(Example.v2.header);
});
});
29 changes: 14 additions & 15 deletions packages/core/src/binary/__test__/binary.test.ts
@@ -11,59 +11,58 @@ import { Cotar } from '../../cotar.js';
import { TarReader } from '../../tar.js';
import { CotarIndexBuilder, writeHeaderFooter } from '../binary.index.builder.js';
import { CotarIndex, toNumber } from '../binary.index.js';
import { IndexHeaderSize, IndexRecordSize } from '../format.js';
import { IndexHeaderSize, IndexV2RecordSize } from '../format.js';
const __dirname = path.dirname(url.fileURLToPath(import.meta.url));

function abToChar(buf: ArrayBuffer | null, offset: number): string | null {
if (buf == null) return null;
return String.fromCharCode(new Uint8Array(buf)[offset]);
}

const ExpectedRecord =
'Q09UAQQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAmYwdbtIi0pwAAAAAAAAAAAQAAAAAAAAC/I5YiYFMqNwQAAAAAAAAABAAAAAAAAABDT1QBBAAAAA==';
const ExpectedRecordV2 =
'Q09UAgQAAAB0wPmDP22WfQIAAAAIAAAAAAAAAAAAAAAAAAAAAAAAACZjB1u0iLSnAAAAAAEAAAC/I5YiYFMqNwEAAAAEAAAAQ09UAgQAAAA=';

o.spec('CotarBinary.fake', () => {
o('should load a tile from fake index', async () => {
// Manually create a fake binary index
const files = [
{ path: 'tiles/0/0/0.pbf.gz', offset: 0, size: 1 },
{ path: 'tiles/1/1/1.pbf.gz', offset: 4, size: 4 },
{ path: 'tiles/1/1/1.pbf.gz', offset: 512, size: 4 },
{ path: 'tiles/1/1/2.pbf.gz', offset: 1024, size: 8 },
];

const indexSize = 4;
const tarIndex: Buffer = Buffer.alloc(indexSize * IndexRecordSize + IndexHeaderSize * 2);
const tarIndex: Buffer = Buffer.alloc(indexSize * IndexV2RecordSize + IndexHeaderSize * 2);

for (const record of files) {
const hash = fnv1a(record.path, { size: 64 });
const index = Number(hash % BigInt(indexSize));
const offset = index * IndexRecordSize + IndexHeaderSize;
const offset = index * IndexV2RecordSize + IndexHeaderSize;
tarIndex.writeBigUInt64LE(hash, offset);
tarIndex.writeBigUInt64LE(BigInt(record.offset), offset + 8);
tarIndex.writeBigUInt64LE(BigInt(record.size), offset + 16);
tarIndex.writeUInt32LE(record.offset / 512, offset + 8);
tarIndex.writeUInt32LE(record.size, offset + 12);
}
tarIndex.writeUInt32LE(indexSize);

writeHeaderFooter(tarIndex, indexSize);

o(tarIndex.toString('base64')).equals(ExpectedRecord);
console.log(tarIndex.toString('base64'));

o(tarIndex.toString('base64')).equals(ExpectedRecordV2);

const cotar = new Cotar(
new SourceMemory('Tar', Buffer.from('0123456789')),
await CotarIndex.create(new SourceMemory('index', tarIndex)),
);

o(await cotar.index.find('tiles/0/0/0.pbf.gz')).deepEquals({ offset: 0, size: 1 });
o(await cotar.index.find('tiles/1/1/1.pbf.gz')).deepEquals({ offset: 4, size: 4 });
o(await cotar.index.find('tiles/1/1/1.pbf.gz')).deepEquals({ offset: 512, size: 4 });
o(await cotar.index.find('tiles/1/1/2.pbf.gz')).deepEquals({ offset: 1024, size: 8 });
o(await cotar.index.find('tiles/1/1/3.pbf.gz')).equals(null);

const tile0 = await cotar.get('tiles/0/0/0.pbf.gz');
o(tile0).notEquals(null);
o(abToChar(tile0, 0)).equals('0');

const tile1 = await cotar.get('tiles/1/1/1.pbf.gz');
o(tile1).notEquals(null);
o(tile1!.byteLength).equals(4);
o(abToChar(tile1, 0)).equals('4');
});
});

12 changes: 6 additions & 6 deletions packages/core/src/binary/binary.index.builder.ts
@@ -2,7 +2,7 @@ import { LogType, SourceMemory } from '@chunkd/core';
import { AsyncFileDescriptor, AsyncFileRead, AsyncReader, TarIndexResult } from '../tar.index.js';
import { TarReader } from '../tar.js';
import { CotarIndex } from './binary.index.js';
import { IndexHeaderSize, IndexMagic, IndexRecordSize, IndexSize, IndexVersion } from './format.js';
import { IndexHeaderSize, IndexMagic, IndexV2RecordSize, IndexSize, IndexVersion } from './format.js';

/** Write the header/footer into the buffer */
export function writeHeaderFooter(output: Buffer, count: number): void {
@@ -74,7 +74,7 @@ export const CotarIndexBuilder = {

const packingFactor = opts?.packingFactor ?? TarReader.PackingFactor;
const slotCount = Math.ceil(files.length * packingFactor);
const outputBuffer = Buffer.alloc(IndexSize + IndexRecordSize * slotCount);
const outputBuffer = Buffer.alloc(IndexSize + IndexV2RecordSize * slotCount);
logger?.debug({ slotCount, fileCount: files.length }, 'Cotar.index:Allocate');

// Allocate the hash slots for the files
@@ -103,7 +103,7 @@ export const CotarIndexBuilder = {
let searchCount = 0;
while (true) {
if (index >= slotCount) index = 0;
if (outputBuffer.readBigUInt64LE(index * IndexRecordSize + IndexHeaderSize) === Big0) break;
if (outputBuffer.readBigUInt64LE(index * IndexV2RecordSize + IndexHeaderSize) === Big0) break;
searchCount++;
index++;

@@ -117,10 +117,10 @@
}
biggestSearch = Math.max(biggestSearch, searchCount);

const offset = index * IndexRecordSize + IndexHeaderSize;
const offset = index * IndexV2RecordSize + IndexHeaderSize;
outputBuffer.writeBigUInt64LE(BigInt(file.hash), offset);
outputBuffer.writeBigUInt64LE(BigInt(file.offset), offset + 8); // TODO write uint8/16/24 based off size
outputBuffer.writeBigUInt64LE(BigInt(file.size), offset + 16);
outputBuffer.writeUInt32LE(file.offset / 512, offset + 8); // Tar files are block aligned to 512 bytes
outputBuffer.writeUInt32LE(file.size, offset + 12);

if (i > 0 && i % 100_000 === 0 && logger != null) {
const duration = Date.now() - currentTime;
53 changes: 45 additions & 8 deletions packages/core/src/binary/binary.index.ts
@@ -2,7 +2,7 @@ import { ChunkSource, LogType } from '@chunkd/core';
import fnv1a from '@sindresorhus/fnv1a';
import { bp, StrutInfer } from 'binparse';
import { CotarIndexRecord } from '../cotar.js';
import { IndexHeaderSize, IndexMagic, IndexRecordSize, IndexVersion } from './format.js';
import { IndexHeaderSize, IndexMagic, IndexV2RecordSize, IndexV1RecordSize, IndexVersion } from './format.js';

const Big0 = BigInt(0);

@@ -20,12 +20,12 @@ export class CotarIndex {

/** Should the metadata be read from the header or the footer */
isHeader = true;
size: number;
metadata: CotarMetadata;

constructor(source: ChunkSource, size: number, sourceOffset = 0) {
constructor(source: ChunkSource, metadata: CotarMetadata, sourceOffset = 0) {
this.source = source;
this.sourceOffset = sourceOffset;
this.size = size;
this.metadata = metadata;
}

static hash(path: string): bigint {
@@ -58,7 +58,7 @@

static async create(source: ChunkSource, sourceOffset = 0, isHeader = true): Promise<CotarIndex> {
const metadata = await this.getMetadata(source, sourceOffset, isHeader);
return new CotarIndex(source, metadata.count, sourceOffset);
return new CotarIndex(source, metadata, sourceOffset);
}

/**
@@ -67,16 +67,53 @@
* @returns the index if found, null otherwise
*/
async find(fileName: string, logger?: LogType): Promise<CotarIndexRecord | null> {
if (this.metadata.version === 1) return this._findV1(fileName, logger);
if (this.metadata.version === 2) return this._findV2(fileName, logger);
throw new Error('Invalid metadata version');
}

async _findV2(fileName: string, logger?: LogType): Promise<CotarIndexRecord | null> {
const hash = CotarIndex.hash(fileName);

const slotCount = this.metadata.count;
const startIndex = Number(hash % BigInt(slotCount));
let startHash: BigInt | null = null;

let index = startIndex;
while (true) {
const offset = this.sourceOffset + index * IndexV2RecordSize + IndexHeaderSize;
await this.source.loadBytes(offset, IndexV2RecordSize, logger);
startHash = this.source.getBigUint64(offset);

// Found the file
if (startHash === hash) {
// Tar offsets are block aligned to 512byte blocks
const fileOffset = this.source.getUint32(offset + 8) * 512;
const fileSize = this.source.getUint32(offset + 12);
return { offset: fileOffset, size: fileSize };
}
// Found a gap in the hash table (file doesnt exist)
if (startHash === Big0) return null;

index++;
// Loop around if we hit the end of the hash table
if (index >= slotCount) index = 0;
if (index === startIndex) return null;
}
}

// TODO(2022-02) this should be removed once we migrate from v1
async _findV1(fileName: string, logger?: LogType): Promise<CotarIndexRecord | null> {
const hash = CotarIndex.hash(fileName);

const slotCount = this.size;
const slotCount = this.metadata.count;
const startIndex = Number(hash % BigInt(slotCount));
let startHash: BigInt | null = null;

let index = startIndex;
while (true) {
const offset = this.sourceOffset + index * IndexRecordSize + IndexHeaderSize;
await this.source.loadBytes(offset, IndexRecordSize, logger);
const offset = this.sourceOffset + index * IndexV1RecordSize + IndexHeaderSize;
await this.source.loadBytes(offset, IndexV1RecordSize, logger);
startHash = this.source.getBigUint64(offset);

// Found the file
6 changes: 4 additions & 2 deletions packages/core/src/binary/format.ts
@@ -1,5 +1,5 @@
export const IndexMagic = 'COT';
export const IndexVersion = 1;
export const IndexVersion = 2;

/** Number of bytes used to represent a Header/Footer */
export const IndexHeaderSize = 8;
@@ -8,4 +8,6 @@ export const IndexFooterSize = IndexHeaderSize;
export const IndexSize = IndexHeaderSize + IndexFooterSize;

/** 8 bytes hash, 8 bytes offset, 8 bytes size */
export const IndexRecordSize = 24;
export const IndexV1RecordSize = 24;
/** 8 bytes hash, 4 bytes offset, 4 bytes size*/
export const IndexV2RecordSize = 16;
6 changes: 3 additions & 3 deletions packages/core/src/cotar.ts
@@ -1,6 +1,6 @@
import { ChunkSource, LogType } from '@chunkd/core';
import { CotarIndex } from './binary/binary.index.js';
import { IndexRecordSize, IndexSize } from './binary/format.js';
import { IndexV2RecordSize, IndexSize } from './binary/format.js';

export interface CotarIndexRecord {
offset: number;
@@ -24,8 +24,8 @@ export class Cotar {
// Load the last file in the tar archive
const metadata = await CotarIndex.getMetadata(source, 0, false);
const size = await source.size;
const startOffset = size - (metadata.count * IndexRecordSize + IndexSize);
const index = new CotarIndex(source, metadata.count, startOffset);
const startOffset = size - (metadata.count * IndexV2RecordSize + IndexSize);
const index = new CotarIndex(source, metadata, startOffset);

return new Cotar(source, index);
}
Binary file modified static/TarFileIndex.png