From 00e051bceaf0791ffc1b08b36aee196ccdd95606 Mon Sep 17 00:00:00 2001 From: Bjorn Stromberg Date: Sat, 6 Jul 2024 19:16:45 +0800 Subject: [PATCH] Replace Buffer usage with Uint8Array (#633) Co-authored-by: Borewit --- core.d.ts | 4 ++-- core.js | 60 ++++++++++++++++++++++++++++--------------------- index.test-d.ts | 4 +--- package.json | 7 +++--- readme.md | 12 +++++----- test.js | 35 ++++++++++++++++++----------- util.js | 12 +++++----- 7 files changed, 76 insertions(+), 58 deletions(-) diff --git a/core.d.ts b/core.d.ts index 3e8375df..9247a843 100644 --- a/core.d.ts +++ b/core.d.ts @@ -323,13 +323,13 @@ export type ReadableStreamWithFileType = ReadableStream & { }; /** -Detect the file type of a `Buffer`, `Uint8Array`, or `ArrayBuffer`. +Detect the file type of a `Uint8Array`, or `ArrayBuffer`. The file type is detected by checking the [magic number](https://en.wikipedia.org/wiki/Magic_number_(programming)#Magic_numbers_in_files) of the buffer. If file access is available, it is recommended to use `.fromFile()` instead. -@param buffer - An Uint8Array or Buffer representing file data. It works best if the buffer contains the entire file. It may work with a smaller portion as well. +@param buffer - An Uint8Array or ArrayBuffer representing file data. It works best if the buffer contains the entire file. It may work with a smaller portion as well. @returns The detected file type, or `undefined` when there is no match. */ export function fileTypeFromBuffer(buffer: Uint8Array | ArrayBuffer): Promise; diff --git a/core.js b/core.js index 34f4215a..5f92e5f9 100644 --- a/core.js +++ b/core.js @@ -1,6 +1,6 @@ -import {Buffer} from 'node:buffer'; import * as Token from 'token-types'; import * as strtok3 from 'strtok3/core'; +import {includes, indexOf, getUintBE} from 'uint8array-extras'; import { stringToBytes, tarHeaderChecksumMatches, @@ -75,7 +75,7 @@ export class FileTypeParser { async fromBuffer(input) { if (!(input instanceof Uint8Array || input instanceof ArrayBuffer)) { - throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`Buffer\` or \`ArrayBuffer\`, got \`${typeof input}\``); + throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`ArrayBuffer\`, got \`${typeof input}\``); } const buffer = input instanceof Uint8Array ? input : new Uint8Array(input); @@ -116,7 +116,7 @@ export class FileTypeParser { const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass); // Read the input stream and detect the filetype - const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? Buffer.alloc(0); + const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? new Uint8Array(0); try { pass.fileType = await this.fromBuffer(chunk); } catch (error) { @@ -145,7 +145,7 @@ export class FileTypeParser { } async parse(tokenizer) { - this.buffer = Buffer.alloc(minimumBytes); + this.buffer = new Uint8Array(minimumBytes); // Keep reading until EOF if the file size is unknown. if (tokenizer.fileInfo.size === undefined) { @@ -372,12 +372,14 @@ export class FileTypeParser { while (tokenizer.position + 30 < tokenizer.fileInfo.size) { await tokenizer.readBuffer(this.buffer, {length: 30}); + const view = new DataView(this.buffer.buffer); + // https://en.wikipedia.org/wiki/Zip_(file_format)#File_headers const zipHeader = { - compressedSize: this.buffer.readUInt32LE(18), - uncompressedSize: this.buffer.readUInt32LE(22), - filenameLength: this.buffer.readUInt16LE(26), - extraFieldLength: this.buffer.readUInt16LE(28), + compressedSize: view.getUint32(18, true), + uncompressedSize: view.getUint32(22, true), + filenameLength: view.getUint16(26, true), + extraFieldLength: view.getUint16(28, true), }; zipHeader.filename = await tokenizer.readToken(new Token.StringType(zipHeader.filenameLength, 'utf-8')); @@ -472,7 +474,8 @@ export class FileTypeParser { while (nextHeaderIndex < 0 && (tokenizer.position < tokenizer.fileInfo.size)) { await tokenizer.peekBuffer(this.buffer, {mayBeLess: true}); - nextHeaderIndex = this.buffer.indexOf('504B0304', 0, 'hex'); + nextHeaderIndex = indexOf(this.buffer, new Uint8Array([0x50, 0x4B, 0x03, 0x04])); + // Move position to the next header if found, skip the whole buffer otherwise await tokenizer.ignore(nextHeaderIndex >= 0 ? nextHeaderIndex : this.buffer.length); } @@ -495,7 +498,7 @@ export class FileTypeParser { if (this.checkString('OggS')) { // This is an OGG container await tokenizer.ignore(28); - const type = Buffer.alloc(8); + const type = new Uint8Array(8); await tokenizer.readBuffer(type); // Needs to be before `ogg` check @@ -576,7 +579,7 @@ export class FileTypeParser { ) { // They all can have MIME `video/mp4` except `application/mp4` special-case which is hard to detect. // For some cases, we're specific, everything else falls to `video/mp4` with `mp4` extension. - const brandMajor = this.buffer.toString('binary', 8, 12).replace('\0', ' ').trim(); + const brandMajor = new Token.StringType(4, 'latin1').get(this.buffer, 8).replace('\0', ' ').trim(); switch (brandMajor) { case 'avif': case 'avis': @@ -706,11 +709,11 @@ export class FileTypeParser { try { await tokenizer.ignore(1350); const maxBufferSize = 10 * 1024 * 1024; - const buffer = Buffer.alloc(Math.min(maxBufferSize, tokenizer.fileInfo.size)); + const buffer = new Uint8Array(Math.min(maxBufferSize, tokenizer.fileInfo.size)); await tokenizer.readBuffer(buffer, {mayBeLess: true}); // Check if this is an Adobe Illustrator file - if (buffer.includes(Buffer.from('AIPrivateData'))) { + if (includes(buffer, new TextEncoder().encode('AIPrivateData'))) { return { ext: 'ai', mime: 'application/postscript', @@ -765,27 +768,31 @@ export class FileTypeParser { async function readField() { const msb = await tokenizer.peekNumber(Token.UINT8); let mask = 0x80; - let ic = 0; // 0 = A, 1 = B, 2 = C, 3 - // = D + let ic = 0; // 0 = A, 1 = B, 2 = C, 3 = D while ((msb & mask) === 0 && mask !== 0) { ++ic; mask >>= 1; } - const id = Buffer.alloc(ic + 1); + const id = new Uint8Array(ic + 1); await tokenizer.readBuffer(id); return id; } async function readElement() { - const id = await readField(); + const idField = await readField(); const lengthField = await readField(); + lengthField[0] ^= 0x80 >> (lengthField.length - 1); const nrLength = Math.min(6, lengthField.length); // JavaScript can max read 6 bytes integer + + const idView = new DataView(idField.buffer); + const lengthView = new DataView(lengthField.buffer, lengthField.length - nrLength, nrLength); + return { - id: id.readUIntBE(0, id.length), - len: lengthField.readUIntBE(lengthField.length - nrLength, nrLength), + id: getUintBE(idView), + len: getUintBE(lengthView), }; } @@ -793,7 +800,7 @@ export class FileTypeParser { while (children > 0) { const element = await readElement(); if (element.id === 0x42_82) { - const rawValue = await tokenizer.readToken(new Token.StringType(element.len, 'utf-8')); + const rawValue = await tokenizer.readToken(new Token.StringType(element.len)); return rawValue.replaceAll(/\00.*$/g, ''); // Return DocType } @@ -1059,7 +1066,7 @@ export class FileTypeParser { } if (this.checkString('AC')) { - const version = this.buffer.toString('binary', 2, 6); + const version = new Token.StringType(4, 'latin1').get(this.buffer, 2); if (version.match('^d*') && version >= 1000 && version <= 1050) { return { ext: 'dwg', @@ -1126,7 +1133,7 @@ export class FileTypeParser { async function readChunkHeader() { return { length: await tokenizer.readToken(Token.INT32_BE), - type: await tokenizer.readToken(new Token.StringType(4, 'binary')), + type: await tokenizer.readToken(new Token.StringType(4, 'latin1')), }; } @@ -1213,7 +1220,7 @@ export class FileTypeParser { // ASF_Header_Object first 80 bytes if (this.check([0x30, 0x26, 0xB2, 0x75, 0x8E, 0x66, 0xCF, 0x11, 0xA6, 0xD9])) { async function readHeader() { - const guid = Buffer.alloc(16); + const guid = new Uint8Array(16); await tokenizer.readBuffer(guid); return { id: guid, @@ -1228,7 +1235,7 @@ export class FileTypeParser { let payload = header.size - 24; if (_check(header.id, [0x91, 0x07, 0xDC, 0xB7, 0xB7, 0xA9, 0xCF, 0x11, 0x8E, 0xE6, 0x00, 0xC0, 0x0C, 0x20, 0x53, 0x65])) { // Sync on Stream-Properties-Object (B7DC0791-A9B7-11CF-8EE6-00C00C205365) - const typeId = Buffer.alloc(16); + const typeId = new Uint8Array(16); payload -= await tokenizer.readBuffer(typeId); if (_check(typeId, [0x40, 0x9E, 0x69, 0xF8, 0x4D, 0x5B, 0xCF, 0x11, 0xA8, 0xFD, 0x00, 0x80, 0x5F, 0x5C, 0x44, 0x2B])) { @@ -1432,10 +1439,11 @@ export class FileTypeParser { } if (this.check([0x04, 0x00, 0x00, 0x00]) && this.buffer.length >= 16) { // Rough & quick check Pickle/ASAR - const jsonSize = this.buffer.readUInt32LE(12); + const jsonSize = new DataView(this.buffer.buffer).getUint32(12, true); + if (jsonSize > 12 && this.buffer.length >= jsonSize + 16) { try { - const header = this.buffer.slice(16, jsonSize + 16).toString(); + const header = new TextDecoder().decode(this.buffer.slice(16, jsonSize + 16)); const json = JSON.parse(header); // Check if Pickle is ASAR if (json.files) { // Final check, assuring Pickle/ASAR format diff --git a/index.test-d.ts b/index.test-d.ts index 84225eb4..d37b2ccb 100644 --- a/index.test-d.ts +++ b/index.test-d.ts @@ -1,4 +1,3 @@ -import {Buffer} from 'node:buffer'; import {createReadStream} from 'node:fs'; import {expectType} from 'tsd'; import { @@ -18,12 +17,11 @@ import { type MimeType, } from './index.js'; -expectType>(fileTypeFromBuffer(Buffer.from([0xFF, 0xD8, 0xFF]))); expectType>(fileTypeFromBuffer(new Uint8Array([0xFF, 0xD8, 0xFF]))); expectType>(fileTypeFromBuffer(new ArrayBuffer(42))); (async () => { - const result = await fileTypeFromBuffer(Buffer.from([0xFF, 0xD8, 0xFF])); + const result = await fileTypeFromBuffer(new Uint8Array([0xFF, 0xD8, 0xFF])); if (result !== undefined) { expectType(result.ext); expectType(result.mime); diff --git a/package.json b/package.json index 78de61f7..a7c6eccf 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "file-type", "version": "19.0.0", - "description": "Detect the file type of a Buffer/Uint8Array/ArrayBuffer", + "description": "Detect the file type of a Uint8Array/ArrayBuffer", "license": "MIT", "repository": "sindresorhus/file-type", "funding": "https://github.com/sindresorhus/file-type?sponsor=1", @@ -211,8 +211,9 @@ ], "dependencies": { "readable-web-to-node-stream": "^3.0.2", - "strtok3": "^7.0.0", - "token-types": "^5.0.1" + "strtok3": "^7.1.0", + "token-types": "^6.0.0", + "uint8array-extras": "^1.3.0" }, "devDependencies": { "@tokenizer/token": "^0.3.0", diff --git a/readme.md b/readme.md index c05f40bd..bc1cc465 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,6 @@ # file-type -> Detect the file type of a Buffer/Uint8Array/ArrayBuffer +> Detect the file type of a Uint8Array/ArrayBuffer The file type is detected by checking the [magic number](https://en.wikipedia.org/wiki/Magic_number_(programming)#Magic_numbers_in_files) of the buffer. @@ -31,7 +31,7 @@ console.log(await fileTypeFromFile('Unicorn.png')); //=> {ext: 'png', mime: 'image/png'} ``` -Determine file type from a Buffer, which may be a portion of the beginning of a file: +Determine file type from a Uint8Array/ArrayBuffer, which may be a portion of the beginning of a file: ```js import {fileTypeFromBuffer} from 'file-type'; @@ -107,7 +107,7 @@ console.log(fileType); ### fileTypeFromBuffer(buffer) -Detect the file type of a `Buffer`, `Uint8Array`, or `ArrayBuffer`. +Detect the file type of a `Uint8Array`, or `ArrayBuffer`. The file type is detected by checking the [magic number](https://en.wikipedia.org/wiki/Magic_number_(programming)#Magic_numbers_in_files) of the buffer. @@ -122,7 +122,7 @@ Or `undefined` when there is no match. #### buffer -Type: `Buffer | Uint8Array | ArrayBuffer` +Type: `Uint8Array | ArrayBuffer` A buffer representing file data. It works best if the buffer contains the entire file. It may work with a smaller portion as well. @@ -335,7 +335,7 @@ const customDetectors = [ async tokenizer => { const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // 'UNICORN' as decimal string - const buffer = Buffer.alloc(7); + const buffer = new Uint8Array(7); await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); if (unicornHeader.every((value, index) => value === buffer[index])) { @@ -346,7 +346,7 @@ const customDetectors = [ }, ]; -const buffer = Buffer.from('UNICORN'); +const buffer = new Uint8Array(new TextEncoder().encode('UNICORN')); const parser = new FileTypeParser({customDetectors}); const fileType = await parser.fromBuffer(buffer); console.log(fileType); diff --git a/test.js b/test.js index db0e3a94..8d937afe 100644 --- a/test.js +++ b/test.js @@ -1,5 +1,4 @@ import process from 'node:process'; -import {Buffer} from 'node:buffer'; import path from 'node:path'; import {fileURLToPath} from 'node:url'; import fs from 'node:fs'; @@ -8,6 +7,7 @@ import test from 'ava'; import {readableNoopStream} from 'noop-stream'; import {Parser as ReadmeParser} from 'commonmark'; import * as strtok3 from 'strtok3/core'; +import {areUint8ArraysEqual} from 'uint8array-extras'; import { fileTypeFromBuffer, fileTypeFromStream, @@ -331,12 +331,23 @@ async function testFileFromStream(t, ext, name) { } async function loadEntireFile(readable) { - const buffer = []; + const chunks = []; + let totalLength = 0; + for await (const chunk of readable) { - buffer.push(Buffer.from(chunk)); + chunks.push(chunk); + totalLength += chunk.length; } - return Buffer.concat(buffer); + const entireFile = new Uint8Array(totalLength); + + let offset = 0; + for (const chunk of chunks) { + entireFile.set(new Uint8Array(chunk), offset); + offset += chunk.length; + } + + return entireFile; } async function testStream(t, ext, name) { @@ -348,7 +359,7 @@ async function testStream(t, ext, name) { const [bufferA, bufferB] = await Promise.all([loadEntireFile(readableStream), loadEntireFile(fileStream)]); - t.true(bufferA.equals(bufferB)); + t.true(areUint8ArraysEqual(bufferA, bufferB)); } let i = 0; @@ -387,7 +398,7 @@ test('.fileTypeStream() method - empty stream', async t => { }); test('.fileTypeStream() method - short stream', async t => { - const bufferA = Buffer.from([0, 1, 0, 1]); + const bufferA = new Uint8Array([0, 1, 0, 1]); class MyStream extends stream.Readable { _read() { this.push(bufferA); @@ -450,8 +461,6 @@ test('validate the input argument type', async t => { message: /Expected the `input` argument to be of type `Uint8Array`/, }); - await t.notThrowsAsync(fileTypeFromBuffer(Buffer.from('x'))); - await t.notThrowsAsync(fileTypeFromBuffer(new Uint8Array())); await t.notThrowsAsync(fileTypeFromBuffer(new ArrayBuffer())); @@ -620,12 +629,12 @@ test('odd file sizes', async t => { const oddFileSizes = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 255, 256, 257, 511, 512, 513]; for (const size of oddFileSizes) { - const buffer = Buffer.alloc(size); + const buffer = new Uint8Array(size); await t.notThrowsAsync(fileTypeFromBuffer(buffer), `fromBuffer: File size: ${size} bytes`); } for (const size of oddFileSizes) { - const buffer = Buffer.alloc(size); + const buffer = new Uint8Array(size); const stream = new BufferedStream(buffer); await t.notThrowsAsync(fileTypeFromStream(stream), `fromStream: File size: ${size} bytes`); } @@ -662,13 +671,13 @@ test('supported files types are listed alphabetically', async t => { test('corrupt MKV throws', async t => { const filePath = path.join(__dirname, 'fixture/fixture-corrupt.mkv'); - await t.throwsAsync(fileTypeFromFile(filePath), {message: /out of range/}); + await t.throwsAsync(fileTypeFromFile(filePath), {message: /End-Of-Stream/}); }); // Create a custom detector for the just made up "unicorn" file type const unicornDetector = async tokenizer => { const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string - const buffer = Buffer.alloc(7); + const buffer = new Uint8Array(7); await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); if (unicornHeader.every((value, index) => value === buffer[index])) { return {ext: 'unicorn', mime: 'application/unicorn'}; @@ -680,7 +689,7 @@ const unicornDetector = async tokenizer => { const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'}); const tokenizerPositionChanger = tokenizer => { - const buffer = Buffer.alloc(1); + const buffer = new Uint8Array(1); tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true}); }; diff --git a/util.js b/util.js index 96abe024..a7b825e2 100644 --- a/util.js +++ b/util.js @@ -1,3 +1,5 @@ +import {StringType} from 'token-types'; + export function stringToBytes(string) { return [...string].map(character => character.charCodeAt(0)); // eslint-disable-line unicorn/prefer-code-point } @@ -5,12 +7,12 @@ export function stringToBytes(string) { /** Checks whether the TAR checksum is valid. -@param {Buffer} buffer - The TAR header `[offset ... offset + 512]`. +@param {Uint8Array} arrayBuffer - The TAR header `[offset ... offset + 512]`. @param {number} offset - TAR header offset. @returns {boolean} `true` if the TAR checksum is valid, otherwise `false`. */ -export function tarHeaderChecksumMatches(buffer, offset = 0) { - const readSum = Number.parseInt(buffer.toString('utf8', 148, 154).replace(/\0.*$/, '').trim(), 8); // Read sum in header +export function tarHeaderChecksumMatches(arrayBuffer, offset = 0) { + const readSum = Number.parseInt(new StringType(6).get(arrayBuffer, 148).replace(/\0.*$/, '').trim(), 8); // Read sum in header if (Number.isNaN(readSum)) { return false; } @@ -18,11 +20,11 @@ export function tarHeaderChecksumMatches(buffer, offset = 0) { let sum = 8 * 0x20; // Initialize signed bit sum for (let index = offset; index < offset + 148; index++) { - sum += buffer[index]; + sum += arrayBuffer[index]; } for (let index = offset + 156; index < offset + 512; index++) { - sum += buffer[index]; + sum += arrayBuffer[index]; } return readSum === sum;