Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add detecting Web Stream to default (core) entry point #649

Merged
merged 14 commits into from
Aug 2, 2024
Merged
26 changes: 26 additions & 0 deletions core.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,27 @@ export declare class TokenizerPositionError extends Error {
constructor(message?: string);
}

export type AnyWebReadableByteStreamWithFileType = AnyWebReadableStream<Uint8Array> & {
readonly fileType?: FileTypeResult;
};

/**
Returns a `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.

This method can be handy to put in between a stream, but it comes with a price.
Internally `stream()` builds up a buffer of `sampleSize` bytes, used as a sample, to determine the file type.
The sample size impacts the file detection resolution.
A smaller sample size will result in lower probability of the best file type detection.

**Note:** This implementation reads the stream with a BYOB reader; when used with Node.js this requires Node.js 20 or later.

@param webStream - A Web Stream
@param options - May be used to override the default sample-size.
@returns A `Promise` which resolves to the original web stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
*/
export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;

export declare class FileTypeParser {
detectors: Iterable<Detector>;

Expand All @@ -494,4 +515,9 @@ export declare class FileTypeParser {
Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromBlob(blob: Blob): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
}
48 changes: 48 additions & 0 deletions core.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ export async function fileTypeFromTokenizer(tokenizer) {
return new FileTypeParser().fromTokenizer(tokenizer);
}

/**
Convenience wrapper: detect the file type of a Web Stream using a default parser.

@param webStream - A Web `ReadableStream` providing the bytes to examine.
@param options - May be used to override the default sample-size (forwarded to `toDetectionStream`); declared in the typings but previously not accepted here.
@returns A `Promise` resolving to the original stream with an added `fileType` property.
*/
export async function stream(webStream, options) {
	return new FileTypeParser().toDetectionStream(webStream, options);
}

export class FileTypeParser {
constructor(options) {
this.detectors = options?.customDetectors;
Expand Down Expand Up @@ -104,6 +108,50 @@ export class FileTypeParser {
}
}

async toDetectionStream(webStream, options = {}) {
Borewit marked this conversation as resolved.
Show resolved Hide resolved
const {sampleSize = reasonableDetectionSizeInBytes} = options;

// Initialize a reader from the web stream
const reader = webStream.getReader({mode: 'byob'});
const pass = new TransformStream();
const writer = pass.writable.getWriter();
let detectedFileType;

// Read the first chunk for file type detection
const {value: chunk, done} = await reader.read(new Uint8Array(sampleSize));
if (done || !chunk) {
detectedFileType = undefined;
} else {
try {
detectedFileType = await this.fromBuffer(chunk.slice(0, sampleSize));
} catch (error) {
if (error instanceof strtok3.EndOfStreamError) {
detectedFileType = undefined;
} else {
throw error;
}
}
}

// Write the initial chunk into the pass-through stream
writer.write(chunk);

// Forward remaining data from the reader to the writer
(async function pump() {
const {value, done} = await reader.read(new Uint8Array(512 * 1024));
Borewit marked this conversation as resolved.
Show resolved Hide resolved
if (done) {
return writer.close();
}

await writer.write(value);
return pump();
Borewit marked this conversation as resolved.
Show resolved Hide resolved
})();

// Attach the detected file type to the output stream
pass.readable.fileType = detectedFileType;
return pass.readable;
}

	// Tests `header` against the parser's sample buffer, delegating to the
	// module-level `_check` matcher. `options` is forwarded unchanged.
	check(header, options) {
		return _check(this.buffer, header, options);
	}
Expand Down
8 changes: 5 additions & 3 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Typings for Node.js specific entry point.
*/

import type {Readable as NodeReadableStream} from 'node:stream';
import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector} from './core.js';
import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector, AnyWebReadableByteStreamWithFileType} from './core.js';
import {FileTypeParser} from './core.js';

export type ReadableStreamWithFileType = NodeReadableStream & {
Expand All @@ -27,6 +27,7 @@ export declare class NodeFileTypeParser extends FileTypeParser {
Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
toDetectionStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
}

/**
Expand Down Expand Up @@ -70,10 +71,10 @@ A smaller sample size will result in lower probability of the best file type det
**Note:** Requires Node.js 14 or later.

@param readableStream - A [readable stream](https://nodejs.org/api/stream.html#stream_class_stream_readable) containing a file to examine.
@param options - Maybe used to override the default sample-size.
@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
@param options - May be used to override the default sample-size.
@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.

@example
```
import got from 'got';
import {fileTypeStream} from 'file-type';
Expand All @@ -89,5 +90,6 @@ if (stream2.fileType?.mime === 'image/jpeg') {
```
*/
export function fileTypeStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;

export * from './core.js';
10 changes: 7 additions & 3 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Node.js specific entry point.
*/

import {ReadableStream as WebReadableStream} from 'node:stream/web';
import {pipeline, PassThrough} from 'node:stream';
import * as strtok3 from 'strtok3';
import {FileTypeParser, reasonableDetectionSizeInBytes} from './core.js';

Expand All @@ -26,7 +27,10 @@ export class NodeFileTypeParser extends FileTypeParser {
}

async toDetectionStream(readableStream, options = {}) {
const {default: stream} = await import('node:stream');
if (readableStream instanceof WebReadableStream) {
return super.toDetectionStream(readableStream, options);
}

const {sampleSize = reasonableDetectionSizeInBytes} = options;

return new Promise((resolve, reject) => {
Expand All @@ -36,8 +40,8 @@ export class NodeFileTypeParser extends FileTypeParser {
(async () => {
try {
// Set up output stream
const pass = new stream.PassThrough();
const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);
const pass = new PassThrough();
const outputStream = pipeline ? pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);
sindresorhus marked this conversation as resolved.
Show resolved Hide resolved

// Read the input stream and detect the filetype
const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? new Uint8Array(0);
Expand Down
60 changes: 51 additions & 9 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import process from 'node:process';
import path from 'node:path';
import {fileURLToPath} from 'node:url';
import fs from 'node:fs';
import {readFile} from 'node:fs/promises';
import stream from 'node:stream';
import test from 'ava';
import {readableNoopStream} from 'noop-stream';
Expand All @@ -26,7 +27,7 @@ const missingTests = new Set([
]);

const [nodeMajorVersion] = process.versions.node.split('.').map(Number);
const nodeVersionSupportingByeBlobStream = 20;
const nodeVersionSupportingByteBlobStream = 20;

const types = [...supportedExtensions].filter(ext => !missingTests.has(ext));

Expand Down Expand Up @@ -337,7 +338,7 @@ async function testFileNodeFromStream(t, ext, name) {
t.is(typeof fileType.mime, 'string', 'fileType.mime');
}

async function loadEntireFile(readable) {
async function loadEntireFileFromNodeReadable(readable) {
Borewit marked this conversation as resolved.
Show resolved Hide resolved
const chunks = [];
let totalLength = 0;

Expand All @@ -357,18 +358,58 @@ async function loadEntireFile(readable) {
return entireFile;
}

async function testStream(t, ext, name) {
async function testStreamWithNodeStream(t, ext, name) {
const fixtureName = `${(name ?? 'fixture')}.${ext}`;
const file = path.join(__dirname, 'fixture', fixtureName);

const readableStream = await fileTypeStream(fs.createReadStream(file));
const fileStream = fs.createReadStream(file);

const [bufferA, bufferB] = await Promise.all([loadEntireFile(readableStream), loadEntireFile(fileStream)]);
const [bufferA, bufferB] = await Promise.all([loadEntireFileFromNodeReadable(readableStream), loadEntireFileFromNodeReadable(fileStream)]);

t.true(areUint8ArraysEqual(bufferA, bufferB));
}

async function loadEntireFileFromWebStream(webStream) {
const reader = webStream.getReader();
const chunks = [];
let totalLength = 0;
let bytesRead = 0;

do {
const {done, value} = await reader.read();
if (done) {
break;
}

chunks.push(value);
bytesRead = value.byteLength;
totalLength += bytesRead;
} while (bytesRead > 0);

// Concatenate all chunks into a single Uint8Array
const entireFile = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of chunks) {
entireFile.set(chunk, offset);
offset += chunk.byteLength;
}

return entireFile;
}

// Verify that fileTypeStream() over a Web Stream (obtained from a Blob of the
// fixture's bytes) passes the content through byte-for-byte.
async function testStreamWithWebStream(t, ext, name) {
	const fixtureName = `${name ?? 'fixture'}.${ext}`;
	const fixturePath = path.join(__dirname, 'fixture', fixtureName);

	// Load the fixture, then expose it as a Web Stream via a Blob.
	const fileBuffer = await readFile(fixturePath);
	const detectionStream = await fileTypeStream(new Blob([fileBuffer]).stream());
	const streamedBytes = await loadEntireFileFromWebStream(detectionStream);

	t.true(areUint8ArraysEqual(fileBuffer, streamedBytes));
}

// Sanity check: later tests branch on `nodeMajorVersion`, so parsing
// `process.versions.node` must have produced a number.
test('Test suite must be able to detect Node.js major version', t => {
	t.is(typeof nodeMajorVersion, 'number', 'Detected Node.js major version should be a number');
});
Expand All @@ -382,13 +423,14 @@ for (const type of types) {

_test(`${name}.${type} ${i++} .fileTypeFromFile() method - same fileType`, testFromFile, type, name);
_test(`${name}.${type} ${i++} .fileTypeFromBuffer() method - same fileType`, testFromBuffer, type, name);
if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
// Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20
_test(`${name}.${type} ${i++} .fileTypeFromBlob() method - same fileType`, testFromBlob, type, name);
test(`${name}.${type} ${i++} .fileTypeStream() - identical Web Streams`, testStreamWithWebStream, type, name);
}

_test(`${name}.${type} ${i++} .fileTypeFromStream() Node.js method - same fileType`, testFileNodeFromStream, type, name);
test(`${name}.${type} ${i++} .fileTypeStream() - identical streams`, testStream, type, name);
_test(`${name}.${type} ${i++} .fileTypeStream() - identical Node.js Readable streams`, testStreamWithNodeStream, type, name);
}
} else {
const fixtureName = `fixture.${type}`;
Expand All @@ -397,7 +439,7 @@ for (const type of types) {
_test(`${type} ${i++} .fileTypeFromFile()`, testFromFile, type);
_test(`${type} ${i++} .fileTypeFromBuffer()`, testFromBuffer, type);
_test(`${type} ${i++} .fileTypeFromStream() Node.js`, testFileNodeFromStream, type);
test(`${type} ${i++} .fileTypeStream() - identical streams`, testStream, type);
test(`${type} ${i++} .fileTypeStream() - identical streams`, testStreamWithNodeStream, type);
}

if (Object.prototype.hasOwnProperty.call(falsePositives, type)) {
Expand Down Expand Up @@ -427,7 +469,7 @@ test('.fileTypeStream() method - short stream', async t => {
t.is(newStream.fileType, undefined);

// Test usability of returned stream
const bufferB = await loadEntireFile(newStream);
const bufferB = await loadEntireFileFromNodeReadable(newStream);
t.deepEqual(bufferA, bufferB);
});

Expand Down Expand Up @@ -708,7 +750,7 @@ const tokenizerPositionChanger = tokenizer => {
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
};

if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
// Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20

test('fileTypeFromBlob should detect custom file type "unicorn" using custom detectors', async t => {
Expand Down