Skip to content

Commit

Permalink
Stew ro/use file type library for mime type validation (#636)
Browse files Browse the repository at this point in the history
* fix: zoomIn keyboard shortcut for macOS

* fix: appId

* fix: handle mime is undefined

* refactor: check length + check for electron

* fix: use file-type library for mime type validation

* feat: use mime library to get mime type from extension

Co-authored-by: alex-krasn <v-alexkr@microsoft.com>
  • Loading branch information
stew-ro and alex-krasn authored Oct 7, 2020
1 parent 355ca0b commit 6d4e93b
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 51 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
"foreman": "^3.0.1",
"jquery": "^3.5.0",
"kind-of": "^6.0.3",
"mime": "^2.4.6",
"minimist": "^1.2.2",
"node-forge": "^0.10.0",
"node-sass": "^4.14.1",
Expand Down
9 changes: 9 additions & 0 deletions src/models/applicationState.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ export interface IAsset {
ocr?: any,
isRunningOCR?: boolean,
cachedImage?: string,
mimeType?: string,
}

/**
Expand Down Expand Up @@ -346,6 +347,14 @@ export enum AssetType {
TIFF = 6,
}

/**
 * @name - Asset MIME Type
 * @description - MIME type strings for the asset formats listed below
 * @member PDF - PDF document
 * @member TIFF - TIFF image
 * @member JPG - JPEG image (the registered media type is "image/jpeg"; "image/jpg" is not registered)
 * @member PNG - PNG image
 * @member BMP - BMP image
 */
export enum AssetMimeType {
    PDF = "application/pdf",
    TIFF = "image/tiff",
    JPG = "image/jpeg",
    PNG = "image/png",
    BMP = "image/bmp",
}

/**
* @name - Asset State
* @description - Defines the state of the asset with regard to the tagging process
Expand Down
2 changes: 1 addition & 1 deletion src/react/components/pages/editorPage/canvas.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1158,7 +1158,7 @@ export default class Canvas extends React.Component<ICanvasProps, ICanvasState>
return;
}
try {
const ocr = await this.ocrService.getRecognizedText(asset.path, asset.name, this.setOCRStatus, force);
const ocr = await this.ocrService.getRecognizedText(asset.path, asset.name, asset.mimeType, this.setOCRStatus, force);
if (asset.id === this.state.currentAsset.asset.id) {
// since get OCR is async, we only set currentAsset's OCR
this.setState({
Expand Down
2 changes: 1 addition & 1 deletion src/react/components/pages/editorPage/editorPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ export default class EditorPage extends React.Component<IEditorPageProps, IEdito
if (asset && (asset.state === AssetState.NotVisited || runForAll)) {
try {
this.updateAssetState(asset.id, true);
await ocrService.getRecognizedText(asset.path, asset.name, undefined, runForAll);
await ocrService.getRecognizedText(asset.path, asset.name, asset.mimeType, undefined, runForAll);
this.updateAssetState(asset.id, false, AssetState.Visited);
} catch (err) {
this.updateAssetState(asset.id, false);
Expand Down
72 changes: 31 additions & 41 deletions src/services/assetService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import _ from "lodash";
import Guard from "../common/guard";
import {
IAsset, AssetType, IProject, IAssetMetadata, AssetState,
ILabelData, ILabel,
ILabelData, ILabel, AssetMimeType
} from "../models/applicationState";
import { AssetProviderFactory, IAssetProvider } from "../providers/storage/assetProviderFactory";
import { StorageProviderFactory, IStorageProvider } from "../providers/storage/storageProviderFactory";
Expand All @@ -16,6 +16,9 @@ import { strings, interpolate } from "../common/strings";
import { sha256Hash } from "../common/crypto";
import { toast } from "react-toastify";
import allSettled from "promise.allsettled"
import mime from 'mime';
import FileType from 'file-type';
import BrowserFileType from 'file-type/browser';

const supportedImageFormats = {
jpg: null, jpeg: null, null: null, png: null, bmp: null, tif: null, tiff: null, pdf: null,
Expand Down Expand Up @@ -175,26 +178,42 @@ export class AssetService {
// eslint-disable-next-line
const extensionParts = fileNameParts[fileNameParts.length - 1].split(/[\?#]/);
let assetFormat = extensionParts[0].toLowerCase();

let assetMimeType = mime.getType(assetFormat);
if (supportedImageFormats.hasOwnProperty(assetFormat)) {
let types;
let checkFileType;
let corruptFileName;
if (nodejsMode) {
const FileType = require('file-type');
const fileType = await FileType.fromFile(normalizedPath);
types = [fileType.ext];
try {
checkFileType = await FileType.fromFile(normalizedPath);
} catch {
// do nothing
}
corruptFileName = fileName.split(/[\\\/]/).pop().replace(/%20/g, " ");

} else {
types = await this.getMimeType(filePath);
try {
const getFetchSteam = (): Promise<Response> => this.pollForFetchAPI(() => fetch(filePath), 1000, 200);
const response = await getFetchSteam();
checkFileType = await BrowserFileType.fromStream(response.body);
} catch {
// do nothing
}
corruptFileName = fileName.split("%2F").pop().replace(/%20/g, " ");
}
if (!types) {
let fileType;
let mimeType;
if (checkFileType) {
fileType = checkFileType.ext;
mimeType = checkFileType.mime;
}

if (!fileType) {
console.error(interpolate(strings.editorPage.assetWarning.incorrectFileExtension.failedToFetch, { fileName: corruptFileName.toLocaleUpperCase() }));
}
// If file was renamed/spoofed - fix file extension to true MIME type and show message
else if (!types.includes(assetFormat)) {
assetFormat = types[0];
// If file was renamed/spoofed - fix file extension to the true MIME type if its type is among the supported file types, and show message
else if (fileType !== assetFormat) {
assetFormat = fileType;
assetMimeType = mimeType;
console.error(`${strings.editorPage.assetWarning.incorrectFileExtension.attention} ${corruptFileName.toLocaleUpperCase()} ${strings.editorPage.assetWarning.incorrectFileExtension.text} ${corruptFileName.toLocaleUpperCase()}`);
}
}
Expand All @@ -209,6 +228,7 @@ export class AssetService {
name: fileName,
path: filePath,
size: null,
mimeType: assetMimeType,
};
}

Expand All @@ -233,36 +253,6 @@ export class AssetService {
}
}

/**
 * Detects the real type of a file whose extension may have been spoofed by
 * requesting only its leading bytes (HTTP range `bytes=0-${mimeBytesNeeded}`)
 * and matching them against the byte patterns in `imageMimes`.
 *
 * @param uri - Location of the file to inspect; passed directly to `fetch`.
 * @returns The `types` of the first entry in `imageMimes` whose pattern matches,
 * an empty array when nothing matches, or `null` when the request fails.
 */
public static async getMimeType(uri: string): Promise<string[]> {
    let leadingBytesResponse: Response;
    try {
        // NOTE(review): 1000 and 200 are forwarded unchanged to pollForFetchAPI;
        // presumably timing/retry settings - confirm against that helper.
        leadingBytesResponse = await this.pollForFetchAPI(
            () => fetch(uri, { headers: { range: `bytes=0-${mimeBytesNeeded}` } }),
            1000,
            200,
        );
    } catch {
        // Callers treat a falsy result as "failed to fetch".
        return null;
    }

    // Read the bytes directly from the response. The previous Blob/FileReader
    // round trip yielded the same bytes but left the promise pending forever
    // if the reader finished without a result.
    const bytes = new Uint8Array(await leadingBytesResponse.arrayBuffer());

    // A falsy pattern entry acts as a wildcard for that byte position.
    const matched = imageMimes.find((candidate) =>
        candidate.pattern.every((expected, index) => !expected || bytes[index] === expected));

    return matched?.types ?? [];
}

private assetProviderInstance: IAssetProvider;
private storageProviderInstance: IStorageProvider;
Expand Down
13 changes: 6 additions & 7 deletions src/services/ocrService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export class OCRService {
public async getRecognizedText(
filePath: string,
fileName: string,
mimeType: string,
onStatusChanged?: (ocrStatus: OcrStatus) => void,
rewrite?: boolean
): Promise<any> {
Expand All @@ -47,11 +48,11 @@ export class OCRService {
notifyStatusChanged(OcrStatus.loadingFromAzureBlob);
ocrJson = await this.readOcrFile(ocrFileName);
if (!this.isValidOcrFormat(ocrJson) || rewrite) {
ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName);
ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName, mimeType);
}
} catch (e) {
notifyStatusChanged(OcrStatus.runningOCR);
ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName);
ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName, mimeType);
} finally {
notifyStatusChanged(OcrStatus.done);
}
Expand Down Expand Up @@ -81,7 +82,7 @@ export class OCRService {
}
}

private fetchOcrUriResult = async (filePath: string, fileName: string, ocrFileName: string) => {
private fetchOcrUriResult = async (filePath: string, fileName: string, ocrFileName: string, mimeType: string) => {
try {
let body;
let headers;
Expand All @@ -93,10 +94,8 @@ export class OCRService {
]
);
body = bodyAndType[0];
const fileType = bodyAndType[1].mime;
headers = { "Content-Type": fileType, "cache-control": "no-cache" };
}
else {
headers = { "Content-Type": mimeType, "cache-control": "no-cache" };
} else {
body = { url: filePath };
headers = { "Content-Type": "application/json" };
}
Expand Down
2 changes: 1 addition & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8438,7 +8438,7 @@ mime@1.6.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1"
integrity sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==

mime@^2.4.4, mime@^2.4.5:
mime@^2.4.4, mime@^2.4.5, mime@^2.4.6:
version "2.4.6"
resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1"
integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA==
Expand Down

0 comments on commit 6d4e93b

Please sign in to comment.