src/vs/base/node/encoding.ts

/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

'use strict';

import * as stream from 'vs/base/node/stream';
import * as iconv from 'iconv-lite';
import { TPromise } from 'vs/base/common/winjs.base';
import { isLinux, isMacintosh } from 'vs/base/common/platform';
import { exec } from 'child_process';
import { Readable, Writable, WritableOptions } from 'stream';
import { asWinJSImport } from 'vs/base/common/async';

// Encoding identifiers used by this module.
export const UTF8 = 'utf8';
// UTF-8 with a byte order mark. `toNodeEncoding` maps this identifier to `UTF8`
// before it is handed to iconv-lite.
export const UTF8_with_bom = 'utf8bom';
// UTF-16 big endian / little endian.
export const UTF16be = 'utf16be';
export const UTF16le = 'utf16le';

/**
 * Options accepted by `toDecodeStream`.
 */
export interface IDecodeStreamOptions {
	/**
	 * Forwarded to `detectEncodingFromBuffer` as its auto-guess flag. Also selects
	 * the larger default for `minBytesRequiredForDetection`.
	 */
	guessEncoding?: boolean;
	/**
	 * Number of bytes that must be buffered before encoding detection starts.
	 * When falsy, `toDecodeStream` picks a default based on `guessEncoding`.
	 */
	minBytesRequiredForDetection?: number;
	/**
	 * Maps the detected encoding (falsy when nothing was detected) to the encoding
	 * that is actually used for decoding. When omitted, `toDecodeStream` uses the
	 * detected encoding and falls back to `UTF8`.
	 */
	overwriteEncoding?(detectedEncoding: string): string;
}

/**
 * Pipes `readable` into an internal writable that buffers the first bytes, runs
 * encoding detection on them and then forwards everything (the buffered bytes
 * and all later chunks) into an iconv-lite decode stream.
 *
 * @param readable source of raw bytes; every chunk must be a `Buffer`.
 * @param options detection options, see `IDecodeStreamOptions`. The object is
 * only read, defaults are not written back into it.
 * @returns a promise that resolves with the detection result and the decode
 * stream once the decoder has been created. The promise is rejected when
 * detection or the creation of the decoder fails.
 */
export function toDecodeStream(readable: Readable, options: IDecodeStreamOptions): TPromise<{ detected: IDetectedEncodingResult, stream: NodeJS.ReadableStream }> {

	// Resolve defaults into locals instead of mutating the caller's options object
	const minBytesRequiredForDetection = options.minBytesRequiredForDetection
		|| (options.guessEncoding ? AUTO_GUESS_BUFFER_MAX_LEN : NO_GUESS_BUFFER_MAX_LEN);

	const overwriteEncoding = options.overwriteEncoding
		? (detected: string) => options.overwriteEncoding(detected)
		: (detected: string) => detected || UTF8;

	return new TPromise<{ detected: IDetectedEncodingResult, stream: NodeJS.ReadableStream }>((resolve, reject) => {
		readable.pipe(new class extends Writable {

			private _decodeStream: NodeJS.ReadWriteStream;
			private _decodeStreamConstruction: Thenable<any>;
			private _buffer: Buffer[] = [];
			private _bytesBuffered = 0;

			constructor(opts?: WritableOptions) {
				super(opts);
				this.once('finish', () => this._finish());
			}

			_write(chunk: any, encoding: string, callback: Function): void {
				if (!Buffer.isBuffer(chunk)) {
					// stop here: the callback must be invoked exactly once per write
					callback(new Error('data must be a buffer'));
					return;
				}

				if (this._decodeStream) {
					// just a forwarder now
					this._decodeStream.write(chunk, callback);
					return;
				}

				this._buffer.push(chunk);
				this._bytesBuffered += chunk.length;

				if (this._decodeStreamConstruction) {
					// waiting for the decoder to be ready
					this._decodeStreamConstruction.then(_ => callback(), err => callback(err));

				} else if (this._bytesBuffered >= minBytesRequiredForDetection) {
					// buffered enough data, create stream and forward data
					this._startDecodeStream(callback);

				} else {
					// only buffering
					callback();
				}
			}

			/**
			 * Detects the encoding of the buffered bytes, creates the decode stream
			 * and replays the buffered chunks into it. `callback` is invoked without
			 * arguments on success and with the error on failure.
			 */
			_startDecodeStream(callback: Function): void {

				// Settle the outer promise first so that the caller learns about the
				// failure, then complete the pending operation with the error. On the
				// write path, passing the error to the write callback is what reports
				// it on this stream, so no separate 'error' event is emitted here.
				const fail = (err: any) => {
					reject(err);
					callback(err);
				};

				this._decodeStreamConstruction = TPromise.as(detectEncodingFromBuffer({
					buffer: Buffer.concat(this._buffer), bytesRead: this._bytesBuffered
				}, options.guessEncoding)).then(detected => {
					let decoder: NodeJS.ReadWriteStream;
					try {
						detected.encoding = overwriteEncoding(detected.encoding);
						decoder = decodeStream(detected.encoding);
					} catch (err) {
						// e.g. the encoding callback threw or the decoder could not be created
						fail(err);
						return;
					}

					this._decodeStream = decoder;
					for (const buffer of this._buffer) {
						decoder.write(buffer);
					}
					callback();
					resolve({ detected, stream: decoder });

				}, fail);
			}

			_finish(): void {
				if (this._decodeStream) {
					// normal finish
					this._decodeStream.end();
				} else {
					// we were still waiting for data...
					this._startDecodeStream((err?: any) => {
						if (!err) {
							this._decodeStream.end();
						}
						// on failure there is no decoder to end; the error has
						// already been delivered through the rejected promise
					});
				}
			}
		});
	});
}

/**
 * Returns the length in bytes of the byte order mark for the given encoding
 * identifier: 3 for `UTF8`, 2 for `UTF16be` and `UTF16le`, 0 for anything else.
 */
export function bomLength(encoding: string): number {
	if (encoding === UTF8) {
		return 3;
	}

	if (encoding === UTF16be || encoding === UTF16le) {
		return 2;
	}

	return 0;
}

/**
 * Decodes `buffer` into a string with iconv-lite. The encoding identifier is
 * first translated by `toNodeEncoding`.
 */
export function decode(buffer: NodeBuffer, encoding: string): string {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.decode(buffer, nodeEncoding);
}

/**
 * Encodes `content` with iconv-lite. The encoding identifier is first translated
 * by `toNodeEncoding`; `options` is passed through to iconv-lite unchanged.
 */
export function encode(content: string | NodeBuffer, encoding: string, options?: { addBOM?: boolean }): NodeBuffer {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encode(content, nodeEncoding, options);
}

/**
 * Asks iconv-lite whether it knows the given encoding. The encoding identifier
 * is first translated by `toNodeEncoding`.
 */
export function encodingExists(encoding: string): boolean {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encodingExists(nodeEncoding);
}

/**
 * Creates an iconv-lite decode stream for the given encoding. The encoding
 * identifier is first translated by `toNodeEncoding`.
 */
export function decodeStream(encoding: string): NodeJS.ReadWriteStream {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.decodeStream(nodeEncoding);
}

/**
 * Creates an iconv-lite encode stream for the given encoding. The encoding
 * identifier is first translated by `toNodeEncoding`; `options` is passed
 * through to iconv-lite unchanged.
 */
export function encodeStream(encoding: string, options?: { addBOM?: boolean }): NodeJS.ReadWriteStream {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encodeStream(nodeEncoding, options);
}

/**
 * Translates an encoding identifier of this module into the name handed to
 * iconv-lite. Only `UTF8_with_bom` is rewritten (to `UTF8`); every other value
 * is returned as is.
 */
function toNodeEncoding(enc: string): string {
	// iconv does not distinguish UTF 8 with or without BOM, so we need to help it
	return enc === UTF8_with_bom ? UTF8 : enc;
}

/**
 * Looks for a byte order mark at the start of `buffer`.
 *
 * @param buffer the bytes to inspect; may be missing.
 * @param bytesRead number of valid bytes in `buffer`.
 * @returns `UTF16be`, `UTF16le` or `UTF8` when the matching BOM is present,
 * `null` otherwise (including when there is no buffer or fewer than 2 bytes).
 */
export function detectEncodingByBOMFromBuffer(buffer: NodeBuffer, bytesRead: number): string {
	if (!buffer || bytesRead < 2) {
		return null;
	}

	const first = buffer.readUInt8(0);
	const second = buffer.readUInt8(1);
	const leadingPair = (first << 8) | second;

	switch (leadingPair) {
		case 0xFEFF: // UTF-16 BE
			return UTF16be;
		case 0xFFFE: // UTF-16 LE
			return UTF16le;
	}

	// the UTF-8 BOM needs a third byte
	if (bytesRead < 3) {
		return null;
	}

	const third = buffer.readUInt8(2);
	const hasUtf8Bom = leadingPair === 0xEFBB && third === 0xBF;

	return hasUtf8Bom ? UTF8 : null;
}

/**
 * Detects the Byte Order Mark at the start of the given file by reading its
 * first 3 bytes. The returned promise resolves to the detected encoding, or to
 * null if no BOM is found.
 */
export function detectEncodingByBOM(file: string): TPromise<string> {
	const firstBytes = stream.readExactlyByFile(file, 3);

	return firstBytes.then(result => detectEncodingByBOMFromBuffer(result.buffer, result.bytesRead));
}

// Value assigned to jschardet's `Constants.MINIMUM_THRESHOLD` before detection runs
const MINIMUM_THRESHOLD = 0.2;
// Lower-cased jschardet results that are discarded instead of being reported
const IGNORE_ENCODINGS = ['ascii', 'utf-8', 'utf-16', 'utf-32'];

/**
 * Guesses the encoding of `buffer` with jschardet, which is loaded lazily.
 *
 * The promise resolves to the iconv-lite name of the guessed encoding, or to
 * null when jschardet reports nothing or reports one of `IGNORE_ENCODINGS`.
 */
export function guessEncodingByBuffer(buffer: NodeBuffer): TPromise<string> {
	return asWinJSImport(import('jschardet')).then(jschardet => {
		jschardet.Constants.MINIMUM_THRESHOLD = MINIMUM_THRESHOLD;

		const detection = jschardet.detect(buffer);
		const detectedName = detection && detection.encoding;
		if (!detectedName) {
			return null;
		}

		// Ignore encodings that cannot guess correctly
		// (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
		const isIgnored = IGNORE_ENCODINGS.indexOf(detectedName.toLowerCase()) !== -1;

		return isIgnored ? null : toIconvLiteEncoding(detectedName);
	});
}

// Normalized encoding names (lower case, alphanumeric characters only) whose
// iconv-lite name differs from the normalized form.
const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = {
	'ibm866': 'cp866',
	'big5': 'cp950'
};

/**
 * Converts an encoding name into the form used with iconv-lite: all
 * non-alphanumeric characters are removed, the rest is lower-cased and the
 * result is translated through `JSCHARDET_TO_ICONV_ENCODINGS` when it has an
 * entry there.
 *
 * @param encodingName the name as reported by the detector or the terminal.
 * @returns the mapped name, or the normalized name when no mapping exists.
 */
function toIconvLiteEncoding(encodingName: string): string {
	const normalizedEncodingName = encodingName.replace(/[^a-zA-Z0-9]/g, '').toLowerCase();

	// Only consult own entries of the table: a plain property access would also
	// find inherited members such as `constructor` and return a non-string value.
	if (Object.prototype.hasOwnProperty.call(JSCHARDET_TO_ICONV_ENCODINGS, normalizedEncodingName)) {
		return JSCHARDET_TO_ICONV_ENCODINGS[normalizedEncodingName] || normalizedEncodingName;
	}

	return normalizedEncodingName;
}

/**
 * Converts an encoding label as it is allowed in a settings file into its
 * canonical, hyphenated label.
 *
 * The settings labels don't match the canonical encoding labels specified by WHATWG
 * (https://encoding.spec.whatwg.org/#names-and-labels). Iconv-lite strips all
 * non-alphanumeric characters, but ripgrep doesn't, so for backcompat the
 * stripped labels are still accepted and translated here. Labels without a
 * translation are returned unchanged.
 */
export function toCanonicalName(enc: string): string {
	const canonicalNames: { [label: string]: string } = {
		'shiftjis': 'shift-jis',
		'utf16le': 'utf-16le',
		'utf16be': 'utf-16be',
		'big5hkscs': 'big5-hkscs',
		'eucjp': 'euc-jp',
		'euckr': 'euc-kr',
		'koi8r': 'koi8-r',
		'koi8u': 'koi8-u',
		'macroman': 'x-mac-roman',
		'utf8bom': 'utf8'
	};

	// own entries only, so that inherited object members are never returned
	if (Object.prototype.hasOwnProperty.call(canonicalNames, enc)) {
		return canonicalNames[enc];
	}

	// e.g. windows1252 => windows-1252
	const windowsMatch = enc.match(/windows(\d+)/);

	return windowsMatch ? 'windows-' + windowsMatch[1] : enc;
}

// Buffer sizes (in bytes) used by the detection logic in this file.

// number of bytes to look at to decide about a file being binary or not
const ZERO_BYTE_DETECTION_BUFFER_MAX_LEN = 512;
// when not auto guessing the encoding, small number of bytes are enough
const NO_GUESS_BUFFER_MAX_LEN = 512;
// with auto guessing we want a lot more content to be read for guessing
const AUTO_GUESS_BUFFER_MAX_LEN = 512 * 8;

/**
 * Result of `detectEncodingFromBuffer`.
 */
export interface IDetectedEncodingResult {
	/** The detected encoding identifier, or a falsy value when none was detected. */
	encoding: string;
	/**
	 * True when the inspected bytes contain a zero byte and the zero bytes do not
	 * match the pattern expected for UTF-16 LE or UTF-16 BE.
	 */
	seemsBinary: boolean;
}

/**
 * Options for encoding detection.
 * NOTE(review): not referenced anywhere in this file - confirm against callers.
 */
export interface DetectEncodingOption {
	/**
	 * Presumably mirrors the `autoGuessEncoding` parameter of
	 * `detectEncodingFromBuffer` - TODO confirm.
	 */
	autoGuessEncoding?: boolean;
}

/**
 * Detects the encoding of the bytes in a read result.
 *
 * Detection runs in this order:
 * 1. a byte order mark, see `detectEncodingByBOMFromBuffer`
 * 2. the position of zero bytes within the first
 *    `ZERO_BYTE_DETECTION_BUFFER_MAX_LEN` bytes, which yields UTF-16 LE, UTF-16 BE
 *    or a "seems binary" verdict
 * 3. when `autoGuessEncoding` is set and nothing was found so far, a guess via
 *    `guessEncodingByBuffer`
 *
 * A missing `buffer` is tolerated and yields no encoding and `seemsBinary: false`.
 *
 * @param readResult the buffer and the number of valid bytes in it.
 * @param autoGuessEncoding whether to fall back to guessing the encoding.
 * @returns the result directly, or a promise of it when the guessing step runs.
 */
export function detectEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: false): IDetectedEncodingResult;
export function detectEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IDetectedEncodingResult>;
export function detectEncodingFromBuffer({ buffer, bytesRead }: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IDetectedEncodingResult> | IDetectedEncodingResult {

	// Always first check for BOM to find out about encoding
	let encoding = detectEncodingByBOMFromBuffer(buffer, bytesRead);

	// Detect 0 bytes to see if file is binary or UTF-16 LE/BE
	// unless we already know that this file has a UTF-16 encoding
	// (nothing to scan when there is no buffer at all)
	let seemsBinary = false;
	if (encoding !== UTF16be && encoding !== UTF16le && buffer) {
		let couldBeUTF16LE = true; // e.g. 0xAA 0x00
		let couldBeUTF16BE = true; // e.g. 0x00 0xAA
		let containsZeroByte = false;

		// This is a simplified guess to detect UTF-16 BE or LE by just checking if
		// the first 512 bytes have the 0-byte at a specific location. For UTF-16 LE
		// this would be the odd byte index and for UTF-16 BE the even one.
		// Note: this can produce false positives (a binary file that uses a 2-byte
		// encoding of the same format as UTF-16) and false negatives (a UTF-16 file
		// that is using 4 bytes to encode a character).
		for (let i = 0; i < bytesRead && i < ZERO_BYTE_DETECTION_BUFFER_MAX_LEN; i++) {
			const isEndian = (i % 2 === 1); // assume 2-byte sequences typical for UTF-16
			const isZeroByte = (buffer.readInt8(i) === 0);

			if (isZeroByte) {
				containsZeroByte = true;
			}

			// UTF-16 LE: expect e.g. 0xAA 0x00
			if (couldBeUTF16LE && (isEndian && !isZeroByte || !isEndian && isZeroByte)) {
				couldBeUTF16LE = false;
			}

			// UTF-16 BE: expect e.g. 0x00 0xAA
			if (couldBeUTF16BE && (isEndian && isZeroByte || !isEndian && !isZeroByte)) {
				couldBeUTF16BE = false;
			}

			// Stop scanning if this is neither UTF16-LE nor UTF16-BE: the zero
			// byte that was found makes the content be treated as binary below
			if (isZeroByte && !couldBeUTF16LE && !couldBeUTF16BE) {
				break;
			}
		}

		// Handle case of 0-byte included
		if (containsZeroByte) {
			if (couldBeUTF16LE) {
				encoding = UTF16le;
			} else if (couldBeUTF16BE) {
				encoding = UTF16be;
			} else {
				seemsBinary = true;
			}
		}
	}

	// Auto guess encoding if configured (requires bytes to guess from)
	if (autoGuessEncoding && !seemsBinary && !encoding && buffer) {
		return guessEncodingByBuffer(buffer.slice(0, bytesRead)).then(guessedEncoding => {
			return {
				seemsBinary: false,
				encoding: guessedEncoding
			};
		});
	}

	return { seemsBinary, encoding };
}

// Maps Windows code page numbers to the encoding names returned by
// `resolveTerminalEncoding`, which searches the output of `chcp` for these keys.
// https://ss64.com/nt/chcp.html
const windowsTerminalEncodings = {
	'437': 'cp437', // United States
	'850': 'cp850', // Multilingual(Latin I)
	'852': 'cp852', // Slavic(Latin II)
	'855': 'cp855', // Cyrillic(Russian)
	'857': 'cp857', // Turkish
	'860': 'cp860', // Portuguese
	'861': 'cp861', // Icelandic
	'863': 'cp863', // Canadian - French
	'865': 'cp865', // Nordic
	'866': 'cp866', // Russian
	'869': 'cp869', // Modern Greek
	'936': 'cp936', // Simplified Chinese
	'1252': 'cp1252' // West European Latin
};

/**
 * Determines the encoding of the terminal.
 *
 * The raw encoding is taken from the first source that applies:
 * 1. the `VSCODE_CLI_ENCODING` environment variable
 * 2. on Linux and macOS, the output of `locale charmap`
 * 3. otherwise, the first key of `windowsTerminalEncodings` found in the
 *    output of `chcp`
 *
 * @param verbose when set, progress is written to the console.
 * @returns a promise of the iconv-lite encoding name. `UTF8` is used when no raw
 * encoding was found, when it is UTF-8 or when iconv-lite does not know it.
 */
export function resolveTerminalEncoding(verbose?: boolean): TPromise<string> {
	const log = (message: string): void => {
		if (verbose) {
			console.log(message);
		}
	};

	let rawEncodingPromise: TPromise<string>;

	// Support a global environment variable to win over other mechanics
	const envOverride = process.env['VSCODE_CLI_ENCODING'];
	if (envOverride) {
		log(`Found VSCODE_CLI_ENCODING variable: ${envOverride}`);

		rawEncodingPromise = TPromise.as(envOverride);
	}

	// Linux/Mac: use "locale charmap" command
	else if (isLinux || isMacintosh) {
		rawEncodingPromise = new TPromise<string>(complete => {
			log('Running "locale charmap" to detect terminal encoding...');

			exec('locale charmap', (err, stdout, stderr) => complete(stdout));
		});
	}

	// Windows: educated guess
	else {
		rawEncodingPromise = new TPromise<string>(complete => {
			log('Running "chcp" to detect terminal encoding...');

			exec('chcp', (err, stdout, stderr) => {
				if (stdout) {
					// the first known code page that occurs in the output wins
					for (const codePage of Object.keys(windowsTerminalEncodings)) {
						if (stdout.indexOf(codePage) !== -1) {
							return complete(windowsTerminalEncodings[codePage]);
						}
					}
				}

				return complete(void 0);
			});
		});
	}

	return rawEncodingPromise.then(rawEncoding => {
		log(`Detected raw terminal encoding: ${rawEncoding}`);

		if (!rawEncoding) {
			return UTF8;
		}

		const lowerCased = rawEncoding.toLowerCase();
		if (lowerCased === 'utf-8' || lowerCased === UTF8) {
			return UTF8;
		}

		const iconvEncoding = toIconvLiteEncoding(rawEncoding);
		if (!iconv.encodingExists(iconvEncoding)) {
			log('Unsupported terminal encoding, falling back to UTF-8.');

			return UTF8;
		}

		return iconvEncoding;
	});
}