Skip to content

Commit

Permalink
Improve small-document language prediction
Browse files Browse the repository at this point in the history
Closes #137104
  • Loading branch information
Jackson Kearl committed Feb 21, 2022
1 parent f2b407d commit 329edab
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 21 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "code-oss-dev",
"version": "1.65.0",
"distro": "c8556745c78d78e442ea9920b12967b8cf9c85d4",
"distro": "fd1f169ed85b1c177dbab136eff2c97fdc546f9f",
"author": {
"name": "Microsoft Corporation"
},
Expand Down
3 changes: 2 additions & 1 deletion src/bootstrap-window.js
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,15 @@
'@vscode/iconv-lite-umd': `${baseNodeModulesPath}/@vscode/iconv-lite-umd/lib/iconv-lite-umd.js`,
'jschardet': `${baseNodeModulesPath}/jschardet/dist/jschardet.min.js`,
'@vscode/vscode-languagedetection': `${baseNodeModulesPath}/@vscode/vscode-languagedetection/dist/lib/index.js`,
'vscode-regexp-languagedetection': `${baseNodeModulesPath}/vscode-regexp-languagedetection/dist/index.js`,
'tas-client-umd': `${baseNodeModulesPath}/tas-client-umd/lib/tas-client-umd.js`
};

// Allow to load built-in and other node.js modules via AMD
// which has a fallback to using node.js `require`
// (node.js enabled renderers only)
if (!safeProcess.sandboxed) {
loaderConfig.amdModulesPattern = /(^vs\/)|(^vscode-textmate$)|(^vscode-oniguruma$)|(^xterm$)|(^xterm-addon-search$)|(^xterm-addon-unicode11$)|(^xterm-addon-webgl$)|(^@vscode\/iconv-lite-umd$)|(^jschardet$)|(^@vscode\/vscode-languagedetection$)|(^tas-client-umd$)/;
loaderConfig.amdModulesPattern = /(^vs\/)|(^vscode-textmate$)|(^vscode-oniguruma$)|(^xterm$)|(^xterm-addon-search$)|(^xterm-addon-unicode11$)|(^xterm-addon-webgl$)|(^@vscode\/iconv-lite-umd$)|(^jschardet$)|(^@vscode\/vscode-languagedetection$)|(^vscode-regexp-languagedetection$)|(^tas-client-umd$)/;
}

// Signal before require.config()
Expand Down
9 changes: 9 additions & 0 deletions src/vs/workbench/browser/workbench.contribution.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ const registry = Registry.as<IConfigurationRegistry>(ConfigurationExtensions.Con
description: localize('workbench.editor.languageDetection', "Controls whether the language in a text editor is automatically detected unless the language has been explicitly set by the language picker. This can also be scoped by language so you can specify which languages you do not want to be switched off of. This is useful for languages like Markdown that often contain other languages that might trick language detection into thinking it's the embedded language and not Markdown."),
scope: ConfigurationScope.LANGUAGE_OVERRIDABLE
},
'workbench.editor.languageDetectionPreferredLanguages': {
type: 'array',
default: ['cpp', 'csharp', 'css', 'html', 'java', 'javascript', 'json', 'markdown', 'php', 'python', 'typescript', 'yaml',],
items: {
type: 'string',
enum: ['bat', 'c', 'coffeescript', 'cpp', 'csharp', 'css', 'go', 'html', 'java', 'javascript', 'json', 'lua', 'markdown', 'objective-c', 'perl', 'php', 'powershell', 'python', 'r', 'ruby', 'rust', 'scala', 'sh', 'sql', 'swift', 'typescript', 'yaml',],
},
description: localize('workbench.editor.languageDetectionPreferredLanguages', "Configures languages automatic language detection will prefer to select for a given document. This applies primarily to short documents where there is insufficient data to confidently predict a language."),
},
'workbench.editor.tabCloseButton': {
'type': 'string',
'enum': ['left', 'right', 'off'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import { IRequestHandler } from 'vs/base/common/worker/simpleWorker';
import { EditorSimpleWorker } from 'vs/editor/common/services/editorSimpleWorker';
import { IEditorWorkerHost } from 'vs/editor/common/services/editorWorkerHost';

/**
 * Shape expected of the regexp-based language detection module.
 */
type RegexpModel = {
	/**
	 * Takes the input text and a list of potential languages and returns
	 * a string, or `undefined` when nothing is detected.
	 */
	detect: (inp: string, potentialLangs: string[]) => string | undefined;
};

/**
* Called on the worker side
* @internal
Expand All @@ -26,14 +28,20 @@ export class LanguageDetectionSimpleWorker extends EditorSimpleWorker {
private static readonly positiveConfidenceCorrectionBucket2 = 0.025;
private static readonly negativeConfidenceCorrection = 0.5;

private _regexpModel: RegexpModel | undefined;
private _regexpLoadFailed: boolean = false;

private _modelOperations: ModelOperations | undefined;
private _loadFailed: boolean = false;

public async detectLanguage(uri: string): Promise<string | undefined> {
public async detectLanguage(uri: string, userPreferredLanguages: string[]): Promise<string | undefined> {
const languages: string[] = [];
const confidences: number[] = [];
const stopWatch = new StopWatch(true);
for await (const language of this.detectLanguagesImpl(uri)) {
const documentTextSample = this.getTextForDetection(uri);
if (!documentTextSample) { return; }

for await (const language of this.detectLanguagesImpl(documentTextSample)) {
languages.push(language.languageId);
confidences.push(language.confidence);
}
Expand All @@ -43,9 +51,55 @@ export class LanguageDetectionSimpleWorker extends EditorSimpleWorker {
this._host.fhr('sendTelemetryEvent', [languages, confidences, stopWatch.elapsed()]);
return languages[0];
}

const regexpDetection = await this.runRegexpModel(documentTextSample, userPreferredLanguages);
if (regexpDetection) {
return regexpDetection;
}

return undefined;
}

/**
 * Produces the text sample that language detection runs on.
 *
 * @param uri Identifier passed to `_getModel` to look up the editor model.
 * @returns The model's text from line 1, column 1 up to the position at
 * offset 10000, or `undefined` when no model is found for `uri`.
 */
private getTextForDetection(uri: string): string | undefined {
	const model = this._getModel(uri);
	if (model) {
		// Only the leading part of the document is sampled: everything up to offset 10000.
		const { lineNumber, column } = model.positionAt(10000);
		return model.getValueInRange({
			startLineNumber: 1,
			startColumn: 1,
			endLineNumber: lineNumber,
			endColumn: column
		});
	}
	return undefined;
}

/**
 * Lazily loads the regexp detection module and caches it on success.
 *
 * Once an import has failed, every later call resolves to `undefined`
 * without attempting the import again.
 *
 * @returns The loaded model, or `undefined` if loading failed.
 */
private async getRegexpModel(): Promise<RegexpModel | undefined> {
	if (this._regexpLoadFailed) {
		return undefined;
	}
	if (!this._regexpModel) {
		// The module URI is obtained through the host's 'getRegexpModelUri' request.
		const moduleUri: string = await this._host.fhr('getRegexpModelUri', []);
		try {
			this._regexpModel = await import(moduleUri) as RegexpModel;
		} catch (err) {
			this._regexpLoadFailed = true;
			console.warn('error loading language detection model', err);
			return undefined;
		}
	}
	return this._regexpModel;
}

/**
 * Runs the regexp model over `content`, forwarding the user's preferred languages.
 *
 * @param content Text to run detection on.
 * @param userPreferredLanguages Languages handed to the model's `detect` call.
 * @returns Whatever the model's `detect` returns, or `undefined` when the model is unavailable.
 */
private async runRegexpModel(content: string, userPreferredLanguages: string[]): Promise<string | undefined> {
	const model = await this.getRegexpModel();
	return model ? model.detect(content, userPreferredLanguages) : undefined;
}

private async getModelOperations(): Promise<ModelOperations> {
if (this._modelOperations) {
return this._modelOperations;
Expand Down Expand Up @@ -127,7 +181,7 @@ export class LanguageDetectionSimpleWorker extends EditorSimpleWorker {
return modelResult;
}

private async * detectLanguagesImpl(uri: string): AsyncGenerator<ModelResult, void, unknown> {
private async * detectLanguagesImpl(content: string): AsyncGenerator<ModelResult, void, unknown> {
if (this._loadFailed) {
return;
}
Expand All @@ -141,20 +195,8 @@ export class LanguageDetectionSimpleWorker extends EditorSimpleWorker {
return;
}

const model = this._getModel(uri);
if (!model) {
return;
}

let modelResults: ModelResult[] | undefined;
// Grab the first 10000 characters
const end = model.positionAt(10000);
const content = model.getValueInRange({
startColumn: 1,
startLineNumber: 1,
endColumn: end.column,
endLineNumber: end.lineNumber
});

try {
modelResults = await modelOperations.runModel(content);
} catch (e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ import { ITelemetryService } from 'vs/platform/telemetry/common/telemetry';
import { EditorWorkerClient, EditorWorkerHost } from 'vs/editor/browser/services/editorWorkerService';
import { ILanguageConfigurationService } from 'vs/editor/common/languages/languageConfigurationRegistry';

const regexpModuleLocation = '../../../../../../node_modules/vscode-regexp-languagedetection';
const regexpModuleLocationAsar = '../../../../../../node_modules.asar/vscode-regexp-languagedetection';
const moduleLocation = '../../../../../../node_modules/@vscode/vscode-languagedetection';
const moduleLocationAsar = '../../../../../../node_modules.asar/@vscode/vscode-languagedetection';
export class LanguageDetectionService extends Disposable implements ILanguageDetectionService {
static readonly enablementSettingKey = 'workbench.editor.languageDetection';
static readonly preferredLanguagesConfig = 'workbench.editor.languageDetectionPreferredLanguages';

_serviceBrand: undefined;

Expand Down Expand Up @@ -51,6 +54,9 @@ export class LanguageDetectionService extends Disposable implements ILanguageDet
this._environmentService.isBuilt && !isWeb
? FileAccess.asBrowserUri(`${moduleLocationAsar}/model/group1-shard1of1.bin`, require).toString(true)
: FileAccess.asBrowserUri(`${moduleLocation}/model/group1-shard1of1.bin`, require).toString(true),
this._environmentService.isBuilt && !isWeb
? FileAccess.asBrowserUri(`${regexpModuleLocationAsar}/dist/index.js`, require).toString(true)
: FileAccess.asBrowserUri(`${regexpModuleLocation}/dist/index.js`, require).toString(true),
languageConfigurationService
);
}
Expand All @@ -63,11 +69,16 @@ export class LanguageDetectionService extends Disposable implements ILanguageDet
if (!language) {
return undefined;
}
if (this._languageService.isRegisteredLanguageId(language)) {
return language;
}
return this._languageService.guessLanguageIdByFilepathOrFirstLine(URI.file(`file.${language}`)) ?? undefined;
}

async detectLanguage(resource: URI): Promise<string | undefined> {
const language = await this._languageDetectionWorkerClient.detectLanguage(resource);
// in ~~the future~~ this should read from recently opened editors, installed extensions, workspace files, etc. For now, just a config.
const preferredLanguages = this._configurationService.getValue<string[]>(LanguageDetectionService.preferredLanguagesConfig) ?? [];
const language = await this._languageDetectionWorkerClient.detectLanguage(resource, preferredLanguages);
if (language) {
return this.getLanguageId(language);
}
Expand Down Expand Up @@ -126,6 +137,7 @@ export class LanguageDetectionWorkerClient extends EditorWorkerClient {
private readonly _indexJsUri: string,
private readonly _modelJsonUri: string,
private readonly _weightsUri: string,
private readonly _regexpModelUri: string,
languageConfigurationService: ILanguageConfigurationService,
) {
super(modelService, true, 'languageDetectionWorkerService', languageConfigurationService);
Expand Down Expand Up @@ -160,6 +172,8 @@ export class LanguageDetectionWorkerClient extends EditorWorkerClient {
return this.getModelJsonUri();
case 'getWeightsUri':
return this.getWeightsUri();
case 'getRegexpModelUri':
return this.getRegexpModelUri();
case 'sendTelemetryEvent':
return this.sendTelemetryEvent(args[0], args[1], args[2]);
default:
Expand All @@ -179,6 +193,10 @@ export class LanguageDetectionWorkerClient extends EditorWorkerClient {
return this._weightsUri;
}

/** Resolves to the regexp model URI held in `_regexpModelUri`. */
async getRegexpModelUri(): Promise<string> {
	const { _regexpModelUri: regexpModelUri } = this;
	return regexpModelUri;
}

async sendTelemetryEvent(languages: string[], confidences: number[], timeSpent: number): Promise<void> {
this._telemetryService.publicLog2<ILanguageDetectionStats, LanguageDetectionStatsClassification>(LanguageDetectionStatsId, {
languages: languages.join(','),
Expand All @@ -187,9 +205,9 @@ export class LanguageDetectionWorkerClient extends EditorWorkerClient {
});
}

public async detectLanguage(resource: URI): Promise<string | undefined> {
public async detectLanguage(resource: URI, userPreferredLanguages: string[]): Promise<string | undefined> {
await this._withSyncedResources([resource]);
return (await this._getProxy()).detectLanguage(resource.toString());
return (await this._getProxy()).detectLanguage(resource.toString(), userPreferredLanguages);
}
}

Expand Down

1 comment on commit 329edab

@JacksonKearl
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @TylerLeonhardt (if you have any concerns happy to go through PR process, sorry this is a bit rushed what with last day of dev and vacation and all...)

Please sign in to comment.