Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash imports instead of using a known list #4860

Merged
merged 7 commits into from
Mar 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/3 Code Health/4852.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hash imports so we can get every import and not just known ones. We have to unhash them on the other side to determine if a specific package or not.
10 changes: 4 additions & 6 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2240,6 +2240,7 @@
"fuzzy": "^0.1.3",
"get-port": "^3.2.0",
"glob": "^7.1.2",
"hash.js": "^1.1.7",
"iconv-lite": "^0.4.21",
"inversify": "^4.11.1",
"line-by-line": "^0.1.6",
Expand Down
32 changes: 0 additions & 32 deletions src/client/telemetry/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,35 +74,3 @@ export enum PlatformErrors {
FailedToParseVersion = 'FailedToParseVersion',
FailedToDetermineOS = 'FailedToDetermineOS'
}

export enum KnownImports {
// Don't change the order of these as they are the value returned in telemetry and changing
// the order will break old telemetry data.
//
// This list was generated from here: https://activewizards.com/blog/top-20-python-libraries-for-data-science-in-2018/
pandas,
numpy,
matlplotlib,
scipy,
sklearn,
statsmodels,
seaborn,
plotly,
bokeh,
pydot,
xgboost,
lightgbm,
catboost,
eli5,
tensorflow,
pytorch,
keras,
distkeras,
elephas,
pyspark,
nltk,
spacy,
gensim,
scrapy,
sparkdl
}
83 changes: 37 additions & 46 deletions src/client/telemetry/importTracker.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

'use strict';
import '../common/extensions';

import { inject, injectable } from 'inversify';
import * as path from 'path';
import { TextDocument } from 'vscode';

import { sendTelemetryEvent } from '.';
import { sleep } from '../../test/core';
import { noop, sleep } from '../../test/core';
import { IDocumentManager } from '../common/application/types';
import '../common/extensions';
import { IHistoryProvider } from '../datascience/types';
import { ICodeExecutionManager } from '../terminals/types';
import { EventName, KnownImports } from './constants';
import { EventName } from './constants';
import { IImportTracker } from './types';

const ImportRegEx = /^(?!['"#]).*from\s+([a-zA-Z0-9_\.]+)\s+import.*(?!['"])|^(?!['"#]).*import\s+([a-zA-Z0-9_\., ]+).*(?!['"])/;
Expand All @@ -22,24 +21,15 @@ const MAX_DOCUMENT_LINES = 1000;
@injectable()
export class ImportTracker implements IImportTracker {

private knownImportsMatch: RegExp;
private sentMatches: Set<string> = new Set<string>();
// tslint:disable-next-line:no-require-imports
private hashFn = require('hash.js').sha256;

constructor(
@inject(IDocumentManager) private documentManager: IDocumentManager,
@inject(IHistoryProvider) private historyProvider: IHistoryProvider,
@inject(ICodeExecutionManager) private executionManager: ICodeExecutionManager
) {
// Construct a regex that will match known imports
let matchString = '';
Object.keys(KnownImports).forEach(k => {
const val = KnownImports[k];
if (typeof val === 'string') {
matchString += `${val}|`;
}
});
this.knownImportsMatch = new RegExp(matchString.slice(0, matchString.length - 1), 'g');

// Sign up for document open/save events so we can track known imports
this.documentManager.onDidOpenTextDocument((t) => this.onOpenedOrSavedDocument(t));
this.documentManager.onDidSaveTextDocument((t) => this.onOpenedOrSavedDocument(t));
Expand All @@ -53,7 +43,7 @@ export class ImportTracker implements IImportTracker {
this.executionManager.onExecutedCode(c => this.onExecutedCode(c));
}

public async activate() : Promise<void> {
public async activate(): Promise<void> {
// Act like all of our open documents just opened. Don't do this now though. We don't want
// to hold up the activate.
await sleep(1000);
Expand All @@ -62,8 +52,7 @@ export class ImportTracker implements IImportTracker {

private onOpenedOrSavedDocument(document: TextDocument) {
// Make sure this is a python file.
if (path.extname(document.fileName) === '.py')
{
if (path.extname(document.fileName) === '.py') {
// Parse the contents of the document, looking for import matches on each line
const lines = document.getText().splitLines({ trim: true, removeEmptyEntries: true });
this.lookForImports(lines.slice(0, Math.min(lines.length, MAX_DOCUMENT_LINES)), EventName.KNOWN_IMPORT_FROM_FILE);
Expand All @@ -76,38 +65,40 @@ export class ImportTracker implements IImportTracker {
}

private lookForImports(lines: string[], eventName: string) {
// Use a regex to parse each line, looking for imports
const matches: Set<string> = new Set<string>();
for (const s of lines) {
const match = ImportRegEx.exec(s);
if (match && match.length > 2) {
// Could be a from or a straight import. from is the first entry.
const actual = match[1] ? match[1] : match[2];

// See if this matches any known imports
let knownMatch = this.knownImportsMatch.exec(actual);
while (knownMatch) {
knownMatch.forEach(val => {
// Skip if already sent this telemetry
if (!this.sentMatches.has(val)) {
matches.add(val);
try {
// Use a regex to parse each line, looking for imports
const matches: Set<string> = new Set<string>();
for (const s of lines) {
const match = ImportRegEx.exec(s);
if (match && match.length > 2) {
// Could be a from or a straight import. from is the first entry.
const actual = match[1] ? match[1] : match[2];

// Use just the bits to the left of ' as '
const left = actual.split(' as ')[0];

// Now split this based on, and chop off all .
const baseNames = left.split(',').map(l => l.split('.')[0].trim());
baseNames.forEach(l => {
// Hash this value and save this in our import
const hash = this.hashFn().update(l).digest('hex');
if (!this.sentMatches.has(hash)) {
matches.add(hash);
}
});
knownMatch = this.knownImportsMatch.exec(actual);
}

// Reset search.
this.knownImportsMatch.lastIndex = 0;
}
}

// For each unique match, emit a new telemetry event.
matches.forEach(s => {
sendTelemetryEvent(
eventName === EventName.KNOWN_IMPORT_FROM_FILE ? EventName.KNOWN_IMPORT_FROM_FILE : EventName.KNOWN_IMPORT_FROM_EXECUTION,
0,
{ import: s });
this.sentMatches.add(s);
});
// For each unique match, emit a new telemetry event.
matches.forEach(s => {
sendTelemetryEvent(
eventName === EventName.KNOWN_IMPORT_FROM_FILE ? EventName.KNOWN_IMPORT_FROM_FILE : EventName.KNOWN_IMPORT_FROM_EXECUTION,
0,
{ import: s });
this.sentMatches.add(s);
});
} catch {
noop();
}
}
}
36 changes: 23 additions & 13 deletions src/test/telemetry/importTracker.unit.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
'use strict';

//tslint:disable:max-func-body-length match-default-export-name no-any no-multiline-string no-trailing-whitespace
import { expect } from 'chai';
import rewiremock from 'rewiremock';
Expand All @@ -10,14 +9,16 @@ import { EventEmitter, TextDocument } from 'vscode';

import { IDocumentManager } from '../../client/common/application/types';
import { IHistoryProvider } from '../../client/datascience/types';
import { EventName } from '../../client/telemetry/constants';
import { EventName } from '../../client/telemetry/constants';
import { ImportTracker } from '../../client/telemetry/importTracker';
import { ICodeExecutionManager } from '../../client/terminals/types';
import { createDocument } from '../datascience/editor-integration/helpers';

suite('Import Tracker', () => {
const oldValueOfVSC_PYTHON_UNIT_TEST = process.env.VSC_PYTHON_UNIT_TEST;
const oldValueOfVSC_PYTHON_CI_TEST = process.env.VSC_PYTHON_CI_TEST;
// tslint:disable-next-line:no-require-imports
const hashJs = require('hash.js');
let importTracker: ImportTracker;
let documentManager: TypeMoq.IMock<IDocumentManager>;
let historyProvider: TypeMoq.IMock<IHistoryProvider>;
Expand All @@ -26,6 +27,15 @@ suite('Import Tracker', () => {
let savedEventEmitter: EventEmitter<TextDocument>;
let historyEventEmitter: EventEmitter<string>;
let codeExecutionEmitter: EventEmitter<string>;
const pandasHash = hashJs.sha256().update('pandas').digest('hex');
const elephasHash = hashJs.sha256().update('elephas').digest('hex');
const kerasHash = hashJs.sha256().update('keras').digest('hex');
const pysparkHash = hashJs.sha256().update('pyspark').digest('hex');
const sparkdlHash = hashJs.sha256().update('sparkdl').digest('hex');
const numpyHash = hashJs.sha256().update('numpy').digest('hex');
const scipyHash = hashJs.sha256().update('scipy').digest('hex');
const sklearnHash = hashJs.sha256().update('sklearn').digest('hex');
const randomHash = hashJs.sha256().update('random').digest('hex');

class Reporter {
public static eventNames: string[] = [];
Expand Down Expand Up @@ -79,7 +89,7 @@ suite('Import Tracker', () => {
emitDocEvent('import pandas\r\n', openedEventEmitter);

expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]);
});

test('Already opened documents', async () => {
Expand All @@ -88,27 +98,27 @@ suite('Import Tracker', () => {
await importTracker.activate();

expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]);
});

test('Save document', () => {
emitDocEvent('import pandas\r\n', savedEventEmitter);

expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_FILE]);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]);
});

test('Execute', () => {
historyEventEmitter.fire('import pandas\r\n');

expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_EXECUTION]);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]);

codeExecutionEmitter.fire('import pandas\r\n');

// Should not emit another event.
expect(Reporter.eventNames).to.deep.equal([EventName.KNOWN_IMPORT_FROM_EXECUTION]);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }]);
});

test('elephas', () => {
Expand All @@ -135,7 +145,7 @@ suite('Import Tracker', () => {
model.set_weights(weights)`;

historyEventEmitter.fire(elephas);
expect(Reporter.properties).to.deep.equal([{ import: 'elephas' }, { import: 'keras' }]);
expect(Reporter.properties).to.deep.equal([{ import: elephasHash }, { import: kerasHash }]);
});

test('pyspark', () => {
Expand All @@ -157,7 +167,7 @@ suite('Import Tracker', () => {
print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))`;

historyEventEmitter.fire(pyspark);
expect(Reporter.properties).to.deep.equal([{ import: 'pyspark' }, { import: 'sparkdl' }]);
expect(Reporter.properties).to.deep.equal([{ import: pysparkHash }, { import: sparkdlHash }]);
});

test('numpy', () => {
Expand All @@ -173,7 +183,7 @@ def simplify_ages(df):
df.Age = categories
return df`;
historyEventEmitter.fire(code);
expect(Reporter.properties).to.deep.equal([{ import: 'pandas' }, { import: 'numpy' }]);
expect(Reporter.properties).to.deep.equal([{ import: pandasHash }, { import: numpyHash }, { import: randomHash }]);
});

test('scipy', () => {
Expand All @@ -187,7 +197,7 @@ x = np.array([r * np.cos(theta) for r in radius])
y = np.array([r * np.sin(theta) for r in radius])
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
historyEventEmitter.fire(code);
expect(Reporter.properties).to.deep.equal([{ import: 'scipy' }]);
expect(Reporter.properties).to.deep.equal([{ import: scipyHash }]);
});

test('function', () => {
Expand All @@ -201,7 +211,7 @@ x = np.array([r * np.cos(theta) for r in radius])
y = np.array([r * np.sin(theta) for r in radius])
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
historyEventEmitter.fire(code);
expect(Reporter.properties).to.deep.equal([{ import: 'sklearn' }]);
expect(Reporter.properties).to.deep.equal([{ import: sklearnHash }]);
});

test('Comma separated', () => {
Expand All @@ -215,7 +225,7 @@ x = np.array([r * np.cos(theta) for r in radius])
y = np.array([r * np.sin(theta) for r in radius])
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
historyEventEmitter.fire(code);
expect(Reporter.properties).to.deep.equal([{ import: 'sklearn' }, { import: 'pandas' }]);
expect(Reporter.properties).to.deep.equal([{ import: sklearnHash }, { import: pandasHash }]);
});

// That's probably enough different variants of code to verify nothing is wonky.
Expand Down