Skip to content

Commit

Permalink
Merge pull request #2352 from snyk/feat/the-unhashening
Browse files Browse the repository at this point in the history
feat: don't hash emails in monitor command
  • Loading branch information
JackuB authored Nov 15, 2021
2 parents 2d0a8ea + 2a988e2 commit cc6b97b
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 147 deletions.
10 changes: 9 additions & 1 deletion src/cli/commands/test/iac-local-execution/file-utils.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
import * as fs from 'fs';
import * as tar from 'tar';
import * as path from 'path';
import * as crypto from 'crypto';
import {
FailedToInitLocalCacheError,
LOCAL_POLICY_ENGINE_DIR,
} from './local-cache';
import { CUSTOM_RULES_TARBALL } from './oci-pull';
import { hashData } from '../../../../lib/monitor/dev-count-analysis';

function hashData(s: string): string {
const hashedData = crypto
.createHash('sha1')
.update(s)
.digest('hex');
return hashedData;
}

export function createIacDir(): void {
// this path will be able to be customised by the user in the future
Expand Down
63 changes: 18 additions & 45 deletions src/lib/monitor/dev-count-analysis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
* "Contributing" is defined as having contributed a commit in the last 90 days.
* This is use only on the `snyk monitor` command as that is used to monitor a project's dependencies in an
* on-going manner.
* It collects only a hash of the email of a git user and the most recent commit timestamp (both per the `git log`
* It collects the email of a git user and the most recent commit timestamp (both per the `git log`
* output) and can be disabled by config (see https://snyk.io/policies/tracking-and-analytics/).
*/
import * as crypto from 'crypto';
import { exec } from 'child_process';
import { Contributor } from '../types';

Expand Down Expand Up @@ -37,16 +36,12 @@ export async function getContributors(
}

export class GitCommitInfo {
authorHashedEmail: string;
authorEmail: string;
commitTimestamp: string; // use ISO 8601 format

constructor(authorHashedEmail: string, commitTimestamp: string) {
if (isSha1Hash(authorHashedEmail)) {
this.authorHashedEmail = authorHashedEmail;
this.commitTimestamp = commitTimestamp;
} else {
throw new Error('authorHashedEmail must be a sha1 hash');
}
constructor(authorEmail: string, commitTimestamp: string) {
this.authorEmail = authorEmail;
this.commitTimestamp = commitTimestamp;
}
}

Expand All @@ -61,40 +56,38 @@ export class GitRepoCommitStats {
return new GitRepoCommitStats([]);
}

public addCommitInfo(info: GitCommitInfo) {
public addCommitInfo(info: GitCommitInfo): void {
this.commitInfos.push(info);
}

public getUniqueAuthorsCount(): number {
const uniqueAuthorHashedEmails = this.getUniqueAuthorHashedEmails();
return uniqueAuthorHashedEmails.size;
const uniqueAuthorEmails = this.getUniqueAuthorEmails();
return uniqueAuthorEmails.size;
}

public getCommitsCount(): number {
return this.commitInfos.length;
}

public getUniqueAuthorHashedEmails(): Set<string> {
public getUniqueAuthorEmails(): Set<string> {
const allCommitAuthorHashedEmails: string[] = this.commitInfos.map(
(c) => c.authorHashedEmail,
(c) => c.authorEmail,
);
const uniqueAuthorHashedEmails: Set<string> = new Set(
const uniqueAuthorEmails: Set<string> = new Set(
allCommitAuthorHashedEmails,
);
return uniqueAuthorHashedEmails;
return uniqueAuthorEmails;
}

public getRepoContributors(): Contributor[] {
const uniqueAuthorHashedEmails = this.getUniqueAuthorHashedEmails();
const uniqueAuthorEmails = this.getUniqueAuthorEmails();
const contributors: Contributor[] = [];

// for each uniqueAuthorHashedEmails, get the latest commit
for (const nextUniqueAuthorHashedEmail of uniqueAuthorHashedEmails) {
for (const nextUniqueAuthorEmail of uniqueAuthorEmails) {
const latestCommitTimestamp = this.getMostRecentCommitTimestamp(
nextUniqueAuthorHashedEmail,
nextUniqueAuthorEmail,
);
contributors.push({
userId: nextUniqueAuthorHashedEmail,
email: nextUniqueAuthorEmail,
lastCommitDate: latestCommitTimestamp,
});
}
Expand All @@ -103,7 +96,7 @@ export class GitRepoCommitStats {

public getMostRecentCommitTimestamp(authorHashedEmail: string): string {
for (const nextGI of this.commitInfos) {
if (nextGI.authorHashedEmail === authorHashedEmail) {
if (nextGI.authorEmail === authorHashedEmail) {
return nextGI.commitTimestamp;
}
}
Expand All @@ -115,8 +108,7 @@ export function parseGitLogLine(logLine: string): GitCommitInfo {
const lineComponents = logLine.split(SERIOUS_DELIMITER);
const authorEmail = lineComponents[2];
const commitTimestamp = lineComponents[3];
const hashedAuthorEmail = hashData(authorEmail);
const commitInfo = new GitCommitInfo(hashedAuthorEmail, commitTimestamp);
const commitInfo = new GitCommitInfo(authorEmail, commitTimestamp);
return commitInfo;
}

Expand All @@ -130,25 +122,6 @@ export function parseGitLog(gitLog: string): GitRepoCommitStats {
return stats;
}

export function hashData(s: string): string {
const hashedData = crypto
.createHash('sha1')
.update(s)
.digest('hex');
return hashedData;
}

export function isSha1Hash(data: string): boolean {
// sha1 hash must be exactly 40 characters of 0-9 / a-f (i.e. lowercase hex characters)
// ^ == start anchor
// [0-9a-f] == characters 0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f only
// {40} 40 of the [0-9a-f] characters
// $ == end anchor
const matchRegex = new RegExp('^[0-9a-f]{40}$');
const looksHashed = matchRegex.test(data);
return looksHashed;
}

/**
* @returns time stamp in seconds-since-epoch of 90 days ago since 90 days is the "contributing devs" timeframe
*/
Expand Down
2 changes: 1 addition & 1 deletion src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export interface ProtectOptions {
}

export interface Contributor {
userId: string;
email: string;
lastCommitDate: string;
}

Expand Down
31 changes: 13 additions & 18 deletions test/dev-count-analysis.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,17 @@ import {
SERIOUS_DELIMITER,
MAX_COMMITS_IN_GIT_LOG,
separateLines,
hashData,
} from '../src/lib/monitor/dev-count-analysis';

const testTimeout = 60000;

const TIMESTAMP_TO_TEST = 1590174610000;

describe('cli dev count via git log analysis', () => {
let expectedContributorUserIds: string[] = [];
let expectedMergeOnlyUserIds: string[] = [];
let expectedContributoremails: string[] = [];
let expectedMergeOnlyemails: string[] = [];

// this computes the expectedContributorUserIds and expectedMergeOnlyUserIds
// this computes the expectedContributoremails and expectedMergeOnlyemails
beforeAll(async () => {
const timestampEpochSecondsEndOfPeriod = Math.floor(
TIMESTAMP_TO_TEST / 1000,
Expand Down Expand Up @@ -53,12 +52,8 @@ describe('cli dev count via git log analysis', () => {
}
}

expectedContributorUserIds = uniqueEmailsContainingAtLeastOneNonMergeCommit.map(
hashData,
);
expectedMergeOnlyUserIds = uniqueEmailsContainingOnlyMergeCommits.map(
hashData,
);
expectedContributoremails = uniqueEmailsContainingAtLeastOneNonMergeCommit;
expectedMergeOnlyemails = uniqueEmailsContainingOnlyMergeCommits;
}, testTimeout);

it(
Expand All @@ -69,9 +64,9 @@ describe('cli dev count via git log analysis', () => {
periodDays: 10,
repoPath: process.cwd(),
});
const contributorUserIds = contributors.map((c) => c.userId);
expect(contributorUserIds.sort()).toEqual(
expectedContributorUserIds.sort(),
const contributoremails = contributors.map((c) => c.email);
expect(contributoremails.sort()).toEqual(
expectedContributoremails.sort(),
);
},
testTimeout,
Expand All @@ -85,13 +80,13 @@ describe('cli dev count via git log analysis', () => {
periodDays: 10,
repoPath: process.cwd(),
});
const contributorUserIds = contributors.map((c) => c.userId);
const contributoremails = contributors.map((c) => c.email);

// make sure none of uniqueEmailsContainingOnlyMergeCommits are in contributorUserIds
const legitUserIdsWhichAreAlsoInMergeOnlyUserIds = expectedMergeOnlyUserIds.filter(
(user) => contributorUserIds.includes(user),
// make sure none of uniqueEmailsContainingOnlyMergeCommits are in contributoremails
const legitemailsWhichAreAlsoInMergeOnlyemails = expectedMergeOnlyemails.filter(
(user) => contributoremails.includes(user),
);
expect(legitUserIdsWhichAreAlsoInMergeOnlyUserIds).toHaveLength(0);
expect(legitemailsWhichAreAlsoInMergeOnlyemails).toHaveLength(0);
},
testTimeout,
);
Expand Down
101 changes: 19 additions & 82 deletions test/dev-count-analysis.test.ts
Original file line number Diff line number Diff line change
@@ -1,66 +1,22 @@
import { test } from 'tap';
const osName = require('os-name');

const isWindows =
osName()
.toLowerCase()
.indexOf('windows') === 0;

import {
GitCommitInfo,
parseGitLog,
GitRepoCommitStats,
parseGitLogLine,
isSha1Hash,
hashData,
getTimestampStartOfContributingDevTimeframe,
separateLines,
execShell,
ShellOutError,
runGitLog,
} from '../src/lib/monitor/dev-count-analysis';

const expectedEmailHashes = {
'someemail-1@somedomain.com': '069598f5bf317927731aecc6648bd521f6a12c92',
'someemail-2@somedomain.com': '15726593ee5b5182412ca858e2472477f4ce9f30',
};

test('hashData works', (t) => {
t.plan(2);

const email1 = 'someemail-1@somedomain.com';
const hashedEmail1 = hashData(email1);
t.equal(hashedEmail1, expectedEmailHashes['someemail-1@somedomain.com']);

const email2 = 'someemail-2@somedomain.com';
const hashedEmail2 = hashData(email2);
t.equal(hashedEmail2, expectedEmailHashes['someemail-2@somedomain.com']);
});

test('isSha1Hash works', (t) => {
t.plan(6);
t.ok(isSha1Hash('069598f5bf317927731aecc6648bd521f6a12c92'));
t.ok(isSha1Hash('0123456789abcdef0123456789abcdef01234567')); // all the possible hex characters
t.notOk(isSha1Hash('abcdefghijklmnopqrstuvwxyz01234567890')); // contains letters which are not hex characters
t.notOk(isSha1Hash('0123456789abcdef0123456789abcdef01234567a')); // more than 40 characters
t.notOk(isSha1Hash('0123456789abcdef0123456789abcdef0123456')); // less than 40 characters
t.notOk(isSha1Hash('someemail-1@somedomain.com')); // obviously an email
});
const osName = require('os-name');

test('you cannot create a new GitCommitInfo if the email is not hashed', (t) => {
t.plan(1);
try {
new GitCommitInfo(
'someemail-1@somedomain.com',
'2020-02-06T11:43:11+00:00',
);
t.fail(
'GitCommitInfo constructor should throw exception if you try to pass a non-hahsed email',
);
} catch (err) {
t.pass();
}
});
const isWindows =
osName()
.toLowerCase()
.indexOf('windows') === 0;

test('can calculate start of contributing developer period', (t) => {
t.plan(1);
Expand All @@ -76,10 +32,7 @@ test('can parse a git log line', (t) => {
const line =
'0bd4d3c394a54ba54f6c44705ac73d7d87b39525_SNYK_SEPARATOR_some-user_SNYK_SEPARATOR_someemail-1@somedomain.com_SNYK_SEPARATOR_2020-02-06T11:43:11+00:00';
const commitInfo: GitCommitInfo = parseGitLogLine(line);
t.equal(
commitInfo.authorHashedEmail,
expectedEmailHashes['someemail-1@somedomain.com'],
);
t.equal(commitInfo.authorEmail, 'someemail-1@somedomain.com');
});

test('can handle an empty git log', (t) => {
Expand Down Expand Up @@ -157,68 +110,52 @@ test('runGitLog returns empty string and does not throw error when git log comma
});

function validateGitParsing(gitLog: string, t) {
t.plan(17);
t.plan(13);
const stats: GitRepoCommitStats = parseGitLog(gitLog);

t.equal(stats.getCommitsCount(), 3);
t.equal(stats.getUniqueAuthorsCount(), 2);

const uniqueAuthors: Set<string> = stats.getUniqueAuthorHashedEmails();
const uniqueAuthors: Set<string> = stats.getUniqueAuthorEmails();
t.equal(uniqueAuthors.size, 2);

t.notOk(uniqueAuthors.has('someemail-1@somedomain.com'));
t.notOk(uniqueAuthors.has('someemail-2@somedomain.com'));
t.ok(uniqueAuthors.has(expectedEmailHashes['someemail-1@somedomain.com']));
t.ok(uniqueAuthors.has(expectedEmailHashes['someemail-2@somedomain.com']));
t.ok(uniqueAuthors.has('someemail-1@somedomain.com'));
t.ok(uniqueAuthors.has('someemail-2@somedomain.com'));

const mostRecentCommitTimestampSomeEmail1 = stats.getMostRecentCommitTimestamp(
expectedEmailHashes['someemail-1@somedomain.com'],
'someemail-1@somedomain.com',
);
t.equal(mostRecentCommitTimestampSomeEmail1, '2020-02-06T11:43:11+00:00');
const mostRecentCommitTimestampSomeEmail2 = stats.getMostRecentCommitTimestamp(
expectedEmailHashes['someemail-2@somedomain.com'],
'someemail-2@somedomain.com',
);
t.equal(mostRecentCommitTimestampSomeEmail2, '2020-02-02T23:31:13+02:00');
t.equal(stats.getMostRecentCommitTimestamp('missing-email'), '');

const contributors: {
userId: string;
email: string;
lastCommitDate: string;
}[] = stats.getRepoContributors();
t.equal(contributors.length, 2);

t.notOk(
contributors.map((c) => c.userId).includes('someemail-1@somedomain.com'),
);
t.notOk(
contributors.map((c) => c.userId).includes('someemail-2@somedomain.com'),
);
t.ok(
contributors
.map((c) => c.userId)
.includes(expectedEmailHashes['someemail-1@somedomain.com']),
);
t.ok(
contributors
.map((c) => c.userId)
.includes(expectedEmailHashes['someemail-2@somedomain.com']),
);
t.ok(contributors.map((c) => c.email).includes('someemail-1@somedomain.com'));
t.ok(contributors.map((c) => c.email).includes('someemail-2@somedomain.com'));

const getTimestampById = (userId: string): string => {
const getTimestampById = (email: string): string => {
for (const c of contributors) {
if (c.userId === userId) {
if (c.email === email) {
return c.lastCommitDate;
}
}
throw new Error('contributor not found');
};

t.equal(
getTimestampById(expectedEmailHashes['someemail-1@somedomain.com']),
getTimestampById('someemail-1@somedomain.com'),
'2020-02-06T11:43:11+00:00',
);
t.equal(
getTimestampById(expectedEmailHashes['someemail-2@somedomain.com']),
getTimestampById('someemail-2@somedomain.com'),
'2020-02-02T23:31:13+02:00',
);
}
Expand Down

0 comments on commit cc6b97b

Please sign in to comment.