Skip to content

Commit

Permalink
Merge pull request microsoft#7 from Microsoft/master
Browse files Browse the repository at this point in the history
pull latest code
  • Loading branch information
chicm-ms authored Nov 29, 2018
2 parents 76d7142 + cf3d434 commit bc10bf7
Show file tree
Hide file tree
Showing 16 changed files with 184 additions and 65 deletions.
2 changes: 1 addition & 1 deletion src/nni_manager/common/datastore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ abstract class Database {
public abstract queryExperimentProfile(experimentId: string, revision?: number): Promise<ExperimentProfile[]>;
public abstract queryLatestExperimentProfile(experimentId: string): Promise<ExperimentProfile>;
public abstract storeTrialJobEvent(
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void>;
event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void>;
public abstract queryTrialJobEvent(trialJobId?: string, event?: TrialJobEvent): Promise<TrialJobEventRecord[]>;
public abstract storeMetricData(trialJobId: string, data: string): Promise<void>;
public abstract queryMetricData(trialJobId?: string, type?: MetricType): Promise<MetricDataRecord[]>;
Expand Down
39 changes: 37 additions & 2 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import * as assert from 'assert';
import { randomBytes } from 'crypto';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
Expand All @@ -32,6 +33,7 @@ import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log';

function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
Expand Down Expand Up @@ -287,5 +289,38 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus {
return isEarlyStopped ? 'EARLY_STOPPED' : 'USER_CANCELED';
}

export {getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus,
getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
/**
 * Count the regular files under a directory, recursively.
 *
 * The count is obtained by running `find <directory> -type f | wc -l` through a shell and is
 * raced against a timer, so a very large directory tree cannot block the caller indefinitely.
 *
 * @param directory directory name
 * @param timeoutMilliSeconds maximum time to wait for the count, in milliseconds (defaults to 5000)
 * @returns promise resolving to the number of files found, or -1 when the command output cannot
 *          be parsed as a number; the promise rejects when the command fails or the timeout elapses
 * @throws Error synchronously (not as a rejection) when the directory does not exist
 */
function countFilesRecursively(directory: string, timeoutMilliSeconds?: number): Promise<number> {
    if (!fs.existsSync(directory)) {
        throw Error(`Directory ${directory} doesn't exist`);
    }

    // Honor the caller-supplied timeout; fall back to the historical 5 seconds.
    const timeoutMs: number = timeoutMilliSeconds === undefined ? 5000 : timeoutMilliSeconds;

    let timeoutId: ReturnType<typeof setTimeout> | undefined;
    const delayTimeout: Promise<number> = new Promise<number>((_resolve, reject): void => {
        // Reject once the timeout is reached
        timeoutId = setTimeout(() => {
            reject(new Error(`Timeout: path ${directory} has too many files`));
        }, timeoutMs);
    });

    // Single-quote the path (escaping embedded single quotes) so that spaces and shell
    // metacharacters in the directory name are passed to `find` literally.
    const quotedDirectory: string = "'" + directory.replace(/'/g, "'\\''") + "'";

    // Chain directly on the exec promise so that a failing command rejects the result
    // immediately instead of leaving the caller waiting for the timeout.
    const countPromise: Promise<number> = cpp.exec(`find ${quotedDirectory} -type f | wc -l`)
        .then((result) => {
            const parsedCount: number = parseInt(result.stdout, 10);

            // A count of 0 is a valid result; only unparsable output maps to -1.
            return Number.isNaN(parsedCount) ? -1 : parsedCount;
        });

    return Promise.race([countPromise, delayTimeout]).finally(() => {
        if (timeoutId !== undefined) {
            clearTimeout(timeoutId);
        }
    });
}

export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
21 changes: 20 additions & 1 deletion src/nni_manager/core/nniDataStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,21 @@ class NNIDataStore implements DataStore {
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
this.log.debug(`storeTrialJobEvent: event: ${event}, data: ${hyperParameter}, jobDetail: ${JSON.stringify(jobDetail)}`);

return this.db.storeTrialJobEvent(event, trialJobId, hyperParameter, jobDetail).catch(
// Use the timestamp in jobDetail as TrialJobEvent timestamp for different events
let timestamp: number | undefined;
if (event === 'WAITING' && jobDetail) {
timestamp = jobDetail.submitTime;
} else if (event === 'RUNNING' && jobDetail) {
timestamp = jobDetail.startTime;
} else if (['EARLY_STOPPED', 'SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED'].includes(event) && jobDetail) {
timestamp = jobDetail.endTime;
}
// Use current time as timestamp if timestamp is not assigned from jobDetail
if (timestamp === undefined) {
timestamp = Date.now();
}

return this.db.storeTrialJobEvent(event, trialJobId, timestamp, hyperParameter, jobDetail).catch(
(err: Error) => {
throw new NNIError('Datastore error', `Datastore error: ${err.message}`, err);
}
Expand Down Expand Up @@ -272,6 +286,11 @@ class NNIDataStore implements DataStore {
if (record.logPath !== undefined) {
jobInfo.logPath = record.logPath;
}
// Initially assign WAITING timestamp as job's start time,
// If there is RUNNING state event, it will be updated as RUNNING state timestamp
if (jobInfo.startTime === undefined && record.timestamp !== undefined) {
jobInfo.startTime = record.timestamp;
}
break;
case 'SUCCEEDED':
case 'FAILED':
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/core/sqlDatabase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ class SqlDB implements Database {
}

public storeTrialJobEvent(
event: TrialJobEvent, trialJobId: string, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?)';
const logPath: string | undefined = jobDetail === undefined ? undefined : jobDetail.url;
const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.sequenceId;
const args: any[] = [Date.now(), trialJobId, event, hyperParameter, logPath, sequenceId];
const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId];

const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/core/test/sqlDatabase.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ describe('core/sqlDatabase', () => {
await (<SqlDB>db).storeExperimentProfile(profile);
}
for (const event of events) {
await (<SqlDB>db).storeTrialJobEvent(<TrialJobEvent>event.event, event.trialJobId, event.data);
await (<SqlDB>db).storeTrialJobEvent(<TrialJobEvent>event.event, event.trialJobId, Date.now(), event.data);
}
for (const metric of metrics) {
await (<SqlDB>db).storeMetricData(metric.trialJobId, JSON.stringify(metric));
Expand Down
48 changes: 48 additions & 0 deletions src/nni_manager/training_service/common/util.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { getLogger } from "common/log";

/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

import { countFilesRecursively } from '../../common/utils'

/**
 * Validate codeDir: count the files under it recursively and fail when there are too many.
 *
 * @param codeDir codeDir in nni config file
 * @returns file number under codeDir
 * @throws Error when counting the files fails, or when more than 1000 files are found
 */
export async function validateCodeDir(codeDir: string) : Promise<number> {
    // Upper bound on the number of files accepted in a code directory.
    const maxFileCount: number = 1000;
    let fileCount: number;

    try {
        fileCount = await countFilesRecursively(codeDir);
    } catch (error) {
        // Wrap both synchronous throws and rejections from the counting helper.
        throw new Error(`Call count file error: ${error}`);
    }

    if (fileCount > maxFileCount) {
        const errMessage: string = `Too many files(${fileCount} found) in ${codeDir},`
            + ` please check if it's a valid code dir`;
        throw new Error(errMessage);
    }

    return fileCount;
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, NFSCon
import { KubeflowTrialJobDetail } from './kubeflowData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
import { validateCodeDir } from '../common/util';
import { AzureStorageClientUtility } from './azureStorageClientUtils';
import * as azureStorage from 'azure-storage';

Expand Down Expand Up @@ -360,6 +361,15 @@ class KubeflowTrainingService implements TrainingService {

this.kubeflowTrialConfig = <KubeflowTrialConfig>JSON.parse(value);
assert(this.kubeflowClusterConfig !== undefined && this.kubeflowTrialConfig.worker !== undefined);

// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.kubeflowTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
return Promise.reject(new Error(error));
}

break;
default:
break;
Expand Down
13 changes: 12 additions & 1 deletion src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, NNIManagerIpConfig
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { countFilesRecursively, delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { validateCodeDir } from '../common/util';

var WebHDFS = require('webhdfs');

Expand Down Expand Up @@ -395,6 +397,15 @@ class PAITrainingService implements TrainingService {
).replace(/\r\n|\n|\r/gm, '');
}

// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.paiTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
deferred.reject(new Error(error));
break;
}

const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern);

if(hdfsDirContent === null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import {
RemoteMachineTrialJobDetail, ScheduleResultType
} from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir} from '../common/util';

/**
* Training Service implementation for Remote Machine (Linux)
Expand Down Expand Up @@ -297,6 +298,15 @@ class RemoteMachineTrainingService implements TrainingService {
if (!fs.lstatSync(remoteMachineTrailConfig.codeDir).isDirectory()) {
throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`);
}

// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(remoteMachineTrailConfig.codeDir);
} catch(error) {
this.log.error(error);
return Promise.reject(new Error(error));
}

this.trialConfig = remoteMachineTrailConfig;
break;
case TrialConfigMetadataKey.MULTI_PHASE:
Expand Down
16 changes: 3 additions & 13 deletions src/webui/src/components/Overview.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
Experiment, TableObj,
Parameters, TrialNumber
} from '../static/interface';
import { getFinalResult } from '../static/function';
import SuccessTable from './overview/SuccessTable';
import Title1 from './overview/Title1';
import Progressed from './overview/Progress';
Expand Down Expand Up @@ -215,18 +216,7 @@ class Overview extends React.Component<{}, OverviewState> {
parameters: {}
};
const duration = (tableData[item].endTime - tableData[item].startTime) / 1000;
let acc;
let tableAcc = 0;
if (tableData[item].finalMetricData) {
acc = JSON.parse(tableData[item].finalMetricData.data);
if (typeof (acc) === 'object') {
if (acc.default) {
tableAcc = acc.default;
}
} else {
tableAcc = acc;
}
}
const acc = getFinalResult(tableData[item].finalMetricData);
// if hyperParameters is undefined, show an error message; otherwise show the parameter values
if (tableData[item].hyperParameters) {
desJobDetail.parameters = JSON.parse(tableData[item].hyperParameters).parameters;
Expand All @@ -246,7 +236,7 @@ class Overview extends React.Component<{}, OverviewState> {
id: tableData[item].id,
duration: duration,
status: tableData[item].status,
acc: tableAcc,
acc: acc,
description: desJobDetail
});
break;
Expand Down
31 changes: 5 additions & 26 deletions src/webui/src/components/TrialsDetail.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { MANAGER_IP } from '../static/const';
import { Row, Col, Button, Tabs, Input } from 'antd';
const Search = Input.Search;
import { TableObj, Parameters, DetailAccurPoint, TooltipForAccuracy } from '../static/interface';
import { getFinalResult } from '../static/function';
import Accuracy from './overview/Accuracy';
import Duration from './trial-detail/Duration';
import Title1 from './overview/Title1';
Expand Down Expand Up @@ -47,24 +48,13 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
const accSource: Array<DetailAccurPoint> = [];
Object.keys(accData).map(item => {
if (accData[item].status === 'SUCCEEDED' && accData[item].finalMetricData) {
let acc;
let tableAcc;
let searchSpace: object = {};
if (accData[item].finalMetricData) {
acc = JSON.parse(accData[item].finalMetricData.data);
if (typeof (acc) === 'object') {
if (acc.default) {
tableAcc = acc.default;
}
} else {
tableAcc = acc;
}
}
const acc = getFinalResult(accData[item].finalMetricData);
if (accData[item].hyperParameters) {
searchSpace = JSON.parse(accData[item].hyperParameters).parameters;
}
accSource.push({
acc: tableAcc,
acc: acc,
index: accData[item].sequenceId,
searchSpace: JSON.stringify(searchSpace)
});
Expand Down Expand Up @@ -147,8 +137,6 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
parameters: {},
intermediate: []
};
let acc;
let tableAcc = 0;
let duration = 0;
const id = trialJobs[item].id !== undefined
? trialJobs[item].id
Expand Down Expand Up @@ -185,23 +173,14 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
}
});
desc.intermediate = mediate;
if (trialJobs[item].finalMetricData !== undefined) {
acc = JSON.parse(trialJobs[item].finalMetricData.data);
if (typeof (acc) === 'object') {
if (acc.default) {
tableAcc = acc.default;
}
} else {
tableAcc = acc;
}
}
const acc = getFinalResult(trialJobs[item].finalMetricData);
trialTable.push({
key: trialTable.length,
sequenceId: trialJobs[item].sequenceId,
id: id,
status: status,
duration: duration,
acc: tableAcc,
acc: acc,
description: desc
});
});
Expand Down
2 changes: 1 addition & 1 deletion src/webui/src/components/overview/SuccessTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class SuccessTable extends React.Component<SuccessTableProps, {}> {
:
record.acc
:
'NaN'
'--'
}
</div>
);
Expand Down
Loading

0 comments on commit bc10bf7

Please sign in to comment.