Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Add api nni.get_sequence_id() (#203)
Browse files Browse the repository at this point in the history
* Pull latest code (#2)

* webui logpath and document (#135)

* Add webui document and logpath as a href

* fix tslint

* fix comments by Chengmin

* Pai training service bug fix and enhancement (#136)

* Add NNI installation scripts

* Update pai script, update NNI_out_dir

* Update NNI dir in nni sdk local.py

* Create .nni folder in nni sdk local.py

* Add check before creating .nni folder

* Fix typo for PAI_INSTALL_NNI_SHELL_FORMAT

* Improve annotation (#138)

* Improve annotation

* Minor bugfix

* Selectively install through pip (#139)

Selectively install through pip 
* update setup.py

* fix paiTrainingService bugs (#137)

* fix nnictl bug

* add hdfs host validation

* fix bugs

* fix dockerfile

* fix install.sh

* update install.sh

* fix dockerfile

* Set timeout for HDFSUtility exists function

* remove unused TODO

* fix sdk

* add optional for outputDir and dataDir

* refactor dockerfile.base

* Remove unused import in hdfsclientUtility

* Add documentation for NNI PAI mode experiment (#141)

* Add documentation for NNI PAI mode

* Fix typo based on PR comments

* Exit with subprocess return code of trial keeper

* Remove additional exit code

* Fix typo based on PR comments

* update doc for smac tuner (#140)

* Revert "Selectively install through pip (#139)" due to potential pip install issue (#142)

* Revert "Selectively install through pip (#139)"

This reverts commit 1d17483.

* Add exit code of subprocess for trial_keeper

* Update README, add link to PAImode doc

* fix bug (#147)

* Refactor nnictl and add config_pai.yml (#144)

* fix nnictl bug

* add hdfs host validation

* fix bugs

* fix dockerfile

* fix install.sh

* update install.sh

* fix dockerfile

* Set timeout for HDFSUtility exists function

* remove unused TODO

* fix sdk

* add optional for outputDir and dataDir

* refactor dockerfile.base

* Remove unused import in hdfsclientUtility

* add config_pai.yml

* refactor nnictl create logic and add colorful print

* fix nnictl stop logic

* add annotation for config_pai.yml

* add document for start experiment

* fix config.yml

* fix document

* Fix trial keeper wrongly exit issue (#152)

* Fix trial keeper bug, use actual exitcode to exit rather than 1

* Fix bug of table sort (#145)

* Update doc for PAIMode and v0.2 release notes (#153)

* Update v0.2 documentation regarding the release notes and PAI training service

* Update document to describe NNI docker image

* Bug fix for SQuAD example tuner. (#134)

* Update Makefile (#151)

* test

* update setup.py

* update Makefile and install.sh

* revert setup.py

* change color

* update doc

* update doc

* fix auto-completion's extra space

* update Makefile

* update webui

* Update doc image (#163)

* update doc

* trivial

* trivial

* trivial

* trivial

* trivial

* trivial

* update image

* update image size

* Update ga squad (#104)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* sklearn examples (#169)

* fix nnictl bug

* fix install.sh

* add sklearn-regression example

* add sklearn classification

* update sklearn

* update example

* remove additional code

* Update batch tuner (#158)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* update batch tuner

* Quickly fix cascading search space bug in tuner (#156)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* quickly fix cascading searchspace bug in tuner

* Add iterative search space example (#119)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* add iterative search space example

* update

* update readme

* change name

* Add api nni.get_sequence_id()

* Add sequence_id to TrialJobDetail
  • Loading branch information
chicm-ms authored Oct 12, 2018
1 parent f4ee9f8 commit 1388d76
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/nni_manager/common/trainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ interface TrialJobDetail {
readonly url?: string;
readonly workingDirectory: string;
readonly form: JobApplicationForm;
readonly sequenceId: number;
}

interface HostJobDetail {
Expand Down
6 changes: 4 additions & 2 deletions src/nni_manager/core/test/mockedTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class MockedTrainingService extends TrainingService {
workingDirectory: '/tmp/mocked',
form: {
jobType: 'TRIAL'
}
},
sequenceId: 0
};
public jobDetail2: TrialJobDetail = {
id: '3456',
Expand All @@ -55,7 +56,8 @@ class MockedTrainingService extends TrainingService {
workingDirectory: '/tmp/mocked',
form: {
jobType: 'TRIAL'
}
},
sequenceId: 0
};

public listTrialJobs(): Promise<TrialJobDetail[]> {
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/rest_server/test/mockedNNIManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ export class MockedNNIManager extends Manager {
// tslint:disable-next-line:no-http-string
url: 'http://test',
workingDirectory: '/tmp/mocked',
sequenceId: 0,
form: {
jobType: 'TRIAL'
}
Expand Down
25 changes: 23 additions & 2 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { EventEmitter } from 'events';
Expand Down Expand Up @@ -73,15 +74,18 @@ class LocalTrialJobDetail implements TrialJobDetail {
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public sequenceId: number;
public pid?: number;

constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) {
/**
 * Builds the detail record of a local trial job.
 *
 * @param id identifier of the job
 * @param status status the job starts with
 * @param submitTime submission timestamp
 * @param workingDirectory directory of the job; also used to derive `url`
 * @param form application form the job was submitted with
 * @param sequenceId sequence id assigned to the job
 */
constructor(id: string, status: TrialJobStatus, submitTime: number,
            workingDirectory: string, form: JobApplicationForm, sequenceId: number) {
    // Identity of the job.
    this.id = id;
    this.sequenceId = sequenceId;

    // Submission state.
    this.status = status;
    this.submitTime = submitTime;
    this.form = form;

    // Location of the job; the url is a file:// link to the working directory.
    this.workingDirectory = workingDirectory;
    this.url = `file://localhost:${workingDirectory}`;
}
}

Expand All @@ -95,6 +99,7 @@ class LocalTrainingService implements TrainingService {
private initialized: boolean;
private stopping: boolean;
private rootDir!: string;
private trialSequenceId: number;
protected log: Logger;
protected localTrailConfig?: TrialConfig;

Expand All @@ -105,6 +110,7 @@ class LocalTrainingService implements TrainingService {
this.initialized = false;
this.stopping = false;
this.log = getLogger();
this.trialSequenceId = 0;
}

public async run(): Promise<void> {
Expand Down Expand Up @@ -194,7 +200,9 @@ class LocalTrainingService implements TrainingService {
'WAITING',
Date.now(),
path.join(this.rootDir, 'trials', trialJobId),
form);
form,
this.generateSequenceId()
);
this.jobQueue.push(trialJobId);
this.jobMap.set(trialJobId, trialJobDetail);

Expand Down Expand Up @@ -344,6 +352,7 @@ class LocalTrainingService implements TrainingService {
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8' });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
await this.writeSequenceIdFile(trialJobId);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);

this.setTrialJobStatus(trialJobDetail, 'RUNNING');
Expand Down Expand Up @@ -383,6 +392,7 @@ class LocalTrainingService implements TrainingService {
submitTime: Date.now(),
workingDirectory: workDir,
form: form,
sequenceId: this.generateSequenceId(),
pid: process.pid
};
this.jobMap.set(jobId, jobDetail);
Expand Down Expand Up @@ -415,6 +425,17 @@ class LocalTrainingService implements TrainingService {
const filepath: string = path.join(directory, generateParamFileName(hyperParameters));
await fs.promises.writeFile(filepath, hyperParameters.value, { encoding: 'utf8' });
}

/**
 * Hands out the next trial sequence id.
 *
 * @returns the value of the counter before it is advanced by one
 */
private generateSequenceId(): number {
    const assignedId: number = this.trialSequenceId;
    this.trialSequenceId += 1;

    return assignedId;
}

/**
 * Writes the sequence id of a trial job to `.nni/sequence_id` inside the
 * job's working directory.
 *
 * @param trialJobId id of a trial job stored in `jobMap`
 * @throws AssertionError when `jobMap` holds no entry for `trialJobId`
 */
private async writeSequenceIdFile(trialJobId: string): Promise<void> {
    const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
    // Name the offending job in the failure so a missing entry can be diagnosed
    // from the error alone (same wording as the remote training service).
    assert(trialJobDetail !== undefined, `Can not get trial job detail for job: ${trialJobId}`);
    const filepath: string = path.join(trialJobDetail.workingDirectory, '.nni', 'sequence_id');
    await fs.promises.writeFile(filepath, trialJobDetail.sequenceId.toString(), { encoding: 'utf8' });
}
}

export { LocalTrainingService };
4 changes: 3 additions & 1 deletion src/nni_manager/training_service/pai/paiData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ export class PAITrialJobDetail implements TrialJobDetail {
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public sequenceId: number;
public hdfsLogPath: string;

constructor(id: string, status: TrialJobStatus, paiJobName : string,
submitTime: number, workingDirectory: string, form: JobApplicationForm, hdfsLogPath: string) {
submitTime: number, workingDirectory: string, form: JobApplicationForm, sequenceId: number, hdfsLogPath: string) {
this.id = id;
this.status = status;
this.paiJobName = paiJobName;
this.submitTime = submitTime;
this.workingDirectory = workingDirectory;
this.form = form;
this.sequenceId = sequenceId;
this.tags = [];
this.hdfsLogPath = hdfsLogPath;
}
Expand Down
12 changes: 11 additions & 1 deletion src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ class PAITrainingService implements TrainingService {
private readonly hdfsDirPattern: string;
private hdfsBaseDir: string | undefined;
private hdfsOutputHost: string | undefined;
private trialSequenceId: number;

constructor() {
this.log = getLogger();
Expand All @@ -77,6 +78,7 @@ class PAITrainingService implements TrainingService {
this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.trialSequenceId = 0;
}

public async run(): Promise<void> {
Expand Down Expand Up @@ -146,13 +148,15 @@ class PAITrainingService implements TrainingService {
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);

const trialJobId: string = uniqueString(5);
const trialSequenceId: number = this.generateSequenceId();
//TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);

const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);

const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
Expand All @@ -163,6 +167,7 @@ class PAITrainingService implements TrainingService {
if(trialForm) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' });
await fs.promises.writeFile(path.join(trialLocalTempFolder, '.nni', 'sequence_id'), trialSequenceId.toString(), { encoding: 'utf8' });
}

// Step 1. Prepare PAI job configuration
Expand All @@ -181,7 +186,8 @@ class PAITrainingService implements TrainingService {
paiJobName,
Date.now(),
trialWorkingFolder,
form,
form,
trialSequenceId,
hdfsLogPath);
this.trialJobsMap.set(trialJobId, trialJobDetail);

Expand Down Expand Up @@ -439,6 +445,10 @@ class PAITrainingService implements TrainingService {
public get MetricsEmitter() : EventEmitter {
return this.metricsEmitter;
}

/**
 * Returns the current trial sequence counter and then increments it, so each
 * call yields a value one greater than the previous call.
 */
private generateSequenceId(): number {
    const currentId: number = this.trialSequenceId;
    this.trialSequenceId = currentId + 1;

    return currentId;
}
}

export { PAITrainingService }
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public sequenceId: number;
public rmMeta?: RemoteMachineMeta;

constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) {
/**
 * Builds the detail record of a remote-machine trial job.
 *
 * @param id identifier of the job
 * @param status status the job starts with
 * @param submitTime submission timestamp
 * @param workingDirectory working directory of the job
 * @param form application form the job was submitted with
 * @param sequenceId sequence id assigned to the job
 */
constructor(id: string, status: TrialJobStatus, submitTime: number,
            workingDirectory: string, form: JobApplicationForm, sequenceId: number) {
    // Identity of the job.
    this.id = id;
    this.sequenceId = sequenceId;

    // Submission state.
    this.status = status;
    this.submitTime = submitTime;
    this.form = form;
    this.workingDirectory = workingDirectory;

    // A new job starts with an empty tag list.
    this.tags = [];
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class RemoteMachineTrainingService implements TrainingService {
private stopping: boolean = false;
private metricsEmitter: EventEmitter;
private log: Logger;
private trialSequenceId: number;

constructor(@component.Inject timer: ObservableTimer) {
this.metricsEmitter = new EventEmitter();
Expand All @@ -75,6 +76,7 @@ class RemoteMachineTrainingService implements TrainingService {
this.remoteExpRootDir = this.getRemoteExperimentRootDir();
this.timer = timer;
this.log = getLogger();
this.trialSequenceId = 0;
}

/**
Expand Down Expand Up @@ -183,7 +185,9 @@ class RemoteMachineTrainingService implements TrainingService {
'WAITING',
Date.now(),
trialWorkingFolder,
form);
form,
this.generateSequenceId()
);
this.jobQueue.push(trialJobId);
this.trialJobsMap.set(trialJobId, trialJobDetail);

Expand Down Expand Up @@ -456,7 +460,7 @@ class RemoteMachineTrainingService implements TrainingService {
path.join(trialWorkingFolder, '.nni', 'code'));

//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);

// Write file content ( run.sh and parameter.cfg ) to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), runScriptContent, { encoding: 'utf8' });
Expand All @@ -465,6 +469,7 @@ class RemoteMachineTrainingService implements TrainingService {
await SSHClientUtility.copyFileToRemote(
path.join(trialLocalTempFolder, 'run.sh'), path.join(trialWorkingFolder, 'run.sh'), sshClient);
await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta);
await this.writeSequenceIdFile(trialJobId, rmScheduleInfo.rmMeta);

// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient);
Expand All @@ -491,7 +496,9 @@ class RemoteMachineTrainingService implements TrainingService {
path.join(localDir, 'run.sh'), path.join(remoteDir, 'run.sh'), sshClient);
SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteDir, 'run.sh')}`, sshClient);

const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(jobId, 'RUNNING', Date.now(), remoteDir, form);
const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
jobId, 'RUNNING', Date.now(), remoteDir, form, this.generateSequenceId()
);
jobDetail.rmMeta = rmMeta;
jobDetail.startTime = Date.now();
this.trialJobsMap.set(jobId, jobDetail);
Expand Down Expand Up @@ -592,6 +599,35 @@ class RemoteMachineTrainingService implements TrainingService {

await SSHClientUtility.copyFileToRemote(localFilepath, path.join(trialWorkingFolder, fileName), sshClient);
}

/**
 * Allocates a trial sequence id.
 *
 * @returns the counter value prior to this call; the stored counter grows by one
 */
private generateSequenceId(): number {
    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId += 1;

    return allocatedId;
}

/**
 * Writes the sequence id of a trial job to the relative path `.nni/sequence_id`
 * by delegating to `writeRemoteTrialFile`.
 *
 * @param trialJobId id of a trial job stored in `trialJobsMap`
 * @param rmMeta remote machine the file is written for
 * @throws AssertionError when `trialJobsMap` holds no entry for `trialJobId`
 */
private async writeSequenceIdFile(trialJobId: string, rmMeta: RemoteMachineMeta): Promise<void> {
    const jobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
    if (jobDetail !== undefined) {
        const sequenceIdText: string = jobDetail.sequenceId.toString();
        const relativePath: string = path.join('.nni', 'sequence_id');
        await this.writeRemoteTrialFile(trialJobId, sequenceIdText, rmMeta, relativePath);

        return;
    }

    // Unknown job: fail loudly with the job id in the message.
    assert(false, `Can not get trial job detail for job: ${trialJobId}`);
}

/**
 * Writes `fileContent` to a file in the trial's local temp folder and copies
 * that file to the same relative location in the trial's remote working folder.
 *
 * @param trialJobId id of the trial job; selects the local and remote folders
 * @param fileContent text written to the file (UTF-8)
 * @param rmMeta remote machine whose SSH client is used for the copy
 * @param fileName path of the file relative to the trial folders
 * @throws Error when no SSH client is registered for `rmMeta`
 */
private async writeRemoteTrialFile(trialJobId: string, fileContent: string,
                                   rmMeta: RemoteMachineMeta, fileName: string): Promise<void> {
    const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta);
    if (sshClient === undefined) {
        throw new Error('sshClient is undefined.');
    }

    // Source: the file inside the local temp folder of this trial.
    const localFilepath: string = path.join(this.expRootDir, 'trials-local', trialJobId, fileName);
    // Destination: the same relative path inside the remote working folder.
    const remoteFilepath: string = path.join(this.remoteExpRootDir, 'trials', trialJobId, fileName);

    await fs.promises.writeFile(localFilepath, fileContent, { encoding: 'utf8' });
    await SSHClientUtility.copyFileToRemote(localFilepath, remoteFilepath, sshClient);
}
}

export { RemoteMachineTrainingService };
4 changes: 4 additions & 0 deletions src/sdk/pynni/nni/platform/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,7 @@ def send_metric(string):
assert len(data) < 1000000, 'Metric too long'
_metric_file.write(b'ME%06d%b' % (len(data), data))
_metric_file.flush()

def get_sequence_id():
    """Read the trial's sequence id from the ``.nni/sequence_id`` file under ``_sysdir``.

    Returns the file content, stripped of surrounding whitespace, as an int.
    """
    sequence_id_path = os.path.join(_sysdir, '.nni', 'sequence_id')
    with open(sequence_id_path, 'r') as sequence_id_file:
        raw_value = sequence_id_file.read()
    return int(raw_value.strip())
3 changes: 3 additions & 0 deletions src/sdk/pynni/nni/platform/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
def get_parameters():
pass

def get_sequence_id():
    """Placeholder for the standalone platform: performs no work and returns None."""
    return None

def send_metric(string):
metric = json_tricks.loads(string)
if metric['type'] == 'FINAL':
Expand Down
4 changes: 4 additions & 0 deletions src/sdk/pynni/nni/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@
'get_parameters',
'report_intermediate_result',
'report_final_result',
'get_sequence_id'
]


_params = None
_sequence_id = platform.get_sequence_id()


def get_parameters():
Expand All @@ -45,6 +47,8 @@ def get_parameters():
def get_parameter(tag):
return get_parameters()[tag]

def get_sequence_id():
    """Return the sequence id held in the module-level ``_sequence_id``."""
    return _sequence_id

_intermediate_seq = 0

Expand Down

0 comments on commit 1388d76

Please sign in to comment.