Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Merge pull request #96 from Microsoft/master
Browse files Browse the repository at this point in the history
merge master
  • Loading branch information
SparkSnail authored Dec 17, 2018
2 parents 84a5db5 + 9397b6f commit 6cf294e
Show file tree
Hide file tree
Showing 14 changed files with 131 additions and 31 deletions.
4 changes: 4 additions & 0 deletions docs/HowToChooseTuner.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ _Usage_:
Note that SMAC on nni only supports a subset of the types in [search space spec](./SearchSpaceSpec.md), including `choice`, `randint`, `uniform`, `loguniform`, `quniform(q=1)`.

_Installation_:
* Install swig first. (`sudo apt-get install swig` for Ubuntu users)
* Run `nnictl package install --name=SMAC`

_Suggested scenario_: Similar to TPE, SMAC is also a black-box tuner that can be tried in various scenarios, and is suggested when computation resources are limited. It is optimized for discrete hyperparameters and is therefore suggested when most of your hyperparameters are discrete.

_Usage_:
Expand Down
Binary file modified docs/img/nni_arch_overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion examples/trials/auto-gbdt/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ tuner:
trial:
command: python3 main.py
codeDir: .
gpuNum: 0
gpuNum: 0
2 changes: 1 addition & 1 deletion examples/trials/mnist-annotation/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self,
"""@nni.variable(nni.choice(124, 512, 1024), name=self.hidden_size)"""
self.hidden_size = hidden_size
self.pool_size = pool_size
"""@nni.variable(nni.uniform(0.0001, 0.1), name=self.learning_rate)"""
"""@nni.variable(nni.loguniform(0.0001, 0.1), name=self.learning_rate)"""
self.learning_rate = learning_rate
self.x_dim = x_dim
self.y_dim = y_dim
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ interface TrialJobStatistics {
}

interface NNIManagerStatus {
status: 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL';
status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL';
errors: string[];
}

Expand Down
18 changes: 9 additions & 9 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class NNIManager implements Manager {
checkpointDir);

this.experimentProfile.startTime = Date.now();
this.status.status = 'EXPERIMENT_RUNNING';
this.status.status = 'RUNNING';
await this.storeExperimentProfile();
this.run().catch((err: Error) => {
this.criticalError(err);
Expand Down Expand Up @@ -185,7 +185,7 @@ class NNIManager implements Manager {
this.experimentProfile.endTime) {
delete this.experimentProfile.endTime;
}
this.status.status = 'EXPERIMENT_RUNNING';
this.status.status = 'RUNNING';

// TO DO: update database record for resume event
this.run().catch((err: Error) => {
Expand Down Expand Up @@ -350,7 +350,7 @@ class NNIManager implements Manager {
let count: number = 1;
while (this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED') {
await delay(1000 * 1); // 1 seconds
if (this.status.status === 'EXPERIMENT_RUNNING') {
if (this.status.status === 'RUNNING') {
this.experimentProfile.execDuration += 1;
if (count % 10 === 0) {
await this.storeExperimentProfile();
Expand Down Expand Up @@ -460,15 +460,15 @@ class NNIManager implements Manager {
}

// check maxtrialnum and maxduration here
// NO_MORE_TRIAL is more like a subset of EXPERIMENT_RUNNING, because during EXPERIMENT_RUNNING tuner
// NO_MORE_TRIAL is more like a subset of RUNNING, because during RUNNING tuner
// might tell nnimanager that there are no more trials. In NO_MORE_TRIAL state, the experiment is viewed
// as still running. DONE could be transfered from EXPERIMENT_RUNNING or NO_MORE_TRIAL.
assert(this.status.status === 'EXPERIMENT_RUNNING' ||
// as still running. DONE can be reached from RUNNING or NO_MORE_TRIAL.
assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL');
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status === 'EXPERIMENT_RUNNING' ||
if (this.status.status === 'RUNNING' ||
this.status.status === 'NO_MORE_TRIAL') {
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
Expand All @@ -480,7 +480,7 @@ class NNIManager implements Manager {
await this.storeExperimentProfile();
}
if (this.status.status !== 'NO_MORE_TRIAL') {
this.status.status = 'EXPERIMENT_RUNNING';
this.status.status = 'RUNNING';
}
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
if (this.waitingTrials.length === 0 ||
Expand Down Expand Up @@ -602,7 +602,7 @@ class NNIManager implements Manager {
case NEW_TRIAL_JOB:
if (this.status.status === 'NO_MORE_TRIAL') {
this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
this.status.status = 'EXPERIMENT_RUNNING';
this.status.status = 'RUNNING';
}
this.waitingTrials.push(content);
break;
Expand Down
5 changes: 4 additions & 1 deletion src/nni_manager/rest_server/nniRestServer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@

'use strict';

import * as express from 'express';
import * as bodyParser from 'body-parser';
import * as express from 'express';
import * as path from 'path';
import * as component from '../common/component';
import { RestServer } from '../common/restServer'
import { getLogDir } from '../common/utils';
import { createRestHandler } from './restHandler';

/**
Expand All @@ -35,6 +36,7 @@ import { createRestHandler } from './restHandler';
@component.Singleton
export class NNIRestServer extends RestServer {
private readonly API_ROOT_URL: string = '/api/v1/nni';
private readonly LOGS_ROOT_URL: string = '/logs';

/**
* constructor to provide NNIRestServer's own rest property, e.g. port
Expand All @@ -50,6 +52,7 @@ export class NNIRestServer extends RestServer {
this.app.use(express.static('static'));
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, createRestHandler(this));
this.app.use(this.LOGS_ROOT_URL, express.static(getLogDir()));
this.app.get('*', (req: express.Request, res: express.Response) => {
res.sendFile(path.resolve('static/index.html'));
});
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/rest_server/test/mockedNNIManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ export const testManagerProvider: Provider = {
export class MockedNNIManager extends Manager {
public getStatus(): NNIManagerStatus {
return {
status: 'EXPERIMENT_RUNNING',
status: 'RUNNING',
errors: []
}
}
Expand Down
26 changes: 26 additions & 0 deletions src/nni_manager/training_service/pai/hdfsClientUtility.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,38 @@
import * as path from 'path';
import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger } from '../../common/log';

/**
* HDFS client utility, including copy file/directory
*/
export namespace HDFSClientUtility {
/**
 * Get NNI experiment root directory on HDFS.
 *
 * @param hdfsUserName HDFS user name
 * @returns the path `/<hdfsUserName>/nni/experiments/<id>`, where `<id>` is the
 *          value returned by `getExperimentId()`
 */
function hdfsExpRootDir(hdfsUserName: string): string {
    // HDFS paths always use '/' as separator. The platform-dependent `path.join`
    // would emit '\' on Windows hosts, so use the POSIX flavor explicitly.
    return path.posix.join('/', hdfsUserName, 'nni', 'experiments', getExperimentId());
}

/**
 * Get NNI experiment code directory on HDFS.
 *
 * @param hdfsUserName HDFS user name
 * @returns the `codeDir` entry under the experiment root directory
 */
export function getHdfsExpCodeDir(hdfsUserName: string): string {
    // Join with POSIX separators: this is an HDFS path, not a local one, so it
    // must not pick up the host platform's separator.
    return path.posix.join(hdfsExpRootDir(hdfsUserName), 'codeDir');
}

/**
 * Get NNI trial working directory on HDFS.
 *
 * @param hdfsUserName HDFS user name
 * @param trialId NNI trial ID
 * @returns the `trials/<trialId>` entry under the experiment root directory
 */
export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string {
    // Join with POSIX separators: this is an HDFS path, not a local one, so it
    // must not pick up the host platform's separator.
    return path.posix.join(hdfsExpRootDir(hdfsUserName), 'trials', trialId);
}

/**
* Copy a local file to hdfs directory
*
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/training_service/pai/paiData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10}`;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}'`;

export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
Expand Down
30 changes: 21 additions & 9 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { HDFSClientUtility } from './hdfsClientUtility'
import { HDFSClientUtility } from './hdfsClientUtility';
import { MethodNotImplementedError } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, NNIManagerIpConfig
} from '../../common/trainingService';
import { countFilesRecursively, delay, generateParamFileName,
import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
Expand Down Expand Up @@ -74,6 +74,7 @@ class PAITrainingService implements TrainingService {
private nextTrialSequenceId: number;
private paiRestServerPort?: number;
private nniManagerIpConfig?: NNIManagerIpConfig;
private copyExpCodeDirPromise?: Promise<void>;

constructor() {
this.log = getLogger();
Expand Down Expand Up @@ -145,11 +146,11 @@ class PAITrainingService implements TrainingService {
throw new Error('PAI token is not initialized');
}

if(!this.hdfsBaseDir){
if(!this.hdfsBaseDir) {
throw new Error('hdfsBaseDir is not initialized');
}

if(!this.hdfsOutputHost){
if(!this.hdfsOutputHost) {
throw new Error('hdfsOutputHost is not initialized');
}

Expand All @@ -160,15 +161,19 @@ class PAITrainingService implements TrainingService {

this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);

// Make sure experiment code files are copied from local to HDFS
if(this.copyExpCodeDirPromise) {
await this.copyExpCodeDirPromise;
}

const trialJobId: string = uniqueString(5);
const trialSequenceId: number = this.generateSequenceId();
//TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);

const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
Expand All @@ -182,8 +187,8 @@ class PAITrainingService implements TrainingService {
}

// Step 1. Prepare PAI job configuration
const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);

const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsLogPath : string = String.Format(
Expand Down Expand Up @@ -215,7 +220,8 @@ class PAITrainingService implements TrainingService {
this.paiRestServerPort,
hdfsOutputDir,
this.hdfsOutputHost,
this.paiClusterConfig.userName
this.paiClusterConfig.userName,
HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName)
).replace(/\r\n|\n|\r/gm, '');

console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
Expand Down Expand Up @@ -390,6 +396,7 @@ class PAITrainingService implements TrainingService {
}

this.hdfsOutputHost = groups['host'];
//TODO: choose to use /${username} as baseDir
this.hdfsBaseDir = groups['baseDir'];
if(this.hdfsBaseDir === undefined) {
this.hdfsBaseDir = "/";
Expand All @@ -414,6 +421,11 @@ class PAITrainingService implements TrainingService {
} catch(error) {
deferred.reject(new Error(`HDFS encounters problem, error is ${error}. Please check hdfsOutputDir host!`));
}

// Copy experiment files from local folder to HDFS
this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs(this.paiTrialConfig.codeDir,
HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName),
this.hdfsClient);

deferred.resolve();
break;
Expand Down
1 change: 1 addition & 0 deletions src/sdk/pynni/nni/hyperband_advisor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ If you use Hyperband, among the hyperparameters (i.e., key-value pairs) received
`eta` means `n/eta` configurations from `n` configurations will survive and rerun using more STEPS.

Here is a concrete example of `R=81` and `eta=3`:

| | s=4 | s=3 | s=2 | s=1 | s=0 |
|------|-----|-----|-----|-----|-----|
|i | n r | n r | n r | n r | n r |
Expand Down
55 changes: 50 additions & 5 deletions tools/nni_trial_tool/hdfsClientUtility.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,55 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os
import posixpath
from pyhdfs import HdfsClient
from .log_utils import LogType, nni_log

def copyHdfsDirectoryToLocal(hdfsDirectory, localDirectory, hdfsClient):
    '''Recursively copy a directory from HDFS to local.

    Args:
        hdfsDirectory: HDFS directory to copy from.
        localDirectory: local directory to copy into; created if missing.
        hdfsClient: client object providing ``list_status`` (and whatever
            ``copyHdfsFileToLocal`` needs for the individual files).

    Raises:
        AssertionError: if a listed entry is neither 'DIRECTORY' nor 'FILE'.
        Exception: whatever ``list_status`` or the per-file copy raises; a
            listing failure is logged before being re-raised.
    '''
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(localDirectory, exist_ok=True)
    try:
        listing = hdfsClient.list_status(hdfsDirectory)
    except Exception as exception:
        nni_log(LogType.Error, 'List hdfs directory {0} error: {1}'.format(hdfsDirectory, str(exception)))
        # bare raise re-raises the active exception with its original traceback
        raise

    for f in listing:
        # HDFS side always uses '/' separators; local side uses the host's
        hdfsPath = posixpath.join(hdfsDirectory, f.pathSuffix)
        localPath = os.path.join(localDirectory, f.pathSuffix)
        if f.type == 'DIRECTORY':
            copyHdfsDirectoryToLocal(hdfsPath, localPath, hdfsClient)
        elif f.type == 'FILE':
            copyHdfsFileToLocal(hdfsPath, localPath, hdfsClient)
        else:
            raise AssertionError('unexpected type {}'.format(f.type))

def copyHdfsFileToLocal(hdfsFilePath, localFilePath, hdfsClient, override=True):
    '''Copy a single file from HDFS to local.

    Args:
        hdfsFilePath: HDFS path of the file to copy.
        localFilePath: local destination path.
        hdfsClient: client object providing ``exists``, ``get_file_status``
            and ``copy_to_local``.
        override: when True (default) an existing local file is removed and
            replaced; when False an existing local file is left untouched and
            the copy is skipped.

    Raises:
        Exception: if the HDFS path does not exist or is not a file, or
            whatever the client raises; failures are logged before re-raising.
    '''
    if not hdfsClient.exists(hdfsFilePath):
        raise Exception('HDFS file {} does not exist!'.format(hdfsFilePath))
    try:
        file_status = hdfsClient.get_file_status(hdfsFilePath)
        if file_status.type != 'FILE':
            raise Exception('HDFS file path {} is not a file'.format(hdfsFilePath))
    except Exception as exception:
        nni_log(LogType.Error, 'Get hdfs file {0} status error: {1}'.format(hdfsFilePath, str(exception)))
        raise

    if os.path.exists(localFilePath):
        if not override:
            # Honor override=False: previously the copy below still ran, which
            # (presumably, given copy_to_local writes the destination) clobbered
            # the existing file and made the flag a no-op.
            nni_log(LogType.Info, 'Local file {0} already exists, skip copying hdfs file {1}'.format(localFilePath, hdfsFilePath))
            return
        os.remove(localFilePath)
    try:
        hdfsClient.copy_to_local(hdfsFilePath, localFilePath)
    except Exception as exception:
        nni_log(LogType.Error, 'Copy hdfs file {0} to {1} error: {2}'.format(hdfsFilePath, localFilePath, str(exception)))
        raise
    nni_log(LogType.Info, 'Successfully copied hdfs file {0} to {1}, {2} bytes'.format(hdfsFilePath, localFilePath, file_status.length))

def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient):
'''Copy directory from local to hdfs'''
'''Copy directory from local to HDFS'''
if not os.path.exists(localDirectory):
raise Exception('Local Directory does not exist!')
hdfsClient.mkdirs(hdfsDirectory)
Expand All @@ -34,19 +79,19 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient):
try:
result = result and copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient)
except Exception as exception:
print(exception)
nni_log(LogType.Error, 'Copy local directory {0} to hdfs directory {1} error: {2}'.format(file_path, hdfs_directory, str(exception)))
result = False
else:
hdfs_file_path = os.path.join(hdfsDirectory, file)
try:
result = result and copyFileToHdfs(file_path, hdfs_file_path, hdfsClient)
except Exception as exception:
print(exception)
nni_log(LogType.Error, 'Copy local file {0} to hdfs {1} error: {2}'.format(file_path, hdfs_file_path, str(exception)))
result = False
return result

def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True):
'''Copy a local file to hdfs directory'''
'''Copy a local file to HDFS directory'''
if not os.path.exists(localFilePath):
raise Exception('Local file Path does not exist!')
if os.path.isdir(localFilePath):
Expand All @@ -60,5 +105,5 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True):
hdfsClient.copy_from_local(localFilePath, hdfsFilePath)
return True
except Exception as exception:
print(exception)
nni_log(LogType.Error, 'Copy local file {0} to hdfs file {1} error: {2}'.format(localFilePath, hdfsFilePath, str(exception)))
return False
Loading

0 comments on commit 6cf294e

Please sign in to comment.