Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Fix localTrainingService cancel logic and nnictl logic (#334)
Browse files Browse the repository at this point in the history
Fix nnictl stop logic
Fix localTrainingService cancelJob logic
Show port information in the "nnictl experiment list" command.
Show more information when config file validation fails.
Add logic to nnictl to detect whether the adjacent port is in use when the platform is PAI
  • Loading branch information
SparkSnail authored Nov 9, 2018
1 parent 55493ed commit a3f48b8
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 12 deletions.
17 changes: 12 additions & 5 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,16 @@ class LocalTrainingService implements TrainingService {
while (!this.stopping) {
while (this.jobQueue.length !== 0) {
const trialJobId: string = this.jobQueue[0];
const [success, resource] = this.tryGetAvailableResource();
if (!success) {
break;
const trialJobDeatil = this.jobMap.get(trialJobId)
if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING'){
const [success, resource] = this.tryGetAvailableResource();
if (!success) {
break;
}
this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource);
}
this.occupyResource(resource);
this.jobQueue.shift();
await this.runTrialJob(trialJobId, resource);
}
await delay(5000);
}
Expand Down Expand Up @@ -249,6 +252,10 @@ class LocalTrainingService implements TrainingService {
if (trialJob === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
}
if (trialJob.pid === undefined){
this.setTrialJobStatus(trialJob, 'USER_CANCELED');
return;
}
if (trialJob.form.jobType === 'TRIAL') {
await tkill(trialJob.pid, 'SIGKILL');
} else if (trialJob.form.jobType === 'HOST') {
Expand Down
2 changes: 1 addition & 1 deletion tools/nni_cmd/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
'%s\n' \
'-----------------------------------------------------------------------\n'

EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s StartTime: %s EndTime: %s \n'
EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s Port: %s StartTime: %s EndTime: %s \n'

PACKAGE_REQUIREMENTS = {
'SMAC': 'smac_tuner'
Expand Down
9 changes: 8 additions & 1 deletion tools/nni_cmd/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,14 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
'''Run nni manager process'''
nni_config = Config(config_file_name)
if detect_port(port):
print_error('Port %s is used by another process, please reset the port!' % port)
print_error('Port %s is used by another process, please reset the port!\n' \
'You could use \'nnictl create --help\' to get help information' % port)
exit(1)

if platform == 'pai' and detect_port(int(port) + 1):
print_error('PAI mode need an additional adjacent port %d, and the port %d is used by another process!\n' \
'You could set another port to start experiment!\n' \
'You could use \'nnictl create --help\' to get help information' % ((int(port) + 1), (int(port) + 1)))
exit(1)

print_normal('Starting restful server...')
Expand Down
3 changes: 2 additions & 1 deletion tools/nni_cmd/launcher_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ def validate_common_content(experiment_config):
experiment_config['machineList'][index]['port'] = 22

except Exception as exception:
raise Exception(exception)
print_error('Your config file is not correct, please check your config file content!\n%s' % exception)
exit(1)

def parse_tuner_content(experiment_config):
'''Validate whether tuner in experiment_config is valid'''
Expand Down
11 changes: 7 additions & 4 deletions tools/nni_cmd/nnictl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def check_experiment_id(args):
experiment_information = ""
for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1)
elif not running_experiment_list:
Expand Down Expand Up @@ -96,7 +96,7 @@ def parse_ids(args):
experiment_information = ""
for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1)
else:
Expand Down Expand Up @@ -184,7 +184,10 @@ def stop_experiment(args):
if running:
response = rest_delete(experiment_url(rest_port), 20)
if not response or not check_response(response):
print_error('Stop experiment failed!')
if response:
print_error(response.text)
else:
print_error('No response from restful server!')
stop_rest_result = False
#sleep to wait rest handler done
time.sleep(3)
Expand Down Expand Up @@ -365,7 +368,7 @@ def experiment_list(args):
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = ""
for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)

0 comments on commit a3f48b8

Please sign in to comment.