Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge master #230

Merged
merged 4 commits into from
Feb 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/en_US/Tuner/HyperbandAdvisor.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Hyperband on NNI
[Hyperband][1] is a popular automl algorithm. The basic idea of Hyperband is that it creates several buckets, each bucket has `n` randomly generated hyperparameter configurations, each configuration uses `r` resource (e.g., epoch number, batch number). After the `n` configurations is finished, it chooses top `n/eta` configurations and runs them using increased `r*eta` resource. At last, it chooses the best configuration it has found so far.

## 2. Implementation with fully parallelism
Frist, this is an example of how to write an automl algorithm based on MsgDispatcherBase, rather than Tuner and Assessor. Hyperband is implemented in this way because it integrates the functions of both Tuner and Assessor, thus, we call it advisor.
First, this is an example of how to write an automl algorithm based on MsgDispatcherBase, rather than Tuner and Assessor. Hyperband is implemented in this way because it integrates the functions of both Tuner and Assessor, thus, we call it advisor.

Second, this implementation fully leverages Hyperband's internal parallelism. More specifically, the next bucket is not started strictly after the current bucket, instead, it starts when there is available resource.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ class PAIK8STrainingService extends PAITrainingService {

public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break;

case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG:
this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService));
this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
Expand Down
20 changes: 11 additions & 9 deletions tools/nni_cmd/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,15 +407,8 @@ def setPathCheck(key):
}

machine_list_schema = {
Optional('machineList'):[Or({
'ip': setType('ip', str),
Optional('port'): setNumberRange('port', int, 1, 65535),
'username': setType('username', str),
'passwd': setType('passwd', str),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
Optional('useActiveGpu'): setType('useActiveGpu', bool)
}, {
Optional('machineList'):[Or(
{
'ip': setType('ip', str),
Optional('port'): setNumberRange('port', int, 1, 65535),
'username': setType('username', str),
Expand All @@ -424,6 +417,15 @@ def setPathCheck(key):
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
Optional('useActiveGpu'): setType('useActiveGpu', bool)
},
{
'ip': setType('ip', str),
Optional('port'): setNumberRange('port', int, 1, 65535),
'username': setType('username', str),
'passwd': setType('passwd', str),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
Optional('useActiveGpu'): setType('useActiveGpu', bool)
})]
}

Expand Down
4 changes: 3 additions & 1 deletion tools/nni_cmd/nnictl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,13 @@ def remote_clean(machine_list, experiment_id=None):
userName = machine.get('username')
host = machine.get('ip')
port = machine.get('port')
sshKeyPath = machine.get('sshKeyPath')
passphrase = machine.get('passphrase')
if experiment_id:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', experiment_id])
else:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments'])
sftp = create_ssh_sftp_client(host, port, userName, passwd)
sftp = create_ssh_sftp_client(host, port, userName, passwd, sshKeyPath, passphrase)
print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir))
remove_remote_directory(sftp, remote_dir)

Expand Down
8 changes: 6 additions & 2 deletions tools/nni_cmd/ssh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,16 @@ def copy_remote_directory_to_local(sftp, remote_path, local_path):
except Exception:
pass

def create_ssh_sftp_client(host_ip, port, username, password):
def create_ssh_sftp_client(host_ip, port, username, password, ssh_key_path, passphrase):
'''create ssh client'''
try:
paramiko = check_environment()
conn = paramiko.Transport(host_ip, port)
conn.connect(username=username, password=password)
if ssh_key_path is not None:
ssh_key = paramiko.RSAKey.from_private_key_file(ssh_key_path, password=passphrase)
conn.connect(username=username, pkey=ssh_key)
else:
conn.connect(username=username, password=password)
sftp = paramiko.SFTPClient.from_transport(conn)
return sftp
except Exception as exception:
Expand Down
6 changes: 4 additions & 2 deletions tools/nni_cmd/tensorboard_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ def copy_data_from_remote(args, nni_config, trial_content, path_list, host_list,
machine_dict = {}
local_path_list = []
for machine in machine_list:
machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username']}
machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username'],
'sshKeyPath': machine.get('sshKeyPath'), 'passphrase': machine.get('passphrase')}
for index, host in enumerate(host_list):
local_path = os.path.join(temp_nni_path, trial_content[index].get('id'))
local_path_list.append(local_path)
print_normal('Copying log data from %s to %s' % (host + ':' + path_list[index], local_path))
sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd'])
sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd'],
machine_dict[host]['sshKeyPath'], machine_dict[host]['passphrase'])
copy_remote_directory_to_local(sftp, path_list[index], local_path)
print_normal('Copy done!')
return local_path_list
Expand Down