merge master #128

Merged: 1 commit, Feb 24, 2019
28 changes: 20 additions & 8 deletions test/config_test.py
@@ -26,12 +26,11 @@
 import traceback

 from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
-    parse_max_duration_time, get_succeeded_trial_num, print_stderr
+    parse_max_duration_time, get_succeeded_trial_num, print_stderr, deep_update
 from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL


 def gen_new_config(config_file, training_service='local'):
     '''
     Generates temporary config file for integration test, the file
     should be deleted after testing.
     '''
@@ -41,7 +40,15 @@ def gen_new_config(config_file, training_service='local'):
     ts = get_yml_content('training_service.yml')[training_service]
     print(config)
     print(ts)
-    config.update(ts)
+
+    # hack for kubeflow trial config
+    if training_service == 'kubeflow':
+        ts['trial']['worker']['command'] = config['trial']['command']
+        config['trial'].pop('command')
+        if 'gpuNum' in config['trial']:
+            config['trial'].pop('gpuNum')
+
+    deep_update(config, ts)
     print(config)
     dump_yml_content(new_config_file, config)

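The hack above exists because the kubeflow trial spec nests the command under `trial.worker`, while the generic test configs keep it at `trial.command`; `gpuNum` likewise belongs in the worker spec. The merge then has to be `deep_update` rather than `dict.update`, or the training service's nested `trial` mapping would replace the base config's `trial` section wholesale. A minimal sketch with hypothetical values (`codeDir` is illustrative, not taken from this PR):

```python
# Sketch of the kubeflow hack: move the command under trial.worker,
# drop trial-level gpuNum, then deep-merge. Values are hypothetical.
from utils import deep_update  # added in this PR (see test/utils.py below)

config = {'trial': {'command': 'python3 mnist.py', 'codeDir': '.', 'gpuNum': 1}}
ts = {'trial': {'worker': {'replicas': 1, 'command': None}},
      'trainingServicePlatform': 'kubeflow'}

ts['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
    config['trial'].pop('gpuNum')

deep_update(config, ts)
# config['trial'] == {'codeDir': '.',
#                     'worker': {'replicas': 1, 'command': 'python3 mnist.py'}}
# config.update(ts) would instead have replaced 'trial' wholesale, losing codeDir.
```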
@@ -61,7 +68,7 @@ def run_test(config_file, training_service, local_gpu=False):
     proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
     assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

-    max_duration, max_trial_num = get_max_values(config_file)
+    max_duration, max_trial_num = get_max_values(new_config_file)
     sleep_interval = 3

     for _ in range(0, max_duration+30, sleep_interval):
@@ -90,6 +97,12 @@ def run(args):
         config_files = glob.glob('./config_test/**/*.test.yml')
     else:
         config_files = args.config.split(',')
+
+    if args.exclude is not None:
+        exclude_paths = args.exclude.split(',')
+        if exclude_paths:
+            for exclude_path in exclude_paths:
+                config_files = [x for x in config_files if exclude_path not in x]
     print(config_files)

     for config_file in config_files:
@@ -107,11 +120,10 @@ def run(args):
         subprocess.run(['nnictl', 'stop'])

 if __name__ == '__main__':
-    import tensorflow as tf
-    print('TF VERSION:', tf.__version__)
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", type=str, default=None)
-    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai'], default='local')
+    parser.add_argument("--exclude", type=str, default=None)
+    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
     parser.add_argument("--local_gpu", action='store_true')
     parser.add_argument("--preinstall", action='store_true')
     args = parser.parse_args()
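The new `--exclude` flag filters the collected config paths by plain substring match, so `--exclude multi_phase` (as both CI pipelines below pass) skips every test whose path contains that string. A small illustration of the semantics, with hypothetical paths:

```python
# --exclude uses substring matching on the glob'ed paths; the flag is
# comma-separated, like --config. Paths here are hypothetical.
config_files = [
    './config_test/examples/mnist.test.yml',
    './config_test/multi_phase/batch.test.yml',
]
exclude = 'multi_phase'
for exclude_path in exclude.split(','):
    config_files = [x for x in config_files if exclude_path not in x]
print(config_files)  # ['./config_test/examples/mnist.test.yml']
```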
81 changes: 81 additions & 0 deletions test/generate_ts_config.py
@@ -0,0 +1,81 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import argparse
from utils import get_yml_content, dump_yml_content

TRAINING_SERVICE_FILE = 'training_service.yml'

def update_training_service_config(args):
    config = get_yml_content(TRAINING_SERVICE_FILE)
    if args.nni_manager_ip is not None:
        config[args.ts]['nniManagerIp'] = args.nni_manager_ip
    if args.ts == 'pai':
        if args.pai_user is not None:
            config[args.ts]['paiConfig']['userName'] = args.pai_user
        if args.pai_pwd is not None:
            config[args.ts]['paiConfig']['passWord'] = args.pai_pwd
        if args.pai_host is not None:
            config[args.ts]['paiConfig']['host'] = args.pai_host
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['image'] = args.nni_docker_image
        if args.data_dir is not None:
            config[args.ts]['trial']['dataDir'] = args.data_dir
        if args.output_dir is not None:
            config[args.ts]['trial']['outputDir'] = args.output_dir
    elif args.ts == 'kubeflow':
        if args.nfs_server is not None:
            config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
        if args.nfs_path is not None:
            config[args.ts]['kubeflowConfig']['nfs']['path'] = args.nfs_path
        if args.keyvault_vaultname is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
        if args.keyvault_name is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['name'] = args.keyvault_name
        if args.azs_account is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['accountName'] = args.azs_account
        if args.azs_share is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['worker']['image'] = args.nni_docker_image

    dump_yml_content(TRAINING_SERVICE_FILE, config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
    parser.add_argument("--pai_user", type=str)
    parser.add_argument("--pai_pwd", type=str)
    parser.add_argument("--pai_host", type=str)
    parser.add_argument("--data_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # args for kubeflow
    parser.add_argument("--nfs_server", type=str)
    parser.add_argument("--nfs_path", type=str)
    parser.add_argument("--keyvault_vaultname", type=str)
    parser.add_argument("--keyvault_name", type=str)
    parser.add_argument("--azs_account", type=str)
    parser.add_argument("--azs_share", type=str)
    args = parser.parse_args()

    update_training_service_config(args)
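The script only rewrites keys whose flags are actually given, leaving the rest of training_service.yml untouched. For reference, a hypothetical direct call equivalent to the pipeline step below (all values are placeholders, not taken from this PR; run from the test/ directory so training_service.yml resolves):

```python
# Equivalent to the CLI call:
#   python3 generate_ts_config.py --ts pai --pai_user alice \
#       --pai_host http://pai.example.com --nni_docker_image msranni/nni:latest
# All values here are placeholders.
from argparse import Namespace
from generate_ts_config import update_training_service_config

args = Namespace(
    ts='pai',
    nni_manager_ip='10.0.0.5',
    nni_docker_image='msranni/nni:latest',
    pai_user='alice',
    pai_pwd='example-password',
    pai_host='http://pai.example.com',
    data_dir='hdfs://10.0.0.1:9000/alice/mnist',
    output_dir='hdfs://10.0.0.1:9000/alice/output',
)
update_training_service_config(args)  # rewrites training_service.yml in place
```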
52 changes: 52 additions & 0 deletions test/pipelines-it-kubeflow.yml
@@ -0,0 +1,52 @@
jobs:
- job: 'integration_test_kubeflow'
  pool: 'NNI CI KUBE CLI'

  variables:
    new_docker_img: msranni/nni.it.kb:latest

  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'

  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'

  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)

      echo 'updating docker file for installing nni from local...'
      # update the Dockerfile to install NNI in the docker image from the whl file built in the last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'

  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'

  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
          --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)

      cat training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
    displayName: 'integration test'
51 changes: 51 additions & 0 deletions test/pipelines-it-pai.yml
@@ -0,0 +1,51 @@
jobs:
- job: 'integration_test_pai'
  pool: 'NNI CI PAI CLI'

  variables:
    new_docker_img: msranni/nni.it.pai:latest

  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'

  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'

  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)

      echo 'updating docker file for installing nni from local...'
      # update the Dockerfile to install NNI in the docker image from the whl file built in the last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'

  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'

  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
          --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)

      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase
    displayName: 'integration test'
61 changes: 42 additions & 19 deletions test/training_service.yml
@@ -1,24 +1,47 @@
+kubeflow:
+  maxExecDuration: 15m
+  nniManagerIp:
+  kubeflowConfig:
+    operator: tf-operator
+    apiVersion: v1alpha2
+    storage: azureStorage
+    keyVault:
+      vaultName:
+      name:
+    azureStorage:
+      accountName:
+      azureShare:
+  trial:
+    worker:
+      replicas: 1
+      command:
+      gpuNum: 1
+      cpuNum: 1
+      memoryMB: 8192
+      image:
+  trainingServicePlatform: kubeflow
+
 local:
   trainingServicePlatform: local

-remote:
-  trainingServicePlatform: remote
-  machineList:
-  - ip:
-    port:
-    username:
-    passwd:
-
 pai:
-  trainingServicePlatform: pai
   nniManagerIp:
   maxExecDuration: 15m
   paiConfig:
-    userName:
-    passWord:
-    host:
+    host:
+    passWord:
+    userName:
+  trainingServicePlatform: pai
   trial:
-    gpuNum:
-    cpuNum:
-    memoryMB:
-    image: msranni/latest
-    dataDir:
-    outputDir:
+    gpuNum: 1
+    cpuNum: 1
+    dataDir:
+    image:
+    memoryMB: 8192
+    outputDir:
+remote:
+  machineList:
+  - ip:
+    passwd:
+    port:
+    username:
+  trainingServicePlatform: remote
14 changes: 14 additions & 0 deletions test/utils.py
@@ -19,6 +19,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 import contextlib
+import collections.abc
 import json
 import os
 import subprocess
@@ -118,3 +119,16 @@ def parse_max_duration_time(max_exec_duration):
     time = max_exec_duration[:-1]
     units_dict = {'s':1, 'm':60, 'h':3600, 'd':86400}
     return int(time) * units_dict[unit]
+
+def deep_update(source, overrides):
+    """Update a nested dictionary or similar mapping.
+
+    Modify ``source`` in place.
+    """
+    for key, value in overrides.items():
+        # Recurse into non-empty nested mappings so sibling keys in
+        # ``source`` survive; scalars and empty mappings overwrite.
+        # (collections.abc.Mapping: the bare collections.Mapping alias
+        # was removed in Python 3.10.)
+        if isinstance(value, collections.abc.Mapping) and value:
+            source[key] = deep_update(source.get(key, {}), value)
+        else:
+            source[key] = value
+    return source
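A quick check of the merge semantics (not part of the test suite; values are made up): nested mappings merge key by key, everything else overwrites.

```python
# deep_update merges nested mappings instead of replacing them.
base = {'trial': {'gpuNum': 1, 'command': 'python3 mnist.py'}}
deep_update(base, {'trial': {'gpuNum': 0}, 'maxExecDuration': '15m'})
assert base == {
    'trial': {'gpuNum': 0, 'command': 'python3 mnist.py'},
    'maxExecDuration': '15m',
}
# A plain dict.update would have replaced base['trial'] wholesale.
```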