Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Support reuse mode for pipeline #4310

Merged
merged 67 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
bf41180
add pipeline for adl
SparkSnail Dec 25, 2020
c9f1fa1
fix agent pool
SparkSnail Dec 25, 2020
5c8655f
fix build
SparkSnail Dec 26, 2020
3aec741
fix build
SparkSnail Dec 26, 2020
a34cb20
fix build
SparkSnail Dec 26, 2020
650f9b0
fix build
SparkSnail Dec 26, 2020
a2965e7
fix build
SparkSnail Dec 26, 2020
bf5a0bc
fix sed command
SparkSnail Dec 26, 2020
8dbfb61
fix build
SparkSnail Dec 26, 2020
41d8844
fix docker image
SparkSnail Dec 26, 2020
5acad1d
fix docker
SparkSnail Dec 26, 2020
57c9be2
fix test cases in adl
SparkSnail Dec 27, 2020
a00fdc3
fix str
SparkSnail Dec 27, 2020
7a230ff
fix searchspace path
SparkSnail Dec 27, 2020
291507d
fix build
SparkSnail Dec 27, 2020
100e2ef
fix config file path
SparkSnail Dec 27, 2020
df81878
fix accessor test
SparkSnail Dec 27, 2020
ccc018e
add waittime
SparkSnail Dec 27, 2020
63c915b
add main_adl test case
SparkSnail Dec 27, 2020
33c15a2
fix build
SparkSnail Dec 27, 2020
09e95b7
fix build
SparkSnail Dec 27, 2020
29d2b61
Merge branch 'v2.0' of https://github.com/microsoft/nni into dev-pipe…
SparkSnail Dec 28, 2020
32b3242
fix comments
SparkSnail Dec 30, 2020
bdeadb7
fix conflict
SparkSnail Mar 25, 2021
36049d6
support aml pipeline
SparkSnail Mar 25, 2021
3c02c82
fix build
SparkSnail Apr 2, 2021
42793cd
fix build
SparkSnail Apr 2, 2021
09338e5
fix build
SparkSnail Apr 2, 2021
0c5a4ba
fix build
SparkSnail Apr 2, 2021
fde4fcc
fix build
SparkSnail Apr 2, 2021
1b41ed6
fix build
SparkSnail Apr 2, 2021
9acfe4e
fix build
SparkSnail Apr 2, 2021
9b15f30
fix build
SparkSnail Apr 2, 2021
cacac59
fix build
SparkSnail Apr 2, 2021
615eda2
fix build
SparkSnail Apr 2, 2021
79f3b6e
fix build
SparkSnail Apr 2, 2021
9d27257
fix build
SparkSnail Apr 2, 2021
fb9eb8d
Merge branch 'master' of https://github.com/microsoft/nni into dev-pi…
SparkSnail May 11, 2021
868758d
Merge branch 'master' of https://github.com/microsoft/nni into dev-pi…
SparkSnail May 12, 2021
1ca40f6
Merge branch 'master' of https://github.com/microsoft/nni into dev-pi…
SparkSnail Sep 18, 2021
61edf12
Dev fix aml (#4197)
SparkSnail Sep 18, 2021
7220d7b
upgrade stop logic
SparkSnail Sep 18, 2021
628a0b8
Merge branch 'dev-pipeline' of https://github.com/microsoft/nni into …
SparkSnail Sep 18, 2021
6e3b976
add log
SparkSnail Sep 18, 2021
1de5b85
fix update
SparkSnail Sep 18, 2021
b8ae8f5
add more log
SparkSnail Sep 18, 2021
9ec4df5
remove unused code
SparkSnail Sep 18, 2021
e727f7c
remove unused code
SparkSnail Sep 18, 2021
ead2a8f
add stop logic
SparkSnail Sep 18, 2021
28e7bd3
fix comments
SparkSnail Sep 24, 2021
9a50101
Merge branch 'master' of https://github.com/microsoft/nni into dev-pi…
SparkSnail Oct 29, 2021
8d7a9ad
debug
SparkSnail Oct 29, 2021
f24d733
debug
SparkSnail Nov 1, 2021
1075f38
add debug log
SparkSnail Nov 1, 2021
d6b11b6
debug local
SparkSnail Nov 1, 2021
853cd02
debug setting.json
SparkSnail Nov 1, 2021
9a99775
remove unused code
SparkSnail Nov 1, 2021
c79cc40
remove unused code
SparkSnail Nov 1, 2021
e024e39
remove blank
SparkSnail Nov 1, 2021
d6e2faa
fix tslint
SparkSnail Nov 1, 2021
14064cf
remove unused code
SparkSnail Nov 1, 2021
87bde98
fix build
SparkSnail Nov 10, 2021
a1a9156
fix build
SparkSnail Nov 12, 2021
b5e93bd
add frameworkcontroller
SparkSnail Nov 12, 2021
b2b5b44
Merge branch 'master' of https://github.com/microsoft/nni into dev-pi…
SparkSnail Nov 12, 2021
b828733
revert
SparkSnail Nov 12, 2021
1602793
remove unused change
SparkSnail Nov 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions pipelines/integration-test-frameworkcontroller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,19 @@ jobs:
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread
displayName: Integration test

- script: |
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts frameworkcontroller \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --reuse_mode True --exclude multi-phase,multi-thread
displayName: Integration test (reuse mode)
17 changes: 17 additions & 0 deletions pipelines/integration-test-kubeflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,20 @@ jobs:
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread
displayName: Integration test

- script: |
set -e
cd test
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
python3 nni_test/nnitest/generate_ts_config.py \
--ts kubeflow \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --reuse_mode True --exclude multi-phase,multi-thread
displayName: Integration test (reuse mode)
50 changes: 50 additions & 0 deletions test/config/training_service_v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,53 @@ hybrid:
resourceGroup:
workspaceName:
computeTarget:
kubeflow:
trialGpuNumber: 0
trialConcurrency: 2
maxTrialNumber: 2
nniManagerIp:
trainingService:
reuseMode: true
platform: kubeflow
worker:
command:
code_directory:
dockerImage:
cpuNumber: 1
gpuNumber: 0
memorySize: 8192
replicas: 1
operator: tf-operator
storage:
storageType: azureStorage
azureAccount:
azureShare:
keyVaultName:
keyVaultKey:
apiVersion: v1
frameworkcontroller:
trialGpuNumber: 0
trialConcurrency: 2
maxTrialNumber: 2
nniManagerIp:
trainingService:
reuseMode: true
platform: frameworkcontroller
serviceAccountName: frameworkcontroller
taskRoles:
- name: worker
dockerImage:
taskNumber: 1
command:
gpuNumber: 0
cpuNumber: 1
memorySize: 8192
framework_attempt_completion_policy:
min_failed_task_count: 1
minSucceedTaskCount: 1
storage:
storageType: azureStorage
azureAccount:
azureShare:
keyVaultName:
keyVaultKey:
23 changes: 21 additions & 2 deletions test/nni_test/nnitest/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def update_training_service_config(args):
config[args.ts]['trial']['virtualCluster'] = args.vc
if args.debug is not None:
config[args.ts]['debug'] = args.debug.lower() == 'true'
elif args.ts == 'kubeflow':
elif args.ts == 'kubeflow' and args.reuse_mode == 'False':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need `args.reuse_mode.lower() == 'false'` here, like the case-insensitive check on L37? Either is fine with me.

if args.nfs_server is not None:
config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None:
Expand All @@ -50,7 +50,16 @@ def update_training_service_config(args):
config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller':
elif args.ts == 'kubeflow' and args.reuse_mode == 'True':
config = get_yml_content(TRAINING_SERVICE_FILE_V2)
config[args.ts]['trainingService']['worker']['dockerImage'] = args.nni_docker_image
config[args.ts]['trainingService']['storage']['azureAccount'] = args.azs_account
config[args.ts]['trainingService']['storage']['azureShare'] = args.azs_share
config[args.ts]['trainingService']['storage']['keyVaultName'] = args.keyvault_name
config[args.ts]['trainingService']['storage']['keyVaultKey'] = args.keyvault_vaultname
config[args.ts]['nni_manager_ip'] = args.nni_manager_ip
dump_yml_content(TRAINING_SERVICE_FILE_V2, config)
elif args.ts == 'frameworkcontroller' and args.reuse_mode == 'False':
if args.nfs_server is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None:
Expand All @@ -65,6 +74,15 @@ def update_training_service_config(args):
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['taskRoles'][0]['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller' and args.reuse_mode == 'True':
config = get_yml_content(TRAINING_SERVICE_FILE_V2)
config[args.ts]['trainingService']['taskRoles'][0]['dockerImage'] = args.nni_docker_image
config[args.ts]['trainingService']['storage']['azureAccount'] = args.azs_account
config[args.ts]['trainingService']['storage']['azureShare'] = args.azs_share
config[args.ts]['trainingService']['storage']['keyVaultName'] = args.keyvault_name
config[args.ts]['trainingService']['storage']['keyVaultKey'] = args.keyvault_vaultname
config[args.ts]['nni_manager_ip'] = args.nni_manager_ip
dump_yml_content(TRAINING_SERVICE_FILE_V2, config)
elif args.ts == 'remote':
if args.remote_user is not None:
config[args.ts]['machineList'][0]['username'] = args.remote_user
Expand Down Expand Up @@ -134,6 +152,7 @@ def update_training_service_config(args):
parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
parser.add_argument("--reuse_mode", type=str, default='False')
# args for remote with shared storage
parser.add_argument("--azurestoragetoken", type=str)
parser.add_argument("--nfs_server", type=str)
Expand Down
15 changes: 11 additions & 4 deletions test/nni_test/nnitest/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,27 @@
it_variables = {}


def update_training_service_config(config, training_service, config_file_path, nni_source_dir):
def update_training_service_config(config, training_service, config_file_path, nni_source_dir, reuse_mode='False'):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))

# hack for kubeflow trial config
if training_service == 'kubeflow':
if training_service == 'kubeflow' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
elif training_service == 'kubeflow' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config['trainingService']['worker']['command'] = config['trialCommand']

if training_service == 'frameworkcontroller':
if training_service == 'frameworkcontroller' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
elif training_service == 'frameworkcontroller' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config['trainingService']['taskRoles'][0]['command'] = config['trialCommand']

if training_service == 'adl':
# hack for adl trial config, codeDir in adl mode refers to path in container
Expand Down Expand Up @@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir)
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir, args.reuse_mode)

# generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp'
Expand Down Expand Up @@ -313,6 +319,7 @@ def run(args):
parser.add_argument("--nni_source_dir", type=str, default='../')
parser.add_argument("--cases", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--reuse_mode", type=str, default='False')
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai',
'kubeflow', 'frameworkcontroller', 'adl', 'aml', 'hybrid'], default='local')
args = parser.parse_args()
Expand Down