Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Fix OpenPAI IT (#4057)
Browse files: browse the repository at this point in the history
  • Loading branch information
liuzhe-lz authored Aug 11, 2021
1 parent 56da3c3 commit 76152d4
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 10 deletions.
11 changes: 6 additions & 5 deletions pipelines/integration-test-openpai-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ jobs:
--nni_docker_image nnidev/nni-nightly \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \
--container_nfs_mount_path $(container_nfs_mount_path) \
--nni_manager_ip $(manager_ip) \
--vc nni
--vc nni \
--debug true
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: Integration test
Expand All @@ -82,8 +83,8 @@ jobs:
--nni_docker_image nnidev/nni-nightly \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \
--container_nfs_mount_path $(container_nfs_mount_path) \
--nni_manager_ip $(manager_ip) \
--vc nni
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
Expand Down
3 changes: 3 additions & 0 deletions test/nni_test/nnitest/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def update_training_service_config(args):
config[args.ts]['trial']['paiStorageConfigName'] = args.pai_storage_config_name
if args.vc is not None:
config[args.ts]['trial']['virtualCluster'] = args.vc
if args.debug is not None:
config[args.ts]['debug'] = args.debug.lower() == 'true'
elif args.ts == 'kubeflow':
if args.nfs_server is not None:
config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
Expand Down Expand Up @@ -146,6 +148,7 @@ def update_training_service_config(args):
parser.add_argument("--pai_storage_config_name", type=str)
parser.add_argument("--nni_manager_nfs_mount_path", type=str)
parser.add_argument("--container_nfs_mount_path", type=str)
parser.add_argument("--debug", type=str)
# args for kubeflow and frameworkController
parser.add_argument("--nfs_path", type=str)
parser.add_argument("--keyvault_vaultname", type=str)
Expand Down
2 changes: 1 addition & 1 deletion ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,4 @@ export function flattenConfig<T>(config: ExperimentConfig, platform: string): T
Object.assign(flattened, config.trainingService);
}
return <T>flattened;
}
}
3 changes: 2 additions & 1 deletion ts/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class PAITrainingService implements TrainingService {
this.paiTokenUpdateInterval = 7200000; //2hours
this.log.info('Construct paiBase training service.');
this.config = flattenConfig(config, 'openpai');
this.versionCheck = !this.config.debug;
this.paiJobRestServer = new PAIJobRestServer(this);
this.paiToken = this.config.token;
this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http';
Expand All @@ -78,7 +79,7 @@ class PAITrainingService implements TrainingService {

private async copyTrialCode(): Promise<void> {
await validateCodeDir(this.config.trialCodeDirectory);
const nniManagerNFSExpCodeDir = path.join(this.config.trialCodeDirectory, this.experimentId, 'nni-code');
const nniManagerNFSExpCodeDir = path.join(this.config.localStorageMountPoint, this.experimentId, 'nni-code');
await execMkdir(nniManagerNFSExpCodeDir);
this.log.info(`Starting copy codeDir data from ${this.config.trialCodeDirectory} to ${nniManagerNFSExpCodeDir}`);
await execCopydir(this.config.trialCodeDirectory, nniManagerNFSExpCodeDir);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ class RouterTrainingService implements TrainingService {
const instance = new RouterTrainingService();
instance.log = getLogger('RouterTrainingService');
const platform = Array.isArray(config.trainingService) ? 'hybrid' : config.trainingService.platform;
if (platform === 'remote' && !(<RemoteConfig>config.trainingService).reuseMode) {
if (platform === 'remote' && (<RemoteConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new RemoteMachineTrainingService(config);
} else if (platform === 'openpai' && !(<OpenpaiConfig>config.trainingService).reuseMode) {
} else if (platform === 'openpai' && (<OpenpaiConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new PAITrainingService(config);
} else if (platform === 'kubeflow' && !(<KubeflowConfig>config.trainingService).reuseMode) {
} else if (platform === 'kubeflow' && (<KubeflowConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new KubeflowTrainingService();
} else {
instance.internalTrainingService = await TrialDispatcher.construct(config);
Expand Down

0 comments on commit 76152d4

Please sign in to comment.