diff --git a/src/kube-runtime/src/plugins/tensorboard/tensorboard.sh.template b/src/kube-runtime/src/plugins/tensorboard/tensorboard.sh.template index c15b2e7892..46bb47f114 100644 --- a/src/kube-runtime/src/plugins/tensorboard/tensorboard.sh.template +++ b/src/kube-runtime/src/plugins/tensorboard/tensorboard.sh.template @@ -21,10 +21,10 @@ set -o nounset set -o pipefail TENSORFLOW_VERSION=$(python -c 'import tensorflow as tf; print(tf.__version__)') -FIRST_VERSION_NUMBER=${TENSORFLOW_VERSION:0:1} -if [[ "$FIRST_VERSION_NUMBER" = "1" ]]; then +MAJOR_VERSION=${TENSORFLOW_VERSION:0:1} +if [[ "$MAJOR_VERSION" = "1" ]]; then tensorboard --logdir={{ logdir }} --port={{ port }} & -elif [[ "$FIRST_VERSION_NUMBER" = "2" ]]; then +elif [[ "$MAJOR_VERSION" = "2" ]]; then tensorboard --logdir={{ logdir }} --port={{ port }} --bind_all & else echo "Tensorflow version is ${TENSORFLOW_VERSION}, not support" diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index cb5b865a5d..bd81125864 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -337,7 +337,7 @@ const convertFrameworkDetail = async (framework) => { return detail; }; -const generateTaskRole = (frameworkName, taskRole, jobInfo, config, storageConfig, frameworkEnvList) => { +const generateTaskRole = (frameworkName, taskRole, jobInfo, frameworkEnvList, config, storageConfig) => { const ports = config.taskRoles[taskRole].resourcePerInstance.ports || {}; for (let port of ['ssh', 'http']) { if (!(port in ports)) { @@ -372,6 +372,21 @@ const generateTaskRole = (frameworkName, taskRole, jobInfo, config, storageConfi retryPolicy.maxRetryCount = config.taskRoles[taskRole].taskRetryCount || 0; } + const taskRoleEnvList = [ + { + name: 'PAI_CURRENT_TASK_ROLE_NAME', + value: taskRole, + }, + { + name: 'PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX', + valueFrom: { + fieldRef: { + fieldPath: `metadata.annotations['FC_TASK_INDEX']`, + }, + }, + }, + ]; + const frameworkTaskRole = { name: convertName(taskRole), taskNumber: config.taskRoles[taskRole].instances || 1, @@ -412,34 +427,12 @@ const generateTaskRole = (frameworkName, taskRole, jobInfo, config, storageConfi name: 'GANG_ALLOCATION', value: gangAllocation, }, - { - name: 'PAI_USER_NAME', - value: jobInfo.userName, - }, - { - name: 'PAI_JOB_NAME', - value: `${jobInfo.userName}~${jobInfo.jobName}`, - }, { name: 'STORAGE_CONFIGS', value: JSON.stringify(storageConfig), }, - { - name: 'PAI_TASK_ROLE_LIST', - value: Object.keys(config.taskRoles).join(','), - }, - { - name: 'PAI_CURRENT_TASK_ROLE_NAME', - value: taskRole, - }, - { - name: 'PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX', - valueFrom: { - fieldRef: { - fieldPath: `metadata.annotations['FC_TASK_INDEX']`, - }, - }, - }, + ...frameworkEnvList, + ...taskRoleEnvList, ], volumeMounts: [ { @@ -475,18 +468,7 @@ const generateTaskRole = (frameworkName, taskRole, jobInfo, config, storageConfi }, env: [ ...frameworkEnvList, - { - name: 'PAI_CURRENT_TASK_ROLE_NAME', - value: taskRole, - }, - { - name: 'PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX', - valueFrom: { - fieldRef: { - fieldPath: `metadata.annotations['FC_TASK_INDEX']`, - }, - }, - }, + ...taskRoleEnvList, // backward compatibility { name: 'PAI_TASK_INDEX', @@ -663,7 +645,7 @@ const generateFrameworkDescription = (frameworkName, virtualCluster, config, raw let totalGpuNumber = 0; for (let taskRole of Object.keys(config.taskRoles)) { totalGpuNumber += config.taskRoles[taskRole].resourcePerInstance.gpu * config.taskRoles[taskRole].instances; - const taskRoleDescription = generateTaskRole(frameworkName, taskRole, jobInfo, config, storageConfig, frameworkEnvList); + const taskRoleDescription = generateTaskRole(frameworkName, taskRole, jobInfo, frameworkEnvList, config, storageConfig); taskRoleDescription.task.pod.spec.priorityClassName = `${encodeName(frameworkName)}-priority`; frameworkDescription.spec.taskRoles.push(taskRoleDescription); }