Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
[Rest Server] Update job priority class (#4131)
Browse files Browse the repository at this point in the history
Update job priority class

* Add a configuration switch (`enable-priority-class`) to enable the priority class used for job FIFO ordering
* Remove the Framework owner reference from the priority class (ref #4117)
  • Loading branch information
abuccts authored Jan 10, 2020
1 parent 4e102ec commit 2ecfbde
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 38 deletions.
1 change: 1 addition & 0 deletions src/rest-server/config/rest-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ github-owner: Microsoft
github-repository: pai
github-path: marketplace
debugging-reservation-seconds: 604800
enable-priority-class: false
1 change: 1 addition & 0 deletions src/rest-server/config/rest_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def run(self):
service_object_model['github-repository'] = self.service_configuration['github-repository']
service_object_model['github-path'] = self.service_configuration['github-path']
service_object_model['debugging-reservation-seconds'] = self.service_configuration['debugging-reservation-seconds']
service_object_model['enable-priority-class'] = self.service_configuration['enable-priority-class']
service_object_model['etcd-uris'] = ','.join('http://{0}:4001'.format(host['hostip'])
for host in machine_list
if host.get('k8s-role') == 'master')
Expand Down
2 changes: 2 additions & 0 deletions src/rest-server/deploy/rest-server.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ spec:
- name: LAUNCHER_TYPE
value: {{ cluster_cfg["cluster"]["common"]["cluster-type"] }}
{%- if cluster_cfg["cluster"]["common"]["cluster-type"] == "k8s" %}
- name: LAUNCHER_PRIORITY_CLASS
value: "{{ cluster_cfg['rest-server']['enable-priority-class'] }}"
- name: LAUNCHER_RUNTIME_IMAGE
value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}kube-runtime:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}
- name: LAUNCHER_RUNTIME_IMAGE_PULL_SECRETS
Expand Down
3 changes: 3 additions & 0 deletions src/rest-server/src/config/launcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ const k8sLauncherConfigSchema = Joi.object().keys({
hivedWebserviceUri: Joi.string()
.uri()
.required(),
enabledPriorityClass: Joi.boolean()
.required(),
apiVersion: Joi.string()
.required(),
podGracefulDeletionTimeoutSec: Joi.number()
Expand Down Expand Up @@ -198,6 +200,7 @@ if (launcherType === 'yarn') {
} else if (launcherType === 'k8s') {
launcherConfig = {
hivedWebserviceUri: process.env.HIVED_WEBSERVICE_URI,
enabledPriorityClass: process.env.LAUNCHER_PRIORITY_CLASS === 'true',
apiVersion: 'frameworkcontroller.microsoft.com/v1',
podGracefulDeletionTimeoutSec: 1800,
scheduler: process.env.LAUNCHER_SCHEDULER,
Expand Down
53 changes: 15 additions & 38 deletions src/rest-server/src/models/v2/job/k8s.js
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,9 @@ const generateFrameworkDescription = (frameworkName, virtualCluster, config, raw
for (let taskRole of Object.keys(config.taskRoles)) {
totalGpuNumber += config.taskRoles[taskRole].resourcePerInstance.gpu * config.taskRoles[taskRole].instances;
const taskRoleDescription = generateTaskRole(frameworkName, taskRole, jobInfo, frameworkEnvList, config, storageConfig);
taskRoleDescription.task.pod.spec.priorityClassName = `${encodeName(frameworkName)}-priority`;
if (launcherConfig.enabledPriorityClass) {
taskRoleDescription.task.pod.spec.priorityClassName = `${encodeName(frameworkName)}-priority`;
}
frameworkDescription.spec.taskRoles.push(taskRoleDescription);
}
frameworkDescription.metadata.annotations.totalGpuNumber = `${totalGpuNumber}`;
Expand Down Expand Up @@ -694,32 +696,6 @@ const createPriorityClass = async (frameworkName, priority) => {
}
};

// Best-effort merge-patch of the priority class named
// `${encodeName(frameworkName)}-priority`: sets its metadata.ownerReferences
// to a single reference to the Framework identified by frameworkUid.
//
// frameworkName: framework name; passed through encodeName() to build both
//                the priority class name and the owner reference name.
// frameworkUid:  uid written into the owner reference.
//
// Resolves to undefined. Any error raised while building or sending the
// request is logged as a warning and is NOT rethrown.
const patchPriorityClassOwner = async (frameworkName, frameworkUid) => {
  try {
    // Work on a copy so the shared launcherConfig.requestHeaders object is
    // not modified; only this request gets the merge-patch content type.
    const patchHeaders = {
      ...launcherConfig.requestHeaders,
      'Content-Type': 'application/merge-patch+json',
    };
    const client = k8sModel.getClient();
    const priorityClassUrl = launcherConfig.priorityClassPath(`${encodeName(frameworkName)}-priority`);
    // The Framework is recorded as a non-controller owner that does not
    // block owner deletion.
    const frameworkOwner = {
      apiVersion: launcherConfig.apiVersion,
      kind: 'Framework',
      name: encodeName(frameworkName),
      uid: frameworkUid,
      controller: false,
      blockOwnerDeletion: false,
    };
    await client.request({
      method: 'patch',
      url: priorityClassUrl,
      headers: patchHeaders,
      data: {
        metadata: {
          ownerReferences: [frameworkOwner],
        },
      },
    });
  } catch (err) {
    logger.warn('Failed to patch owner reference for priority class', err);
  }
};

const deletePriorityClass = async (frameworkName) => {
try {
await k8sModel.getClient().delete(
Expand Down Expand Up @@ -896,15 +872,17 @@ const put = async (frameworkName, config, rawConfig) => {

// calculate pod priority
// reference: https://github.com/microsoft/pai/issues/3704
let jobPriority = 0;
if (launcherConfig.enabledHived) {
jobPriority = parseInt(Object.values(config.taskRoles)[0].hivedPodSpec.priority);
jobPriority = Math.min(Math.max(jobPriority, -1), 126);
if (launcherConfig.enabledPriorityClass) {
let jobPriority = 0;
if (launcherConfig.enabledHived) {
jobPriority = parseInt(Object.values(config.taskRoles)[0].hivedPodSpec.priority);
jobPriority = Math.min(Math.max(jobPriority, -1), 126);
}
const jobCreationTime = Math.floor(new Date() / 1000) & (Math.pow(2, 23) - 1);
const podPriority = - (((126 - jobPriority) << 23) + jobCreationTime);
// create priority class
await createPriorityClass(frameworkName, podPriority);
}
const jobCreationTime = Math.floor(new Date() / 1000) & (Math.pow(2, 23) - 1);
const podPriority = - (((126 - jobPriority) << 23) + jobCreationTime);
// create priority class
await createPriorityClass(frameworkName, podPriority);

// send request to framework controller
let response;
Expand All @@ -921,19 +899,18 @@ const put = async (frameworkName, config, rawConfig) => {
} else {
// do not await for delete
auths.length && deleteSecret(frameworkName);
deletePriorityClass(frameworkName);
launcherConfig.enabledPriorityClass && deletePriorityClass(frameworkName);
throw error;
}
}
if (response.status !== status('Created')) {
// do not await for delete
auths.length && deleteSecret(frameworkName);
deletePriorityClass(frameworkName);
launcherConfig.enabledPriorityClass && deletePriorityClass(frameworkName);
throw createError(response.status, 'UnknownError', response.data.message);
}
// do not await for patch
auths.length && patchSecretOwner(frameworkName, response.data.metadata.uid);
patchPriorityClassOwner(frameworkName, response.data.metadata.uid);
};

const execute = async (frameworkName, executionType) => {
Expand Down

0 comments on commit 2ecfbde

Please sign in to comment.