
Habana to main #1884

Merged
merged 38 commits from f/accelerator-support into main
Sep 29, 2023
38 commits
ff64ff2
Added accelerator CRD (#1451)
Gkrumbach07 Jul 3, 2023
a6c7798
add copy to clipboard to k8 name popover
Gkrumbach07 Jul 20, 2023
4b2f50b
Merge pull request #1561 from Gkrumbach07/copy-tooltip
openshift-merge-robot Jul 26, 2023
50839ac
added gpu migration
Gkrumbach07 Jul 27, 2023
84c2231
added accelerator detection
Gkrumbach07 Jul 28, 2023
391cbca
added accelerator detection line
Gkrumbach07 Aug 1, 2023
26da289
fix error state in migration
Gkrumbach07 Aug 1, 2023
ab07f22
added more resource types
Gkrumbach07 Aug 1, 2023
9387956
added accelerator UI user flow
Gkrumbach07 Jun 30, 2023
871ca4e
Squashed commit of the following:
Gkrumbach07 Aug 7, 2023
34a2f1c
sqush
Gkrumbach07 Aug 7, 2023
3694f9e
Squashed commit of the following:
Gkrumbach07 Aug 7, 2023
7f0b159
Merge branch 'accelerator-support' into accelerator-cr
Gkrumbach07 Aug 8, 2023
5509fa9
Merge pull request #1555 from Gkrumbach07/accelerator-cr
openshift-merge-robot Aug 8, 2023
1a1da24
Merge pull request #1618 from Gkrumbach07/migration
openshift-merge-robot Aug 8, 2023
dba676e
Merge pull request #1628 from Gkrumbach07/accelerator-detection
openshift-merge-robot Aug 8, 2023
e5717c3
bug fixes
Gkrumbach07 Aug 8, 2023
607fe26
update wording
Gkrumbach07 Aug 10, 2023
55d3089
Merge pull request #1645 from Gkrumbach07/accelerator-support
openshift-merge-robot Aug 10, 2023
fc89a4e
Merge branch 'main' into accelerator-support
lucferbux Aug 10, 2023
c8f2767
fix lint errors
Gkrumbach07 Aug 10, 2023
e555aaf
Merge pull request #1668 from Gkrumbach07/accelerator-support
openshift-merge-robot Aug 10, 2023
095c09c
Added support for existing settings
Gkrumbach07 Aug 14, 2023
2e152ee
Merge pull request #1677 from Gkrumbach07/accelerator-support
openshift-merge-robot Sep 5, 2023
a4bb172
add rbac accelerator role
Gkrumbach07 Sep 5, 2023
2a300bc
revert add rbac accelerator role
Gkrumbach07 Sep 5, 2023
84ca02e
add rbac accelerator role
Gkrumbach07 Sep 5, 2023
1afaee2
Merge pull request #1753 from Gkrumbach07/revert-commit
openshift-merge-robot Sep 5, 2023
1fe5489
Merge pull request #1754 from Gkrumbach07/add-roles
openshift-merge-robot Sep 6, 2023
a92ab5b
accelerator minor bug fixes
Gkrumbach07 Sep 6, 2023
dcd23ec
Merge pull request #1764 from Gkrumbach07/minor-fixes
openshift-merge-robot Sep 8, 2023
e529890
fix detection logic
Gkrumbach07 Sep 26, 2023
49bfc75
Merge pull request #1865 from Gkrumbach07/fix-detection
openshift-merge-robot Sep 26, 2023
548d781
update cluster role
Gkrumbach07 Sep 28, 2023
f00119e
Merge pull request #1877 from Gkrumbach07/update-service-role
openshift-merge-robot Sep 28, 2023
ab90061
move from cluster role to role
Gkrumbach07 Sep 28, 2023
1216427
Merge pull request #1879 from Gkrumbach07/update-service-role
openshift-merge-robot Sep 28, 2023
de01574
Merge branch 'main' into f/accelerator-support
andrewballantyne Sep 29, 2023
10 changes: 4 additions & 6 deletions backend/src/plugins/kube.ts
@@ -4,7 +4,7 @@ import { FastifyInstance } from 'fastify';
import * as jsYaml from 'js-yaml';
import * as k8s from '@kubernetes/client-node';
import { DEV_MODE } from '../utils/constants';
import { cleanupDSPSuffix, initializeWatchedResources } from '../utils/resourceUtils';
import { cleanupGPU, initializeWatchedResources } from '../utils/resourceUtils';
import { User } from '@kubernetes/client-node/dist/config_types';

const CONSOLE_CONFIG_YAML_FIELD = 'console-config.yaml';
@@ -85,12 +85,10 @@ export default fp(async (fastify: FastifyInstance) => {
// Initialize the watching of resources
initializeWatchedResources(fastify);

// TODO: Delete this code in the future once we have no customers using RHODS 1.19 / ODH 2.4.0
// Cleanup for display name suffix of [DSP]
cleanupDSPSuffix(fastify).catch((e) =>
cleanupGPU(fastify).catch((e) =>
fastify.log.error(
`Unable to fully cleanup project display name suffixes - Some projects may not appear in the dashboard UI. ${
e.response?.body?.message || e.message
`Unable to fully convert GPU to use accelerator profiles. ${
e.response?.body?.message || e.message || e
}`,
),
);
71 changes: 71 additions & 0 deletions backend/src/routes/api/accelerators/acceleratorUtils.ts
@@ -0,0 +1,71 @@
import { AcceleratorInfo, KubeFastifyInstance } from '../../../types';

const RESOURCE_TYPES = [
  'cpu',
  'memory',
  'pods',
  'ephemeral-storage',
  'hugepages-1Gi',
  'hugepages-2Mi',
  'attachable-volumes-aws-ebs',
];

const getIdentifiersFromResources = (resources: { [key: string]: string } = {}) => {
  return Object.entries(resources)
    .filter(([key]) => !RESOURCE_TYPES.includes(key))
    .reduce<{ [key: string]: number }>((identifiers, [key, value]) => {
      identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value);
      return identifiers;
    }, {});
};

export const getAcceleratorNumbers = async (
  fastify: KubeFastifyInstance,
): Promise<AcceleratorInfo> =>
  fastify.kube.coreV1Api
    .listNode()
    .then((res) =>
      res.body.items.reduce<AcceleratorInfo>(
        (info, node) => {
          // reduce resources down to just the accelerators and their counts
          const allocatable = getIdentifiersFromResources(node.status.allocatable);
          const capacity = getIdentifiersFromResources(node.status.capacity);

          // update the max count for each accelerator
          Object.entries(allocatable).forEach(
            ([key, value]) => (info.available[key] = Math.max(info.available[key] ?? 0, value)),
          );

          // update the total count for each accelerator
          Object.entries(capacity).forEach(
            ([key, value]) => (info.total[key] = (info.total[key] ?? 0) + value),
          );

          // update the allocated count for each accelerator
          Object.entries(capacity).forEach(
            ([key, value]) =>
              (info.allocated[key] = (info.allocated[key] ?? 0) + value - (allocatable[key] ?? 0)),
          );

          // if any accelerators are available, the cluster is configured
          const configured =
            info.configured || Object.values(info.available).some((value) => value > 0);

          return {
            total: info.total,
            available: info.available,
            allocated: info.allocated,
            configured,
          };
        },
        { configured: false, available: {}, total: {}, allocated: {} },
      ),
    )
    .catch((e) => {
      fastify.log.error(
        `A ${e.statusCode} error occurred when listing cluster nodes: ${
          e.response?.body?.message || e.statusMessage
        }`,
      );
      return { configured: false, available: {}, total: {}, allocated: {} };
    });
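For reference, a small standalone sketch of what `getIdentifiersFromResources` yields for a hypothetical node: well-known resource names are filtered out, and whatever remains is treated as an accelerator identifier with a parsed count. The node values below are invented for illustration.

```ts
// Invented node.status.allocatable values for illustration only.
const allocatable: { [key: string]: string } = {
  cpu: '16',
  memory: '64Gi',
  pods: '250',
  'nvidia.com/gpu': '2',
  'habana.ai/gaudi': '8',
};

// Same filter/reduce shape as getIdentifiersFromResources above (list shortened).
const RESOURCE_TYPES = ['cpu', 'memory', 'pods', 'ephemeral-storage'];

const identifiers = Object.entries(allocatable)
  .filter(([key]) => !RESOURCE_TYPES.includes(key))
  .reduce<{ [key: string]: number }>((acc, [key, value]) => {
    acc[key] = isNaN(parseInt(value)) ? 0 : parseInt(value);
    return acc;
  }, {});

// identifiers -> { 'nvidia.com/gpu': 2, 'habana.ai/gaudi': 8 }
console.log(identifiers);
```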
11 changes: 11 additions & 0 deletions backend/src/routes/api/accelerators/index.ts
@@ -0,0 +1,11 @@
import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types';
import { getAcceleratorNumbers } from './acceleratorUtils';
import { logRequestDetails } from '../../../utils/fileUtils';

export default async (fastify: KubeFastifyInstance): Promise<void> => {
  fastify.get('/', async (request: OauthFastifyRequest) => {
    logRequestDetails(fastify, request);

    return getAcceleratorNumbers(fastify);
  });
};
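A hedged example of consuming this route from client code, assuming it is mounted at `/api/accelerators` (the route registration path is not shown in this diff); the response shape follows the `AcceleratorInfo` type added to backend/src/types.ts later in this PR.

```ts
type AcceleratorInfo = {
  configured: boolean;
  available: { [key: string]: number };
  total: { [key: string]: number };
  allocated: { [key: string]: number };
};

// Hypothetical mount point; the actual registration is not part of this diff.
const fetchAcceleratorCounts = async (): Promise<AcceleratorInfo> => {
  const response = await fetch('/api/accelerators');
  return response.json();
};

fetchAcceleratorCounts().then((info) => {
  if (info.configured) {
    // e.g. { 'habana.ai/gaudi': 8 } -> at least one node exposes this accelerator
    console.log('accelerators detected:', Object.keys(info.available));
  }
});
```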
7 changes: 7 additions & 0 deletions backend/src/routes/api/gpu/gpuUtils.ts
@@ -16,6 +16,9 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = {
lastFetch: 0,
};

/**
* @deprecated - use getAcceleratorNumbers instead
*/
export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
if (storage.lastFetch >= Date.now() - 30_000) {
fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`);
@@ -67,11 +70,15 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
available: maxGpuNumber,
autoscalers: scalingLimit,
};

storage.lastFetch = Date.now();
storage.lastValue = data;
return data;
};

/**
* @deprecated
*/
export const getGPUData = async (
fastify: KubeFastifyInstance,
podIP: string,
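The deprecated GPU path above reuses its last result for 30 seconds (`storage.lastFetch` / `storage.lastValue`) before querying the cluster again. A minimal generic sketch of that caching pattern, not the actual implementation:

```ts
type Cached<T> = { lastFetch: number; lastValue?: T };

// Wrap an async fetcher so repeat calls within `ttlMs` reuse the previous result.
const withTtlCache = <T>(ttlMs: number, fetcher: () => Promise<T>) => {
  const store: Cached<T> = { lastFetch: 0 };
  return async (): Promise<T> => {
    if (store.lastValue !== undefined && store.lastFetch >= Date.now() - ttlMs) {
      return store.lastValue;
    }
    const value = await fetcher();
    store.lastFetch = Date.now();
    store.lastValue = value;
    return value;
  };
};

// Usage with an invented fetcher name:
// const getCounts = withTtlCache(30_000, () => countGpusAcrossNodes());
```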
3 changes: 3 additions & 0 deletions backend/src/routes/api/gpu/index.ts
@@ -2,6 +2,9 @@ import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types';
import { getGPUNumber } from './gpuUtils';
import { logRequestDetails } from '../../../utils/fileUtils';

/**
* @deprecated - use accelerators instead
*/
export default async (fastify: KubeFastifyInstance): Promise<void> => {
fastify.get('/', async (request: OauthFastifyRequest) => {
logRequestDetails(fastify, request);
51 changes: 45 additions & 6 deletions backend/src/types.ts
@@ -254,6 +254,7 @@ export type KubeDecorator = KubeStatus & {
customObjectsApi: k8s.CustomObjectsApi;
rbac: k8s.RbacAuthorizationV1Api;
currentToken: string;

};

export type KubeFastifyInstance = FastifyInstance & {
@@ -755,6 +756,14 @@ export type GPUInfo = {
  available: number;
  autoscalers: gpuScale[];
};

export type AcceleratorInfo = {
  configured: boolean;
  available: { [key: string]: number };
  total: { [key: string]: number };
  allocated: { [key: string]: number };
};

export type EnvironmentVariable = EitherNotBoth<
{ value: string | number },
{ valueFrom: Record<string, unknown> }
@@ -805,12 +814,17 @@ export type NotebookData = {
notebookSizeName: string;
imageName: string;
imageTagName: string;
gpus: number;
accelerator: AcceleratorState;
envVars: EnvVarReducedTypeKeyValues;
state: NotebookState;
username?: string;
};

export type AcceleratorState = {
  accelerator?: AcceleratorKind;
  count: number;
};

export const LIMIT_NOTEBOOK_IMAGE_GPU = 'nvidia.com/gpu';

type DisplayNameAnnotations = Partial<{
@@ -868,19 +882,21 @@ export type SupportedModelFormats = {
autoSelect?: boolean;
};

export type GPUCount = string | number;

export enum ContainerResourceAttributes {
  CPU = 'cpu',
  MEMORY = 'memory',
}

export type ContainerResources = {
requests?: {
cpu?: string | number;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
} & Record<string, unknown>;
limits?: {
cpu?: string | number;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
} & Record<string, unknown>;
};

export type ServingRuntime = K8sResourceCommon & {
@@ -908,3 +924,26 @@ export type ServingRuntime = K8sResourceCommon & {
volumes?: Volume[];
};
};

export type AcceleratorKind = K8sResourceCommon & {
  metadata: {
    name: string;
    annotations?: Partial<{
      'opendatahub.io/modified-date': string;
    }>;
  };
  spec: {
    displayName: string;
    enabled: boolean;
    identifier: string;
    description?: string;
    tolerations?: NotebookToleration[];
  };
};

export enum KnownLabels {
  DASHBOARD_RESOURCE = 'opendatahub.io/dashboard',
  PROJECT_SHARING = 'opendatahub.io/project-sharing',
  MODEL_SERVING_PROJECT = 'modelmesh-enabled',
  DATA_CONNECTION_AWS = 'opendatahub.io/managed',
}
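To make the new types concrete, here is a hypothetical `AcceleratorKind` resource for a Habana Gaudi device and the `AcceleratorState` a notebook request might carry. The apiVersion/kind values and every field value are invented for illustration; only the shape follows the types above.

```ts
import { AcceleratorKind, AcceleratorState } from './types'; // import path assumed

const habanaProfile: AcceleratorKind = {
  apiVersion: 'dashboard.opendatahub.io/v1', // assumed group/version, not defined in this diff
  kind: 'AcceleratorProfile', // assumed kind name, not defined in this diff
  metadata: {
    name: 'habana-gaudi',
    annotations: {
      'opendatahub.io/modified-date': new Date().toISOString(),
    },
  },
  spec: {
    displayName: 'Habana Gaudi',
    enabled: true,
    identifier: 'habana.ai/gaudi',
    description: 'Habana Gaudi HPU nodes',
    tolerations: [{ key: 'habana.ai/gaudi', operator: 'Exists', effect: 'NoSchedule' }],
  },
};

// What a notebook creation request carries instead of the old `gpus: number` field.
const acceleratorState: AcceleratorState = {
  accelerator: habanaProfile,
  count: 1,
};
```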
4 changes: 3 additions & 1 deletion backend/src/utils/constants.ts
@@ -1,6 +1,6 @@
import * as path from 'path';
import './dotenv';
import { DashboardConfig, NotebookSize } from '../types';
import { DashboardConfig, KnownLabels, NotebookSize } from '../types';

export const PORT = Number(process.env.PORT) || Number(process.env.BACKEND_PORT) || 8080;
export const IP = process.env.IP || '0.0.0.0';
@@ -134,3 +134,5 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [

export const imageUrlRegex =
/^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/;

export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`;
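`LABEL_SELECTOR_DASHBOARD_RESOURCE` expands to `opendatahub.io/dashboard=true`. A sketch of passing such a selector when listing custom resources with @kubernetes/client-node; the group, version, and plural are assumptions, and the positional parameter order follows the 0.x client, so check the client version actually in use.

```ts
import { KubeFastifyInstance } from '../types';
import { LABEL_SELECTOR_DASHBOARD_RESOURCE } from './constants';

// List only resources that carry the dashboard label (group/version/plural are invented here).
const listDashboardAccelerators = (fastify: KubeFastifyInstance, namespace: string) =>
  fastify.kube.customObjectsApi.listNamespacedCustomObject(
    'dashboard.opendatahub.io', // assumed group
    'v1', // assumed version
    namespace,
    'acceleratorprofiles', // assumed plural
    undefined, // pretty
    undefined, // allowWatchBookmarks
    undefined, // continue
    undefined, // fieldSelector
    LABEL_SELECTOR_DASHBOARD_RESOURCE,
  );
```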
54 changes: 25 additions & 29 deletions backend/src/utils/notebookUtils.ts
@@ -1,10 +1,10 @@
import { getDashboardConfig } from './resourceUtils';
import {
ContainerResourceAttributes,
EnvironmentVariable,
ImageInfo,
ImageTag,
KubeFastifyInstance,
LIMIT_NOTEBOOK_IMAGE_GPU,
Notebook,
NotebookAffinity,
NotebookData,
@@ -156,7 +156,7 @@ export const assembleNotebook = async (
envName: string,
tolerationSettings: NotebookTolerationSettings,
): Promise<Notebook> => {
const { notebookSizeName, imageName, imageTagName, gpus, envVars } = data;
const { notebookSizeName, imageName, imageTagName, accelerator, envVars } = data;

const notebookSize = getNotebookSize(notebookSizeName);

@@ -191,40 +191,35 @@ export const assembleNotebook = async (
const resources: NotebookResources = { ...notebookSize.resources };
const tolerations: NotebookToleration[] = [];

let affinity: NotebookAffinity = {};
if (gpus > 0) {
const affinity: NotebookAffinity = {};
if (accelerator.count > 0 && accelerator.accelerator) {
if (!resources.limits) {
resources.limits = {};
}
if (!resources.requests) {
resources.requests = {};
}
resources.limits[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
resources.requests[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
tolerations.push({
effect: 'NoSchedule',
key: LIMIT_NOTEBOOK_IMAGE_GPU,
operator: 'Exists',
});
resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count;
resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count;
} else {
affinity = {
nodeAffinity: {
preferredDuringSchedulingIgnoredDuringExecution: [
{
preference: {
matchExpressions: [
{
key: 'nvidia.com/gpu.present',
operator: 'NotIn',
values: ['true'],
},
],
},
weight: 1,
},
],
},
};
// step type down to string to avoid type errors
const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes);

Object.keys(resources.limits || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.limits?.[key];
}
});

Object.keys(resources.requests || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.requests?.[key];
}
});
}

if (accelerator.accelerator?.spec.tolerations) {
tolerations.push(...accelerator.accelerator.spec.tolerations);
}

if (tolerationSettings?.enabled) {
@@ -272,6 +267,7 @@ export const assembleNotebook = async (
'notebooks.opendatahub.io/last-image-selection': imageSelection,
'opendatahub.io/username': username,
'kubeflow-resource-stopped': null,
'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '',
},
name: name,
namespace: namespace,
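In summary, when an accelerator profile is selected the notebook container now requests the profile's resource identifier (for example `nvidia.com/gpu` or `habana.ai/gaudi`) instead of the hard-coded GPU key, inherits the profile's tolerations, and records the profile name in the `opendatahub.io/accelerator-name` annotation; when no profile is selected, any non-CPU/memory resource keys are stripped. A condensed sketch of that branch with simplified types, not the actual helper:

```ts
type ResourceMap = { [key: string]: string | number };
type Resources = { requests?: ResourceMap; limits?: ResourceMap };
type Toleration = { key: string; operator: string; effect: string };
type Profile = { identifier: string; tolerations?: Toleration[] };

// Simplified mirror of the accelerator branch in assembleNotebook above.
const applyAccelerator = (
  resources: Resources,
  tolerations: Toleration[],
  profile: Profile | undefined,
  count: number,
): void => {
  if (profile && count > 0) {
    resources.limits = { ...resources.limits, [profile.identifier]: count };
    resources.requests = { ...resources.requests, [profile.identifier]: count };
  } else {
    // No profile selected: drop anything that is not cpu/memory so stale
    // accelerator requests are not carried over from a previous notebook spec.
    const keep = ['cpu', 'memory'];
    [resources.limits, resources.requests].forEach((section) => {
      Object.keys(section ?? {}).forEach((key) => {
        if (section && !keep.includes(key)) {
          delete section[key];
        }
      });
    });
  }
  if (profile?.tolerations) {
    tolerations.push(...profile.tolerations);
  }
};
```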