From ff64ff29842278bdb97ed8ab5630ce0630b2ebbf Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Mon, 3 Jul 2023 07:15:51 -0500 Subject: [PATCH 01/22] Added accelerator CRD (#1451) * added cr * added accelerator profile crd * added to kustomize --- ...cceleratorprofiles.opendatahub.io.crd.yaml | 39 +++++++++++++++++++ manifests/crd/kustomization.yaml | 1 + 2 files changed, 40 insertions(+) create mode 100644 manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml diff --git a/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml new file mode 100644 index 0000000000..3d34c5830b --- /dev/null +++ b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml @@ -0,0 +1,39 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: acceleratorprofiles.opendatahub.io +spec: + group: opendatahub.io + scope: Namespaced + names: + plural: acceleratorprofiles + singular: acceleratorprofile + kind: AcceleratorProfile + versions: + - name: v1alpha + served: true + storage: true + schema: + openAPIV3Schema: + type: object + required: + - spec + properties: + spec: + type: object + required: + - display-name + - enabled + - identifier + properties: + display-name: + type: string + enabled: + type: boolean + identifier: + type: string + description: + type: string + modifiedDate: + type: string + \ No newline at end of file diff --git a/manifests/crd/kustomization.yaml b/manifests/crd/kustomization.yaml index 7709378df1..3d8497d804 100644 --- a/manifests/crd/kustomization.yaml +++ b/manifests/crd/kustomization.yaml @@ -8,3 +8,4 @@ resources: - odhquickstarts.console.openshift.io.crd.yaml - odhdocuments.dashboard.opendatahub.io.crd.yaml - odhapplications.dashboard.opendatahub.io.crd.yaml +- acceleratorprofiles.opendatahub.io.crd.yaml \ No newline at end of file From a6c77985b66450b8a187d380616dcc5048f39794 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 20 Jul 2023 14:33:21 -0500 Subject: [PATCH 02/22] add copy to clipboard to k8 name popover --- .../src/components/ResourceNameTooltip.tsx | 19 ++++++++++--------- .../dashboard/DashboardPopupIconButton.tsx | 19 +++++++++++++++++++ .../src/pages/BYONImages/BYONImagesTable.tsx | 8 +++++++- .../projects/screens/spawner/spawnerUtils.ts | 14 +++++++++++++- 4 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 frontend/src/concepts/dashboard/DashboardPopupIconButton.tsx diff --git a/frontend/src/components/ResourceNameTooltip.tsx b/frontend/src/components/ResourceNameTooltip.tsx index b7554f6ec2..a43f541dcb 100644 --- a/frontend/src/components/ResourceNameTooltip.tsx +++ b/frontend/src/components/ResourceNameTooltip.tsx @@ -1,17 +1,18 @@ import * as React from 'react'; import { + ClipboardCopy, DescriptionList, DescriptionListDescription, DescriptionListGroup, DescriptionListTerm, - Icon, + Popover, Stack, StackItem, - Tooltip, } from '@patternfly/react-core'; import { OutlinedQuestionCircleIcon } from '@patternfly/react-icons'; import { K8sResourceCommon } from '@openshift/dynamic-plugin-sdk-utils'; import '~/pages/notebookController/NotebookController.scss'; +import DashboardPopupIconButton from '~/concepts/dashboard/DashboardPopupIconButton'; type ResourceNameTooltipProps = { resource: K8sResourceCommon; @@ -23,10 +24,10 @@ const ResourceNameTooltip: React.FC = ({ children, res {children}{' '} {resource.metadata?.name && (
- Resource names and types are used to find your resources in OpenShift. @@ -36,7 +37,9 @@ const ResourceNameTooltip: React.FC = ({ children, res Resource name - {resource.metadata.name} + + {resource.metadata?.name} + @@ -48,10 +51,8 @@ const ResourceNameTooltip: React.FC = ({ children, res } > - - - - + } aria-label="More info" /> +
)} diff --git a/frontend/src/concepts/dashboard/DashboardPopupIconButton.tsx b/frontend/src/concepts/dashboard/DashboardPopupIconButton.tsx new file mode 100644 index 0000000000..461e2a40cb --- /dev/null +++ b/frontend/src/concepts/dashboard/DashboardPopupIconButton.tsx @@ -0,0 +1,19 @@ +import React from 'react'; +import { Button, ButtonProps, Icon } from '@patternfly/react-core'; + +type DashboardPopupIconButtonProps = Omit & { + icon: React.ReactNode; +}; + +/** + * Overriding PF's button styles to allow for a11y in opening tooltips or popovers on a single item + */ +const DashboardPopupIconButton = ({ icon, ...props }: DashboardPopupIconButtonProps) => ( + +); + +export default DashboardPopupIconButton; diff --git a/frontend/src/pages/BYONImages/BYONImagesTable.tsx b/frontend/src/pages/BYONImages/BYONImagesTable.tsx index 5eaa56bec7..3ddc0ee938 100644 --- a/frontend/src/pages/BYONImages/BYONImagesTable.tsx +++ b/frontend/src/pages/BYONImages/BYONImagesTable.tsx @@ -35,6 +35,8 @@ import { BYONImage } from '~/types'; import { relativeTime } from '~/utilities/time'; import { updateBYONImage } from '~/services/imagesService'; import ImageErrorStatus from '~/pages/BYONImages/ImageErrorStatus'; +import ResourceNameTooltip from '~/components/ResourceNameTooltip'; +import { convertBYONImageToK8sResource } from '~/pages/projects/screens/spawner/spawnerUtils'; import { ImportImageModal } from './ImportImageModal'; import { DeleteImageModal } from './DeleteBYONImageModal'; import { UpdateImageModal } from './UpdateImageModal'; @@ -329,7 +331,11 @@ export const BYONImagesTable: React.FC = ({ images, forceU spaceItems={{ default: 'spaceItemsSm' }} alignItems={{ default: 'alignItemsCenter' }} > - {image.name} + + + {image.name} + + diff --git a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts index 8231af7e7a..dddafdac94 100644 --- a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts +++ b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts @@ -1,6 +1,7 @@ import * as React from 'react'; import compareVersions from 'compare-versions'; -import { NotebookSize, Volume, VolumeMount } from '~/types'; +import { K8sResourceCommon } from '@openshift/dynamic-plugin-sdk-utils'; +import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types'; import { BuildKind, ImageStreamKind, ImageStreamSpecTagType, NotebookKind } from '~/k8sTypes'; import { ConfigMapCategory, @@ -388,3 +389,14 @@ export const isInvalidBYONImageStream = (imageStream: ImageStreamKind) => { (activeTag === undefined || activeTag.items === null) ); }; + +export const convertBYONImageToK8sResource = (image: BYONImage): K8sResourceCommon => ({ + kind: 'ImageStream', + apiVersion: 'image.openshift.io/v1', + metadata: { + name: image.id, + annotations: { + 'openshift.io/display-name': image.name, + }, + }, +}); From 50839ac5ff344f2760655f3604bfb5774ecb017b Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 27 Jul 2023 13:52:24 -0500 Subject: [PATCH 03/22] added gpu migration --- backend/src/plugins/kube.ts | 13 ++-- backend/src/types.ts | 16 +++++ backend/src/utils/resourceUtils.ts | 112 +++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 7 deletions(-) diff --git a/backend/src/plugins/kube.ts b/backend/src/plugins/kube.ts index ee1d78c87f..232651aa5f 100644 --- a/backend/src/plugins/kube.ts +++ b/backend/src/plugins/kube.ts @@ -4,7 +4,7 @@ import { FastifyInstance } from 'fastify'; import * as jsYaml from 'js-yaml'; import * as k8s from '@kubernetes/client-node'; import { DEV_MODE } from '../utils/constants'; -import { cleanupDSPSuffix, initializeWatchedResources } from '../utils/resourceUtils'; +import { cleanupGPU, initializeWatchedResources } from '../utils/resourceUtils'; import { User } from '@kubernetes/client-node/dist/config_types'; const CONSOLE_CONFIG_YAML_FIELD = 'console-config.yaml'; @@ -82,17 +82,16 @@ export default fp(async (fastify: FastifyInstance) => { // Initialize the watching of resources initializeWatchedResources(fastify); - // TODO: Delete this code in the future once we have no customers using RHODS 1.19 / ODH 2.4.0 - // Cleanup for display name suffix of [DSP] - cleanupDSPSuffix(fastify).catch((e) => + cleanupGPU(fastify).catch((e) => fastify.log.error( - `Unable to fully cleanup project display name suffixes - Some projects may not appear in the dashboard UI. ${ + `Unable to fully convert GPU to use accelerator profiles. ${ e.response?.body?.message || e.message }`, - ), - ); + ) + ) }); + const getCurrentNamespace = async () => { return new Promise((resolve, reject) => { if (currentContext === 'inClusterContext') { diff --git a/backend/src/types.ts b/backend/src/types.ts index 3c85ad8afa..6f07e9e098 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -893,4 +893,20 @@ export type ServingRuntime = K8sResourceCommon & { supportedModelFormats: SupportedModelFormats[]; replicas: number; }; +}; + +export type AcceleratorKind = K8sResourceCommon & { + metadata: { + name: string; + annotations?: Partial<{ + 'opendatahub.io/modified-date': string; + }>; + }; + spec: { + displayName: string; + enabled: boolean; + identifier: string; + description?: string; + tolerations?: NotebookToleration[]; + }; }; \ No newline at end of file diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index ee59a38b2e..597d4a6d31 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -2,6 +2,7 @@ import * as _ from 'lodash'; import createError from 'http-errors'; import { PatchUtils, V1ConfigMap, V1Namespace, V1NamespaceList } from '@kubernetes/client-node'; import { + AcceleratorKind, BUILD_PHASE, BuildKind, BuildStatus, @@ -606,6 +607,117 @@ export const getConsoleLinks = (): ConsoleLinkKind[] => { return consoleLinksWatcher.getResources(); }; +/** + * Converts GPU usage to use accelerator by adding an accelerator profile CRD to the cluster if GPU usage is detected + */ +export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => { + // When we startup — in kube.ts we can handle a migration (catch ALL promise errors — exit gracefully and use fastify logging) + // Check for migration-gpu-status configmap in dashboard namespace — if found, exit early + const CONFIG_MAP_NAME = 'migration-gpu-status'; + + const continueProcessing = await fastify.kube.coreV1Api + .readNamespacedConfigMap(CONFIG_MAP_NAME, fastify.kube.namespace) + .then(() => { + // Found configmap, not continuing + return false; + }) + .catch((e) => { + if (e.statusCode === 404) { + // No config saying we have already migrated gpus, continue + return true; + } + throw e; + }); + + if (continueProcessing) { + // Read existing AcceleratorProfiles + const acceleratorProfilesResponse = await fastify.kube.customObjectsApi + .listNamespacedCustomObject( + 'accelerator.openshift.io', + 'v1alpha', + fastify.kube.namespace, + 'acceleratorprofiles' + ) + + // If 404 shows up — CRD may not be installed, exit early + if (acceleratorProfilesResponse.response.statusCode === 404) { + return; + } + + const acceleratorProfiles = ( + acceleratorProfilesResponse?.body as { + items: AcceleratorKind[] + } + )?.items; + + // If not 404 and no profiles detected: + if (acceleratorProfiles && Array.isArray(acceleratorProfiles) && acceleratorProfiles.length === 0) { + // if gpu detected on cluster, create our default migrated-gpu + // TODO GPU detection + const gpu_detected = true; + + if (gpu_detected) { + const payload: AcceleratorKind = { + kind: 'AcceleratorProfile', + apiVersion: 'dashboard.opendatahub.io/v1alpha', + metadata: { + name: 'migrated-gpu', + namespace: fastify.kube.namespace, + }, + spec: { + displayName: 'Nvidia GPU', + identifier: 'nvidia.com/gpu', + enabled: true, + tolerations: [ + { + effect: 'NoSchedule', + key: 'nvidia.com/gpu', + operator: 'Exists', + } + ] + }, + }; + + try { + await await fastify.kube.customObjectsApi.createNamespacedCustomObject( + 'accelerator.openshift.io', + 'v1alpha', + fastify.kube.namespace, + 'acceleratorprofiles', + payload + ) + } catch (e) { + // If bad detection — exit early and dont create config + if (e.response?.statusCode !== 404) { + fastify.log.error('Unable to add migrated-gpu accelerator profile: ' + e.toString()); + return; + } + } + }; + } + + // Create configmap to flag operation as successful + const configMap = { + metadata: { + name: CONFIG_MAP_NAME, + namespace: fastify.kube.namespace, + }, + data: { + migratedCompleted: 'true', + }, + } + + await fastify.kube.coreV1Api + .createNamespacedConfigMap(fastify.kube.namespace, configMap) + .then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles')) + .catch((e) => { + throw createCustomError( + 'Unable to create gpu migration configmap', + e.response?.body?.message || e.message, + ); + }); + } +} /** * @deprecated - Look to remove asap (see comments below) * Converts namespaces that have a display-name annotation suffixed with `[DSP]` over to using a label. From 84c2231ba1a4366ffe5226298da851763511c603 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Fri, 28 Jul 2023 08:27:31 -0500 Subject: [PATCH 04/22] added accelerator detection --- .../api/accelerators/acceleratorUtils.ts | 46 +++++++++++++++++++ backend/src/routes/api/accelerators/index.ts | 11 +++++ backend/src/routes/api/gpu/gpuUtils.ts | 3 ++ backend/src/routes/api/gpu/index.ts | 3 ++ backend/src/types.ts | 8 ++++ 5 files changed, 71 insertions(+) create mode 100644 backend/src/routes/api/accelerators/acceleratorUtils.ts create mode 100644 backend/src/routes/api/accelerators/index.ts diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts new file mode 100644 index 0000000000..7ea0068e0e --- /dev/null +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -0,0 +1,46 @@ +import { AcceleratorInfo, KubeFastifyInstance } from "../../../types" + +const RESOURCE_TYPES = ["cpu", "memory", "pods", "ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi"] + +const getIdentifiersFromResources = (resources: {[key: string]: string} = {}) => { + return Object.entries(resources) + .filter(([key,]) => !RESOURCE_TYPES.includes(key)) + .reduce<{[key: string]: number}>((identifiers, [key, value]) => { + identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value) + return identifiers + }, {}) +} + +export const getAcceleratorNumbers = async (fastify: KubeFastifyInstance): Promise => ( + fastify.kube.coreV1Api.listNode() + .then((res) => res.body.items.reduce((info, node) => { + // reduce resources down to just the accelerators and their counts + const allocatable = getIdentifiersFromResources(node.status.allocatable) + const capacity = getIdentifiersFromResources(node.status.capacity) + + // update the max count for each accelerator + Object.entries(allocatable).forEach(([key, value]) => ( + info.available[key] = Math.max((info.available[key] || 0), value) + )) + + // update the total count for each accelerator + Object.entries(capacity).forEach(([key, value]) => ( + info.total[key] = (info.total[key] || 0) + value + )) + + + // update the allocated count for each accelerator + Object.entries(capacity).forEach(([key, value]) => ( + info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0) + )) + + // if any accelerators are available, the cluster is configured + const configured = info.configured || Object.values(info.available).some((value) => value > 0) + + return {total: info.total, available: info.available, allocated: info.allocated, configured} + }, {configured: false, available: {}, total: {}, allocated: {}})) + .catch((e) => { + fastify.log.error(`Exception when listing cluster nodes: ${e}`); + return {configured: false, available: {}, total: {}, allocated: {}} + }) +) diff --git a/backend/src/routes/api/accelerators/index.ts b/backend/src/routes/api/accelerators/index.ts new file mode 100644 index 0000000000..16d651ad6d --- /dev/null +++ b/backend/src/routes/api/accelerators/index.ts @@ -0,0 +1,11 @@ +import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types'; +import { getAcceleratorNumbers } from './acceleratorUtils'; +import { logRequestDetails } from '../../../utils/fileUtils'; + +export default async (fastify: KubeFastifyInstance): Promise => { + fastify.get('/', async (request: OauthFastifyRequest) => { + logRequestDetails(fastify, request); + + return getAcceleratorNumbers(fastify); + }); +}; diff --git a/backend/src/routes/api/gpu/gpuUtils.ts b/backend/src/routes/api/gpu/gpuUtils.ts index e70cad3ece..fc2b74d0dd 100644 --- a/backend/src/routes/api/gpu/gpuUtils.ts +++ b/backend/src/routes/api/gpu/gpuUtils.ts @@ -16,6 +16,9 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = { lastFetch: 0, }; +/** + * @deprecated - use getAcceleratorNumbers instead + */ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => { if (storage.lastFetch >= Date.now() - 30_000) { fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`); diff --git a/backend/src/routes/api/gpu/index.ts b/backend/src/routes/api/gpu/index.ts index 5d91bb04c3..dc7068851c 100644 --- a/backend/src/routes/api/gpu/index.ts +++ b/backend/src/routes/api/gpu/index.ts @@ -2,6 +2,9 @@ import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types'; import { getGPUNumber } from './gpuUtils'; import { logRequestDetails } from '../../../utils/fileUtils'; +/** + * @deprecated - use accelerators instead + */ export default async (fastify: KubeFastifyInstance): Promise => { fastify.get('/', async (request: OauthFastifyRequest) => { logRequestDetails(fastify, request); diff --git a/backend/src/types.ts b/backend/src/types.ts index 3c85ad8afa..57ae0bd50e 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -748,6 +748,14 @@ export type GPUInfo = { available: number; autoscalers: gpuScale[]; }; + +export type AcceleratorInfo = { + configured: boolean; + available: {[key: string]: number}; + total: {[key: string]: number}; + allocated: {[key: string]: number}; +} + export type EnvironmentVariable = EitherNotBoth< { value: string | number }, { valueFrom: Record } From 391cbca5d1a807281cef3bd5d6f5269570cc6011 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 1 Aug 2023 15:09:25 -0500 Subject: [PATCH 05/22] added accelerator detection line --- backend/src/utils/resourceUtils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index 597d4a6d31..40b081ec88 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -654,9 +654,9 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => if (acceleratorProfiles && Array.isArray(acceleratorProfiles) && acceleratorProfiles.length === 0) { // if gpu detected on cluster, create our default migrated-gpu // TODO GPU detection - const gpu_detected = true; + const acceleratorDetected = await getAcceleratorNumbers(fastify) - if (gpu_detected) { + if (acceleratorDetected.configured) { const payload: AcceleratorKind = { kind: 'AcceleratorProfile', apiVersion: 'dashboard.opendatahub.io/v1alpha', From 26da28943967a71c31b53418866eda37041861ae Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 1 Aug 2023 16:40:25 -0500 Subject: [PATCH 06/22] fix error state in migration --- backend/src/utils/resourceUtils.ts | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index 40b081ec88..b5d83ac450 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -32,6 +32,7 @@ import { getRouteForClusterId, } from './componentUtils'; import { createCustomError } from './requestUtils'; +import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils'; const dashboardConfigMapName = 'odh-dashboard-config'; const consoleLinksGroup = 'console.openshift.io'; @@ -626,23 +627,21 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => // No config saying we have already migrated gpus, continue return true; } - throw e; }); + if (continueProcessing) { // Read existing AcceleratorProfiles const acceleratorProfilesResponse = await fastify.kube.customObjectsApi .listNamespacedCustomObject( - 'accelerator.openshift.io', + 'dashboard.opendatahub.io', 'v1alpha', fastify.kube.namespace, 'acceleratorprofiles' - ) - - // If 404 shows up — CRD may not be installed, exit early - if (acceleratorProfilesResponse.response.statusCode === 404) { - return; - } + ).catch((e) => { + // If 404 shows up — CRD may not be installed, exit early + throw 'Unable to fetch accelerator profiles: ' + e.toString() + }); const acceleratorProfiles = ( acceleratorProfilesResponse?.body as { @@ -650,7 +649,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => } )?.items; - // If not 404 and no profiles detected: + // If not error and no profiles detected: if (acceleratorProfiles && Array.isArray(acceleratorProfiles) && acceleratorProfiles.length === 0) { // if gpu detected on cluster, create our default migrated-gpu // TODO GPU detection @@ -680,7 +679,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => try { await await fastify.kube.customObjectsApi.createNamespacedCustomObject( - 'accelerator.openshift.io', + 'dashboard.opendatahub.io', 'v1alpha', fastify.kube.namespace, 'acceleratorprofiles', @@ -688,10 +687,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => ) } catch (e) { // If bad detection — exit early and dont create config - if (e.response?.statusCode !== 404) { - fastify.log.error('Unable to add migrated-gpu accelerator profile: ' + e.toString()); - return; - } + throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString() } }; } From ab07f22480a547c0708ddbc358670584557fde67 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 1 Aug 2023 16:48:20 -0500 Subject: [PATCH 07/22] added more resource types --- backend/src/routes/api/accelerators/acceleratorUtils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts index 7ea0068e0e..09ae2ddba6 100644 --- a/backend/src/routes/api/accelerators/acceleratorUtils.ts +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -1,6 +1,6 @@ import { AcceleratorInfo, KubeFastifyInstance } from "../../../types" -const RESOURCE_TYPES = ["cpu", "memory", "pods", "ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi"] +const RESOURCE_TYPES = ["cpu", "memory", "pods", "ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "attachable-volumes-aws-ebs"] const getIdentifiersFromResources = (resources: {[key: string]: string} = {}) => { return Object.entries(resources) From 9387956a16ec1b5fef61a759fd482d591a5a6c1d Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Fri, 30 Jun 2023 14:56:37 -0500 Subject: [PATCH 08/22] added accelerator UI user flow fixed detected accelerator count connected accelerator detection added accelerator UI user flow hide accelerator dropdown when empty switched the format of the notebook identifier added accelerator name to serving runtime resource added serving runtimes accelerators --- backend/src/routes/api/gpu/gpuUtils.ts | 7 + docs/dashboard_config.md | 1 - frontend/src/api/index.ts | 1 + frontend/src/api/k8s/accelerators.ts | 8 + frontend/src/api/k8s/notebooks.ts | 7 +- frontend/src/api/k8s/servingRuntimes.ts | 9 +- frontend/src/api/k8s/utils.ts | 47 ++-- frontend/src/api/models/openShift.ts | 7 + .../src/components/SimpleDropdownSelect.scss | 3 + .../src/components/SimpleDropdownSelect.tsx | 52 +++-- frontend/src/k8sTypes.ts | 24 ++- .../projects/ServingRuntimeDetails.tsx | 11 +- .../ManageServingRuntimeModal.tsx | 4 +- .../ServingRuntimeSizeSection.tsx | 44 ++-- .../ServingRuntimeTemplateSection.tsx | 58 ++--- .../projects/useServingRuntimeAccelerator.ts | 42 ++++ .../modelServing/screens/projects/utils.ts | 19 +- .../src/pages/modelServing/screens/types.ts | 3 +- .../screens/server/AcceleratorSelectField.tsx | 200 ++++++++++++++++++ .../screens/server/NotebookServerDetails.tsx | 14 +- .../screens/server/SpawnerPage.tsx | 17 +- .../screens/server/useAcceleratorCounts.ts | 13 ++ .../screens/server/useAccelerators.ts | 8 + .../notebook/NotebookStatusToggle.tsx | 11 +- .../notebooks/useNotebookAccelerator.ts | 49 +++++ .../detail/notebooks/useNotebookGPUNumber.ts | 14 -- .../screens/spawner/SpawnerFooter.tsx | 5 +- .../projects/screens/spawner/SpawnerPage.tsx | 40 +++- .../imageSelector/ImageSelectorField.tsx | 3 + .../imageSelector/ImageStreamSelector.tsx | 74 +++---- .../projects/screens/spawner/spawnerUtils.ts | 54 +++-- .../pages/projects/screens/spawner/types.ts | 7 +- frontend/src/pages/projects/types.ts | 3 +- frontend/src/services/acceleratorService.ts | 12 ++ frontend/src/types.ts | 26 ++- frontend/src/utilities/imageUtils.ts | 5 - frontend/src/utilities/tolerations.ts | 18 +- .../src/utilities/useGenericObjectState.ts | 2 +- ...acceleratorprofile.opendatahub.io.crd.yaml | 39 ---- ...cceleratorprofiles.opendatahub.io.crd.yaml | 27 ++- 40 files changed, 708 insertions(+), 280 deletions(-) create mode 100644 frontend/src/api/k8s/accelerators.ts create mode 100644 frontend/src/components/SimpleDropdownSelect.scss create mode 100644 frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts create mode 100644 frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx create mode 100644 frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts create mode 100644 frontend/src/pages/notebookController/screens/server/useAccelerators.ts create mode 100644 frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts delete mode 100644 frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts create mode 100644 frontend/src/services/acceleratorService.ts delete mode 100644 manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml diff --git a/backend/src/routes/api/gpu/gpuUtils.ts b/backend/src/routes/api/gpu/gpuUtils.ts index e70cad3ece..9755a21d54 100644 --- a/backend/src/routes/api/gpu/gpuUtils.ts +++ b/backend/src/routes/api/gpu/gpuUtils.ts @@ -16,6 +16,9 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = { lastFetch: 0, }; +/** + * @deprecated + */ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => { if (storage.lastFetch >= Date.now() - 30_000) { fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`); @@ -67,11 +70,15 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => + k8sListResource({ + model: AcceleratorModel, + }).then((listResource) => listResource.items); diff --git a/frontend/src/api/k8s/notebooks.ts b/frontend/src/api/k8s/notebooks.ts index 80a87d17fb..15f825a846 100644 --- a/frontend/src/api/k8s/notebooks.ts +++ b/frontend/src/api/k8s/notebooks.ts @@ -39,7 +39,7 @@ const assembleNotebook = ( description, notebookSize, envFrom, - gpus, + accelerator, image, volumes: formVolumes, volumeMounts: formVolumeMounts, @@ -51,7 +51,7 @@ const assembleNotebook = ( const { affinity, tolerations, resources } = assemblePodSpecOptions( notebookSize.resources, - gpus, + accelerator, tolerationSettings, ); @@ -88,6 +88,7 @@ const assembleNotebook = ( 'notebooks.opendatahub.io/last-image-selection': imageSelection, 'notebooks.opendatahub.io/inject-oauth': 'true', 'opendatahub.io/username': username, + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }, name: notebookId, namespace: projectName, @@ -260,7 +261,7 @@ export const updateNotebook = ( // clean the envFrom array in case of merging the old value again container.envFrom = []; - // clean the resources, affinity and tolerations for GPU + // clean the resources, affinity and tolerations for accelerator oldNotebook.spec.template.spec.tolerations = []; oldNotebook.spec.template.spec.affinity = {}; container.resources = {}; diff --git a/frontend/src/api/k8s/servingRuntimes.ts b/frontend/src/api/k8s/servingRuntimes.ts index 07a0b2cc15..9b3998f788 100644 --- a/frontend/src/api/k8s/servingRuntimes.ts +++ b/frontend/src/api/k8s/servingRuntimes.ts @@ -23,7 +23,7 @@ const assembleServingRuntime = ( isCustomServingRuntimesEnabled: boolean, isEditing?: boolean, ): ServingRuntimeKind => { - const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth, gpus } = data; + const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth, accelerator } = data; const createName = isCustomServingRuntimesEnabled ? translateDisplayNameForK8s(displayName) : getModelServingRuntimeName(namespace); @@ -50,6 +50,7 @@ const assembleServingRuntime = ( }), ...(isCustomServingRuntimesEnabled && { 'opendatahub.io/template-display-name': getDisplayNameFromK8sResource(servingRuntime), + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }), }, }; @@ -60,6 +61,7 @@ const assembleServingRuntime = ( ...updatedServingRuntime.metadata.annotations, 'enable-route': externalRoute ? 'true' : 'false', 'enable-auth': tokenAuth ? 'true' : 'false', + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', ...(isCustomServingRuntimesEnabled && { 'openshift.io/display-name': displayName }), }, }; @@ -77,7 +79,10 @@ const assembleServingRuntime = ( }, }; - const { affinity, tolerations, resources } = assemblePodSpecOptions(resourceSettings, gpus); + const { affinity, tolerations, resources } = assemblePodSpecOptions( + resourceSettings, + accelerator, + ); updatedServingRuntime.spec.containers = servingRuntime.spec.containers.map((container) => ({ ...container, diff --git a/frontend/src/api/k8s/utils.ts b/frontend/src/api/k8s/utils.ts index 920415d757..883df66e5c 100644 --- a/frontend/src/api/k8s/utils.ts +++ b/frontend/src/api/k8s/utils.ts @@ -1,3 +1,4 @@ +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { PodAffinity, ContainerResources, @@ -9,7 +10,7 @@ import { determineTolerations } from '~/utilities/tolerations'; export const assemblePodSpecOptions = ( resourceSettings: ContainerResources, - gpus: number, + accelerator: AcceleratorState, tolerationSettings?: TolerationSettings, affinitySettings?: PodAffinity, ): { @@ -17,40 +18,34 @@ export const assemblePodSpecOptions = ( tolerations: PodToleration[]; resources: ContainerResources; } => { - let affinity: PodAffinity = structuredClone(affinitySettings || {}); + const affinity: PodAffinity = structuredClone(affinitySettings || {}); const resources = structuredClone(resourceSettings); - if (gpus > 0) { + if (accelerator.count > 0 && accelerator.accelerator) { if (!resources.limits) { resources.limits = {}; } if (!resources.requests) { resources.requests = {}; } - resources.limits[ContainerResourceAttributes.NVIDIA_GPU] = gpus; - resources.requests[ContainerResourceAttributes.NVIDIA_GPU] = gpus; + resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; + resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { - delete resources.limits?.[ContainerResourceAttributes.NVIDIA_GPU]; - delete resources.requests?.[ContainerResourceAttributes.NVIDIA_GPU]; - affinity = { - nodeAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - { - preference: { - matchExpressions: [ - { - key: 'nvidia.com/gpu.present', - operator: 'NotIn', - values: ['true'], - }, - ], - }, - weight: 1, - }, - ], - }, - }; + // step type down to string to avoid type errors + const containerResourceKeys: string[] = Object.keys(ContainerResourceAttributes); + + Object.keys(resources.limits || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.limits?.[key]; + } + }); + + Object.keys(resources.requests || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.requests?.[key]; + } + }); } - const tolerations = determineTolerations(gpus > 0, tolerationSettings); + const tolerations = determineTolerations(tolerationSettings, accelerator.accelerator); return { affinity, tolerations, resources }; }; diff --git a/frontend/src/api/models/openShift.ts b/frontend/src/api/models/openShift.ts index 543ff77a38..3d4ecf4735 100644 --- a/frontend/src/api/models/openShift.ts +++ b/frontend/src/api/models/openShift.ts @@ -55,3 +55,10 @@ export const TemplateModel: K8sModelCommon = { kind: 'Template', plural: 'templates', }; + +export const AcceleratorModel: K8sModelCommon = { + apiVersion: 'v1alpha', + apiGroup: 'dashboard.opendatahub.io', + kind: 'AcceleratorProfile', + plural: 'acceleratorprofiles', +}; diff --git a/frontend/src/components/SimpleDropdownSelect.scss b/frontend/src/components/SimpleDropdownSelect.scss new file mode 100644 index 0000000000..bcb8baf49f --- /dev/null +++ b/frontend/src/components/SimpleDropdownSelect.scss @@ -0,0 +1,3 @@ +.full-width { + width: 100%; +} \ No newline at end of file diff --git a/frontend/src/components/SimpleDropdownSelect.tsx b/frontend/src/components/SimpleDropdownSelect.tsx index c1d0775549..a987e0d270 100644 --- a/frontend/src/components/SimpleDropdownSelect.tsx +++ b/frontend/src/components/SimpleDropdownSelect.tsx @@ -1,11 +1,20 @@ import * as React from 'react'; import { Dropdown, DropdownItem, DropdownToggle } from '@patternfly/react-core'; +import './SimpleDropdownSelect.scss'; type SimpleDropdownProps = { - options: { key: string; label: React.ReactNode }[]; + options: { + key: string; + label: React.ReactNode; + description?: React.ReactNode; + selectedLabel?: React.ReactNode; + isPlaceholder?: boolean; + }[]; value: string; placeholder?: string; - onChange: (key: string) => void; + onChange: (key: string, isPlaceholder: boolean) => void; + isFullWidth?: boolean; + isDisabled?: boolean; } & Omit, 'isOpen' | 'toggle' | 'dropdownItems' | 'onChange'>; const SimpleDropdownSelect: React.FC = ({ @@ -13,30 +22,43 @@ const SimpleDropdownSelect: React.FC = ({ options, placeholder = 'Select...', value, + isFullWidth, + isDisabled, ...props }) => { const [open, setOpen] = React.useState(false); + const selectedOption = options.find(({ key }) => key === value); + const selectedLabel = selectedOption?.selectedLabel ?? selectedOption?.label ?? placeholder; + return ( setOpen(!open)}> - <>{options.find(({ key }) => key === value)?.label ?? placeholder} + setOpen(!open)} + > + <>{selectedLabel} } - dropdownItems={options.map(({ key, label }) => ( - { - onChange(key); - setOpen(false); - }} - > - {label} - - ))} + dropdownItems={options + .sort((a, b) => (a.isPlaceholder === b.isPlaceholder ? 0 : a.isPlaceholder ? -1 : 1)) + .map(({ key, label, description, isPlaceholder }) => ( + { + onChange(key, !!isPlaceholder); + setOpen(false); + }} + > + {isPlaceholder ? {label} : label} + + ))} /> ); }; diff --git a/frontend/src/k8sTypes.ts b/frontend/src/k8sTypes.ts index dc2b3f209b..00774fbedc 100644 --- a/frontend/src/k8sTypes.ts +++ b/frontend/src/k8sTypes.ts @@ -43,7 +43,10 @@ type DisplayNameAnnotations = Partial<{ export type K8sDSGResource = K8sResourceCommon & { metadata: { - annotations?: DisplayNameAnnotations; + annotations?: DisplayNameAnnotations & + Partial<{ + 'opendatahub.io/recommended-accelerators': string; + }>; name: string; }; }; @@ -69,6 +72,7 @@ export type NotebookAnnotations = Partial<{ 'opendatahub.io/username': string; // the untranslated username behind the notebook 'notebooks.opendatahub.io/last-image-selection': string; // the last image they selected 'notebooks.opendatahub.io/last-size-selection': string; // the last notebook size they selected + 'opendatahub.io/accelerator-name': string; // the accelerator attached to the notebook }>; export type DashboardLabels = { @@ -91,6 +95,8 @@ export type ServingRuntimeAnnotations = Partial<{ 'opendatahub.io/template-name': string; 'opendatahub.io/template-display-name': string; 'opendatahub.io/disable-gpu': string; + 'opendatahub.io/recommended-accelerators': string; + 'opendatahub.io/accelerator-name': string; 'enable-route': string; 'enable-auth': string; }>; @@ -715,3 +721,19 @@ export type DashboardConfigKind = K8sResourceCommon & { templateOrder?: string[]; }; }; + +export type AcceleratorKind = K8sResourceCommon & { + metadata: { + name: string; + annotations?: Partial<{ + 'opendatahub.io/modified-date': string; + }>; + }; + spec: { + displayName: string; + enabled: boolean; + identifier: string; + description?: string; + tolerations?: PodToleration[]; + }; +}; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx index f50ab5d8ce..d80cfd7ad7 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx @@ -10,8 +10,8 @@ import { } from '@patternfly/react-core'; import { ServingRuntimeKind } from '~/k8sTypes'; import { AppContext } from '~/app/AppContext'; -import { ContainerResourceAttributes } from '~/types'; import { getServingRuntimeSizes } from './utils'; +import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; type ServingRuntimeDetailsProps = { obj: ServingRuntimeKind; @@ -22,6 +22,7 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => const container = obj.spec.containers[0]; // can we assume the first container? const sizes = getServingRuntimeSizes(dashboardConfig); const size = sizes.find((size) => _.isEqual(size.resources, container.resources)); + const [accelerator] = useServingRuntimeAccelerator(obj); return ( @@ -44,11 +45,15 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => - Number of GPUs + Accelerator - {container.resources.limits?.[ContainerResourceAttributes.NVIDIA_GPU] || 0} + {accelerator.accelerator?.spec.displayName || 'unknown'} + + Number of accelerators + {accelerator.count} + ); }; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx index a9ae72648b..393bae629d 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx @@ -127,7 +127,9 @@ const ManageServingRuntimeModal: React.FC = ({ } const servingRuntimeData = { ...createData, - gpus: isGpuDisabled(servingRuntimeSelected) ? 0 : createData.gpus, + accelerator: isGpuDisabled(servingRuntimeSelected) + ? { accelerator: undefined, count: 0 } + : createData.accelerator, }; const servingRuntimeName = translateDisplayNameForK8s(servingRuntimeData.name); const createRolebinding = servingRuntimeData.tokenAuth && allowCreate; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx index bd04dad8d4..26432ffca8 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx @@ -2,7 +2,6 @@ import * as React from 'react'; import { FormGroup, FormSection, - NumberInput, Select, SelectOption, Stack, @@ -13,9 +12,10 @@ import { CreatingServingRuntimeObject, ServingRuntimeSize, } from '~/pages/modelServing/screens/types'; -import useGPUSetting from '~/pages/notebookController/screens/server/useGPUSetting'; import { ServingRuntimeKind } from '~/k8sTypes'; import { isGpuDisabled } from '~/pages/modelServing/screens/projects/utils'; +import AcceleratorSelectField from '~/pages/notebookController/screens/server/AcceleratorSelectField'; +import { getCompatibleAcceleratorIdentifiers } from '~/pages/projects/screens/spawner/spawnerUtils'; import ServingRuntimeSizeExpandedField from './ServingRuntimeSizeExpandedField'; type ServingRuntimeSizeSectionProps = { @@ -32,7 +32,15 @@ const ServingRuntimeSizeSection: React.FC = ({ servingRuntimeSelected, }) => { const [sizeDropdownOpen, setSizeDropdownOpen] = React.useState(false); - const { available: gpuAvailable, count: gpuCount } = useGPUSetting('autodetect'); + const [supportedAccelerators, setSupportedAccelerators] = React.useState(); + + React.useEffect(() => { + if (servingRuntimeSelected) { + setSupportedAccelerators(getCompatibleAcceleratorIdentifiers(servingRuntimeSelected)); + } else { + setSupportedAccelerators(undefined); + } + }, [servingRuntimeSelected]); const gpuDisabled = servingRuntimeSelected ? isGpuDisabled(servingRuntimeSelected) : false; @@ -88,25 +96,17 @@ const ServingRuntimeSizeSection: React.FC = ({ )} - {gpuAvailable && !gpuDisabled && ( - - ) => { - const target = event.currentTarget; - setData('gpus', parseInt(target.value) || 0); - }} - onBlur={(event: React.FormEvent) => { - const target = event.currentTarget; - const gpuInput = parseInt(target.value) || 0; - setData('gpus', Math.max(0, Math.min(gpuCount, gpuInput))); - }} - onMinus={() => setData('gpus', data.gpus - 1)} - onPlus={() => setData('gpus', data.gpus + 1)} + {!gpuDisabled && ( + + + setData('accelerator', { ...data.accelerator, accelerator }) + } + setAcceleratorCount={(count) => setData('accelerator', { ...data.accelerator, count })} + supportedAccelerators={supportedAccelerators} + supportedText="Compatible with serving runtime" /> )} diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx index 0b8f9ddd46..fddc781f66 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx @@ -1,5 +1,5 @@ import * as React from 'react'; -import { FormGroup, Select, SelectOption, StackItem, TextInput } from '@patternfly/react-core'; +import { FormGroup, Label, Split, SplitItem, StackItem, TextInput } from '@patternfly/react-core'; import { UpdateObjectAtPropAndValue } from '~/pages/projects/types'; import { CreatingServingRuntimeObject } from '~/pages/modelServing/screens/types'; import { TemplateKind } from '~/k8sTypes'; @@ -7,6 +7,8 @@ import { getServingRuntimeDisplayNameFromTemplate, getServingRuntimeNameFromTemplate, } from '~/pages/modelServing/customServingRuntimes/utils'; +import { isCompatibleWithAccelerator } from '~/pages/projects/screens/spawner/spawnerUtils'; +import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; type ServingRuntimeTemplateSectionProps = { data: CreatingServingRuntimeObject; @@ -21,16 +23,22 @@ const ServingRuntimeTemplateSection: React.FC { - const [isOpen, setOpen] = React.useState(false); - - const options = templates.map((template) => ( - - {getServingRuntimeDisplayNameFromTemplate(template)} - - )); + const options = templates.map((template) => ({ + key: getServingRuntimeNameFromTemplate(template), + selectedLabel: getServingRuntimeDisplayNameFromTemplate(template), + label: ( + + {getServingRuntimeDisplayNameFromTemplate(template)} + + + {isCompatibleWithAccelerator( + data.accelerator.accelerator?.spec.identifier, + template.objects[0], + ) && } + + + ), + })); return ( <> @@ -46,22 +54,20 @@ const ServingRuntimeTemplateSection: React.FC - + id="serving-runtime-template-selection" + aria-label="Select a template" + options={options} + placeholder={ + isEditing || templates.length === 0 ? data.servingRuntimeTemplateName : 'Select one' + } + value={data.servingRuntimeTemplateName ?? ''} + onChange={(name) => { + setData('servingRuntimeTemplateName', name); + }} + /> diff --git a/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts b/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts new file mode 100644 index 0000000000..fae8b158b9 --- /dev/null +++ b/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts @@ -0,0 +1,42 @@ +import React, { useRef } from 'react'; +import { ServingRuntimeKind } from '~/k8sTypes'; +import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; +import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; + +const useServingRuntimeAccelerator = ( + servingRuntime?: ServingRuntimeKind, +): GenericObjectState => { + const [acceleratorState, setData, resetData] = useGenericObjectState({ + accelerator: undefined, + count: 0, + }); + + const hasSet = useRef(false); + + const [accelerators, loaded, loadError] = useAccelerators(); + + React.useEffect(() => { + if ( + loaded && + !loadError && + servingRuntime && + servingRuntime?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && + !hasSet.current + ) { + const name = servingRuntime.metadata.annotations['opendatahub.io/accelerator-name']; + const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); + const container = servingRuntime?.spec.containers[0]; + + if (accelerator && container) { + hasSet.current = true; + setData('accelerator', accelerator); + setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier]) ?? 0); + } + } + }, [accelerators, loaded, loadError, servingRuntime, setData]); + + return [acceleratorState, setData, resetData]; +}; + +export default useServingRuntimeAccelerator; diff --git a/frontend/src/pages/modelServing/screens/projects/utils.ts b/frontend/src/pages/modelServing/screens/projects/utils.ts index 616789f8c1..54ef4e6d20 100644 --- a/frontend/src/pages/modelServing/screens/projects/utils.ts +++ b/frontend/src/pages/modelServing/screens/projects/utils.ts @@ -8,7 +8,7 @@ import { InferenceServiceStorageType, ServingRuntimeSize, } from '~/pages/modelServing/screens/types'; -import { ContainerResourceAttributes, DashboardConfig } from '~/types'; +import { DashboardConfig } from '~/types'; import { DEFAULT_MODEL_SERVER_SIZES } from '~/pages/modelServing/screens/const'; import { useAppContext } from '~/app/AppContext'; import { useDeepCompareMemoize } from '~/utilities/useDeepCompareMemoize'; @@ -16,6 +16,7 @@ import { EMPTY_AWS_SECRET_DATA } from '~/pages/projects/dataConnections/const'; import { getDisplayNameFromK8sResource } from '~/pages/projects/utils'; import { getDisplayNameFromServingRuntimeTemplate } from '~/pages/modelServing/customServingRuntimes/utils'; import { isCpuLimitEqual, isMemoryLimitEqual } from '~/utilities/valueUnits'; +import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; export const getServingRuntimeSizes = (config: DashboardConfig): ServingRuntimeSize[] => { let sizes = config.spec.modelServerSizes || []; @@ -54,6 +55,8 @@ export const useCreateServingRuntimeObject = (existingData?: { ] => { const { dashboardConfig } = useAppContext(); + const [existingAccelerator] = useServingRuntimeAccelerator(existingData?.servingRuntime); + const sizes = useDeepCompareMemoize(getServingRuntimeSizes(dashboardConfig)); const createModelState = useGenericObjectState({ @@ -61,7 +64,7 @@ export const useCreateServingRuntimeObject = (existingData?: { servingRuntimeTemplateName: '', numReplicas: 1, modelSize: sizes[0], - gpus: 0, + accelerator: existingAccelerator, externalRoute: false, tokenAuth: false, tokens: [], @@ -82,11 +85,6 @@ export const useCreateServingRuntimeObject = (existingData?: { const existingResources = existingData?.servingRuntime?.spec?.containers[0]?.resources || sizes[0].resources; - const existingGpus = - existingData?.servingRuntime?.spec?.containers[0]?.resources?.requests?.[ - ContainerResourceAttributes.NVIDIA_GPU - ] || 0; - const existingExternalRoute = existingData?.servingRuntime?.metadata.annotations?.['enable-route'] === 'true'; const existingTokenAuth = @@ -118,10 +116,7 @@ export const useCreateServingRuntimeObject = (existingData?: { resources: existingResources, }, ); - setCreateData( - 'gpus', - typeof existingGpus == 'string' ? parseInt(existingGpus) : existingGpus, - ); + setCreateData('accelerator', existingAccelerator); setCreateData('externalRoute', existingExternalRoute); setCreateData('tokenAuth', existingTokenAuth); setCreateData('tokens', existingTokens); @@ -131,7 +126,7 @@ export const useCreateServingRuntimeObject = (existingData?: { existingServingRuntimeTemplateName, existingNumReplicas, existingResources, - existingGpus, + existingAccelerator, existingExternalRoute, existingTokenAuth, existingTokens, diff --git a/frontend/src/pages/modelServing/screens/types.ts b/frontend/src/pages/modelServing/screens/types.ts index 7b66c456c5..557f7bda26 100644 --- a/frontend/src/pages/modelServing/screens/types.ts +++ b/frontend/src/pages/modelServing/screens/types.ts @@ -1,3 +1,4 @@ +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { EnvVariableDataEntry } from '~/pages/projects/types'; import { ContainerResources } from '~/types'; @@ -34,7 +35,7 @@ export type CreatingServingRuntimeObject = { servingRuntimeTemplateName: string; numReplicas: number; modelSize: ServingRuntimeSize; - gpus: number; + accelerator: AcceleratorState; externalRoute: boolean; tokenAuth: boolean; tokens: ServingRuntimeToken[]; diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx new file mode 100644 index 0000000000..dcb0d45f62 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -0,0 +1,200 @@ +import * as React from 'react'; +import { + Alert, + AlertVariant, + FormGroup, + InputGroup, + Label, + NumberInput, + Split, + SplitItem, + Stack, + StackItem, +} from '@patternfly/react-core'; +import { isHTMLInputElement } from '~/utilities/utils'; +import { AcceleratorKind } from '~/k8sTypes'; +import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; +import useAccelerators from './useAccelerators'; +import useAcceleratorCounts from './useAcceleratorCounts'; + +type AcceleratorSelectFieldProps = { + accelerator?: AcceleratorKind; + setAccelerator: (accelerator?: AcceleratorKind) => void; + acceleratorCount?: number; + setAcceleratorCount: (size: number) => void; + supportedAccelerators?: string[]; + supportedText?: string; +}; + +const AcceleratorSelectField: React.FC = ({ + accelerator, + setAccelerator, + acceleratorCount = 0, + setAcceleratorCount, + supportedAccelerators, + supportedText, +}) => { + const [accelerators, loaded, loadError] = useAccelerators(); + const [detectedAcceleratorInfo] = useAcceleratorCounts(); + + const validateAcceleratorCount = React.useCallback( + (newSize: number) => { + if (!accelerator) { + return ''; + } + + const detectedAcceleratorCount = Object.entries(detectedAcceleratorInfo.available).find( + ([identifier]) => accelerator?.spec.identifier === identifier, + )?.[1]; + + if (detectedAcceleratorCount === undefined) { + return `No accelerator detected with the identifier ${accelerator?.spec.identifier} detected.`; + } else if (newSize > detectedAcceleratorCount) { + return `Only ${detectedAcceleratorCount} accelerator${ + detectedAcceleratorCount > 1 ? 's' : '' + } detected.`; + } + + return ''; + }, + [accelerator, detectedAcceleratorInfo.available], + ); + + React.useEffect(() => { + if (acceleratorCount > 0) { + setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); + } + }, [acceleratorCount, validateAcceleratorCount]); + + const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState( + validateAcceleratorCount(acceleratorCount), + ); + + const isAcceleratorSupported = (accelerator: AcceleratorKind) => + supportedAccelerators?.includes(accelerator.spec.identifier); + + const enabledAccelerators = accelerators.filter((ac) => ac.spec.enabled); + + const options = enabledAccelerators + .sort((a, b) => { + const aSupported = isAcceleratorSupported(a); + const bSupported = isAcceleratorSupported(b); + if (aSupported && !bSupported) { + return -1; + } + if (!aSupported && bSupported) { + return 1; + } + return 0; + }) + .map((ac) => ({ + key: ac.metadata.name, + selectedLabel: ac.spec.displayName, + description: ac.spec.description, + label: ( + + {ac.spec.displayName} + + + {isAcceleratorSupported(ac) && ( + + )} + + + ), + })); + + let acceleratorAlertMessage: { title: string; variant: AlertVariant } | null = null; + if (accelerator && supportedAccelerators !== undefined) { + if (supportedAccelerators?.length === 0) { + acceleratorAlertMessage = { + title: + "The image you have selected doesn't support the selected accelerator. It is recommended to use a compatible image for optimal performance.", + variant: AlertVariant.info, + }; + } else if (!isAcceleratorSupported(accelerator)) { + acceleratorAlertMessage = { + title: 'The image you have selected is not compatible with the selected accelerator', + variant: AlertVariant.warning, + }; + } + } + + const onStep = (step: number) => { + setAcceleratorCount(Math.max(acceleratorCount + step, 0)); + }; + + if (!loaded || loadError || enabledAccelerators.length === 0) { + return <>; + } + + return ( + + + + { + if (isPlaceholder) { + setAccelerator(undefined); + setAcceleratorCount(0); + } else { + setAccelerator(accelerators.find((ac) => ac.metadata.name === key)); + } + }} + > + + + {acceleratorAlertMessage && ( + + + + )} + {accelerator && ( + + + + onStep(1)} + onMinus={() => onStep(-1)} + onChange={(event) => { + if (isHTMLInputElement(event.target)) { + const newSize = Number(event.target.value); + setAcceleratorCount(newSize); + } + }} + /> + + + + )} + {acceleratorCountWarning && ( + + + + )} + + ); +}; + +export default AcceleratorSelectField; diff --git a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx index e111770c84..1dfb12a76a 100644 --- a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx +++ b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx @@ -16,11 +16,11 @@ import { getDescriptionForTag, getImageTagByContainer, getNameVersionString, - getNumGpus, } from '~/utilities/imageUtils'; import { useAppContext } from '~/app/AppContext'; import { useWatchImages } from '~/utilities/useWatchImages'; import { NotebookControllerContext } from '~/pages/notebookController/NotebookControllerContext'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { getNotebookSizes } from './usePreferredNotebookSize'; const NotebookServerDetails: React.FC = () => { @@ -28,6 +28,7 @@ const NotebookServerDetails: React.FC = () => { const { images, loaded } = useWatchImages(); const [isExpanded, setExpanded] = React.useState(false); const { dashboardConfig } = useAppContext(); + const [accelerator] = useNotebookAccelerator(notebook); const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( (container) => container.name === notebook.metadata.name, @@ -45,7 +46,6 @@ const NotebookServerDetails: React.FC = () => { const tagSoftware = getDescriptionForTag(tag); const tagDependencies = tag?.content.dependencies ?? []; - const numGpus = getNumGpus(container); const sizes = getNotebookSizes(dashboardConfig); const size = sizes.find((size) => _.isEqual(size.resources.limits, container.resources?.limits)); @@ -106,8 +106,14 @@ const NotebookServerDetails: React.FC = () => { {`${container.resources?.requests?.cpu} CPU, ${container.resources?.requests?.memory} Memory`} - Number of GPUs - {numGpus} + Accelerator + + {accelerator.accelerator?.spec.displayName || 'unknown'} + + + + Number of accelerators + {accelerator.count} diff --git a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx index 71a8e6ade5..985773e1e3 100644 --- a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx +++ b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx @@ -39,7 +39,7 @@ import ImpersonateAlert from '~/pages/notebookController/screens/admin/Impersona import useNamespaces from '~/pages/notebookController/useNamespaces'; import { fireTrackingEvent } from '~/utilities/segmentIOUtils'; import { getEnvConfigMap, getEnvSecret } from '~/services/envService'; -import GPUSelectField from './GPUSelectField'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import SizeSelectField from './SizeSelectField'; import useSpawnerNotebookModalState from './useSpawnerNotebookModalState'; import BrowserTabPreferenceCheckbox from './BrowserTabPreferenceCheckbox'; @@ -49,6 +49,7 @@ import { usePreferredNotebookSize } from './usePreferredNotebookSize'; import StartServerModal from './StartServerModal'; import '~/pages/notebookController/NotebookController.scss'; +import AcceleratorSelectField from './AcceleratorSelectField'; const SpawnerPage: React.FC = () => { const navigate = useNavigate(); @@ -68,7 +69,7 @@ const SpawnerPage: React.FC = () => { tag: undefined, }); const { selectedSize, setSelectedSize, sizes } = usePreferredNotebookSize(); - const [selectedGpu, setSelectedGpu] = React.useState('0'); + const [accelerator, setAccelerator] = useNotebookAccelerator(currentUserNotebook); const [variableRows, setVariableRows] = React.useState([]); const [submitError, setSubmitError] = React.useState(null); @@ -231,7 +232,8 @@ const SpawnerPage: React.FC = () => { const fireStartServerEvent = () => { fireTrackingEvent('Notebook Server Started', { - GPU: parseInt(selectedGpu), + accelerator: accelerator.accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', + acceleratorCount: accelerator.count, lastSelectedSize: selectedSize.name, lastSelectedImage: `${selectedImageTag.image?.name}:${selectedImageTag.tag?.name}`, }); @@ -246,7 +248,7 @@ const SpawnerPage: React.FC = () => { notebookSizeName: selectedSize.name, imageName: selectedImageTag.image?.name || '', imageTagName: selectedImageTag.tag?.name || '', - gpus: parseInt(selectedGpu), + accelerator: accelerator, envVars: envVars, state: NotebookState.Started, username: impersonatedUsername || undefined, @@ -307,7 +309,12 @@ const SpawnerPage: React.FC = () => { setValue={(size) => setSelectedSize(size)} sizes={sizes} /> - setSelectedGpu(size)} /> + setAccelerator('accelerator', accelerator)} + acceleratorCount={accelerator.count} + setAcceleratorCount={(acceleratorCount) => setAccelerator('count', acceleratorCount)} + /> {renderEnvironmentVariableRows()} diff --git a/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts b/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts new file mode 100644 index 0000000000..1b5c879327 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts @@ -0,0 +1,13 @@ +import useFetchState, { FetchState } from '~/utilities/useFetchState'; +import { getAcceleratorCounts } from '~/services/acceleratorService'; +import { AcceleratorInfo } from '~/types'; + +const useAcceleratorCounts = (): FetchState => + useFetchState(getAcceleratorCounts, { + available: {}, + total: {}, + allocated: {}, + configured: false, + }); + +export default useAcceleratorCounts; diff --git a/frontend/src/pages/notebookController/screens/server/useAccelerators.ts b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts new file mode 100644 index 0000000000..059349a650 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts @@ -0,0 +1,8 @@ +import useFetchState, { FetchState } from '~/utilities/useFetchState'; +import { AcceleratorKind } from '~/k8sTypes'; +import { listAccelerators } from '~/api'; + +const useAccelerators = (): FetchState => + useFetchState(listAccelerators, []); + +export default useAccelerators; diff --git a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx index 83b54db070..c950fdbbd9 100644 --- a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx +++ b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx @@ -2,7 +2,7 @@ import * as React from 'react'; import { Flex, FlexItem, Switch } from '@patternfly/react-core'; import { startNotebook, stopNotebook } from '~/api'; import { fireTrackingEvent } from '~/utilities/segmentIOUtils'; -import useNotebookGPUNumber from '~/pages/projects/screens/detail/notebooks/useNotebookGPUNumber'; +import useNotebookAccelerators from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import useNotebookDeploymentSize from '~/pages/projects/screens/detail/notebooks/useNotebookDeploymentSize'; import { computeNotebooksTolerations } from '~/utilities/tolerations'; import { useAppContext } from '~/app/AppContext'; @@ -25,7 +25,7 @@ const NotebookStatusToggle: React.FC = ({ enablePipelines, }) => { const { notebook, isStarting, isRunning, refresh } = notebookState; - const gpuNumber = useNotebookGPUNumber(notebook); + const [acceleratorData] = useNotebookAccelerators(notebook); const { size } = useNotebookDeploymentSize(notebook); const [isOpenConfirm, setOpenConfirm] = React.useState(false); const [inProgress, setInProgress] = React.useState(false); @@ -51,7 +51,10 @@ const NotebookStatusToggle: React.FC = ({ const fireNotebookTrackingEvent = React.useCallback( (action: 'started' | 'stopped') => { fireTrackingEvent(`Workbench ${action}`, { - GPU: gpuNumber, + acceleratorCount: acceleratorData.count, + accelerator: acceleratorData.accelerator + ? JSON.stringify(acceleratorData.accelerator) + : 'unknown', lastSelectedSize: size?.name || notebook.metadata.annotations?.['notebooks.opendatahub.io/last-size-selection'], @@ -64,7 +67,7 @@ const NotebookStatusToggle: React.FC = ({ }), }); }, - [gpuNumber, notebook, size], + [acceleratorData, notebook, size], ); const handleStop = React.useCallback(() => { diff --git a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts new file mode 100644 index 0000000000..14abb51958 --- /dev/null +++ b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts @@ -0,0 +1,49 @@ +import React, { useRef } from 'react'; +import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; +import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; +import { Notebook, NotebookContainer } from '~/types'; +import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; + +export type AcceleratorState = { + accelerator?: AcceleratorKind; + count: number; +}; + +const useNotebookAccelerator = ( + notebook?: NotebookKind | Notebook | null, +): GenericObjectState => { + const [acceleratorState, setData, resetData] = useGenericObjectState({ + accelerator: undefined, + count: 0, + }); + + const hasSet = useRef(false); + + const [accelerators, loaded, loadError] = useAccelerators(); + + React.useEffect(() => { + if ( + loaded && + !loadError && + notebook && + notebook?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && + !hasSet.current + ) { + notebook.spec.template; + const name = notebook.metadata.annotations['opendatahub.io/accelerator-name']; + const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); + const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( + (container) => container.name === notebook.metadata.name, + ); + if (accelerator && container) { + hasSet.current = true; + setData('accelerator', accelerator); + setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier] ?? 0)); + } + } + }, [accelerators, loaded, loadError, notebook, setData]); + + return [acceleratorState, setData, resetData]; +}; + +export default useNotebookAccelerator; diff --git a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts deleted file mode 100644 index d980f91009..0000000000 --- a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts +++ /dev/null @@ -1,14 +0,0 @@ -import { NotebookKind } from '~/k8sTypes'; -import { ContainerResourceAttributes, GPUCount, NotebookContainer } from '~/types'; - -const useNotebookGPUNumber = (notebook?: NotebookKind): GPUCount => { - const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( - (container) => container.name === notebook.metadata.name, - ); - - const gpuNumbers = container?.resources?.limits?.[ContainerResourceAttributes.NVIDIA_GPU]; - - return gpuNumbers || 0; -}; - -export default useNotebookGPUNumber; diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx index df82b5cb05..be3ebfb604 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx @@ -78,9 +78,10 @@ const SpawnerFooter: React.FC = ({ ); const afterStart = (name: string, type: 'created' | 'updated') => { - const { gpus, notebookSize, image } = startNotebookData; + const { accelerator, notebookSize, image } = startNotebookData; fireTrackingEvent(`Workbench ${type}`, { - GPU: gpus, + acceleratorCount: accelerator.count, + accelerator: accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', lastSelectedSize: notebookSize.name, lastSelectedImage: image.imageVersion?.from ? `${image.imageVersion.from.name}` diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx index ee908b2a5a..1d361873c4 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx @@ -21,14 +21,14 @@ import { getNotebookDisplayName, getProjectDisplayName, } from '~/pages/projects/utils'; -import GPUSelectField from '~/pages/notebookController/screens/server/GPUSelectField'; import { NotebookKind } from '~/k8sTypes'; import useNotebookImageData from '~/pages/projects/screens/detail/notebooks/useNotebookImageData'; import useNotebookDeploymentSize from '~/pages/projects/screens/detail/notebooks/useNotebookDeploymentSize'; -import useNotebookGPUNumber from '~/pages/projects/screens/detail/notebooks/useNotebookGPUNumber'; import NotebookRestartAlert from '~/pages/projects/components/NotebookRestartAlert'; import useWillNotebooksRestart from '~/pages/projects/notebook/useWillNotebooksRestart'; import CanEnableElyraPipelinesCheck from '~/concepts/pipelines/elyra/CanEnableElyraPipelinesCheck'; +import AcceleratorSelectField from '~/pages/notebookController/screens/server/AcceleratorSelectField'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { SpawnerPageSectionID } from './types'; import { ScrollableSelectorID, SpawnerPageSectionTitles } from './const'; import SpawnerFooter from './SpawnerFooter'; @@ -38,7 +38,11 @@ import { useNotebookSize } from './useNotebookSize'; import StorageField from './storage/StorageField'; import EnvironmentVariables from './environmentVariables/EnvironmentVariables'; import { useStorageDataObject } from './storage/utils'; -import { getRootVolumeName, useMergeDefaultPVCName } from './spawnerUtils'; +import { + getCompatibleAcceleratorIdentifiers, + getRootVolumeName, + useMergeDefaultPVCName, +} from './spawnerUtils'; import { useNotebookEnvVariables } from './environmentVariables/useNotebookEnvVariables'; import DataConnectionField from './dataConnection/DataConnectionField'; import { useNotebookDataConnection } from './dataConnection/useNotebookDataConnection'; @@ -61,7 +65,7 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { imageVersion: undefined, }); const { selectedSize, setSelectedSize, sizes } = useNotebookSize(); - const [selectedGpu, setSelectedGpu] = React.useState('0'); + const [supportedAccelerators, setSupportedAccelerators] = React.useState(); const [storageDataWithoutDefault, setStorageData] = useStorageDataObject(existingNotebook); const storageData = useMergeDefaultPVCName(storageDataWithoutDefault, nameDesc.name); const [envVariables, setEnvVariables] = useNotebookEnvVariables(existingNotebook); @@ -97,10 +101,16 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { } }, [notebookSize, setSelectedSize]); - const notebookGPU = useNotebookGPUNumber(existingNotebook); + const [notebookAcceleratorState, setNotebookAcceleratorState] = + useNotebookAccelerator(existingNotebook); + React.useEffect(() => { - setSelectedGpu(notebookGPU.toString()); - }, [notebookGPU, setSelectedGpu]); + if (selectedImage.imageStream) { + setSupportedAccelerators(getCompatibleAcceleratorIdentifiers(selectedImage.imageStream)); + } else { + setSupportedAccelerators(undefined); + } + }, [selectedImage.imageStream]); const editNotebookDisplayName = existingNotebook ? getNotebookDisplayName(existingNotebook) : ''; @@ -162,6 +172,7 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { = ({ existingNotebook }) => { setValue={setSelectedSize} value={selectedSize} /> - setSelectedGpu(value)} + + setNotebookAcceleratorState('accelerator', accelerator) + } + acceleratorCount={notebookAcceleratorState.count} + setAcceleratorCount={(acceleratorCount) => + setNotebookAcceleratorState('count', acceleratorCount) + } + supportedAccelerators={supportedAccelerators} /> = ({ existingNotebook }) => { projectName: currentProject.metadata.name, image: selectedImage, notebookSize: selectedSize, - gpus: parseInt(selectedGpu), + accelerator: notebookAcceleratorState, volumes: [], volumeMounts: [], }} diff --git a/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx b/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx index e7f6d6bf35..883908472b 100644 --- a/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx +++ b/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx @@ -17,11 +17,13 @@ import ImageStreamSelector from './ImageStreamSelector'; type ImageSelectorFieldProps = { selectedImage: ImageStreamAndVersion; setSelectedImage: React.Dispatch>; + compatibleAccelerator?: string; }; const ImageSelectorField: React.FC = ({ selectedImage, setSelectedImage, + compatibleAccelerator, }) => { const { dashboardNamespace } = useDashboardNamespace(); const buildStatuses = useBuildStatuses(dashboardNamespace); @@ -69,6 +71,7 @@ const ImageSelectorField: React.FC = ({ buildStatuses={buildStatuses} onImageStreamSelect={onImageStreamSelect} selectedImageStream={selectedImage.imageStream} + compatibleAccelerator={compatibleAccelerator} /> void; + compatibleAccelerator?: string; }; const ImageStreamSelector: React.FC = ({ @@ -22,49 +24,47 @@ const ImageStreamSelector: React.FC = ({ selectedImageStream, onImageStreamSelect, buildStatuses, + compatibleAccelerator, }) => { - const [imageSelectionOpen, setImageSelectionOpen] = React.useState(false); - - const selectOptionObjects = [...imageStreams] - .sort(compareImageStreamOrder) - .map((imageStream) => getImageStreamSelectOptionObject(imageStream)); - - const options = selectOptionObjects.map((optionObject) => { - const imageStream = optionObject.imageStream; + const options = [...imageStreams].sort(compareImageStreamOrder).map((imageStream) => { const description = getRelatedVersionDescription(imageStream); - return ( - - ); + const displayName = getImageStreamDisplayName(imageStream); + + return { + key: imageStream.metadata.name, + selectedLabel: displayName, + description: description, + disabled: !checkImageStreamAvailability(imageStream, buildStatuses), + label: ( + + {displayName} + + + {isCompatibleWithAccelerator(compatibleAccelerator, imageStream) && ( + + )} + + + ), + }; }); return ( - + /> ); }; diff --git a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts index dddafdac94..1fcef275b0 100644 --- a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts +++ b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts @@ -1,8 +1,13 @@ import * as React from 'react'; import compareVersions from 'compare-versions'; -import { K8sResourceCommon } from '@openshift/dynamic-plugin-sdk-utils'; -import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types'; -import { BuildKind, ImageStreamKind, ImageStreamSpecTagType, NotebookKind } from '~/k8sTypes'; +import { BYONImage, K8sResourceCommon, NotebookSize, Volume, VolumeMount } from '~/types'; +import { + BuildKind, + ImageStreamKind, + ImageStreamSpecTagType, + K8sDSGResource, + NotebookKind, +} from '~/k8sTypes'; import { ConfigMapCategory, DataConnectionData, @@ -17,7 +22,6 @@ import { ROOT_MOUNT_PATH } from '~/pages/projects/pvc/const'; import { AWS_FIELDS } from '~/pages/projects/dataConnections/const'; import { BuildStatus, - ImageStreamSelectOptionObjectType, ImageVersionDependencyType, ImageVersionSelectOptionObjectType, } from './types'; @@ -65,12 +69,6 @@ export const getNameVersionString = (software: ImageVersionDependencyType): stri * Create object for PF Select component to use * `toString` decides the text shown for the select option */ -export const getImageStreamSelectOptionObject = ( - imageStream: ImageStreamKind, -): ImageStreamSelectOptionObjectType => ({ - imageStream, - toString: () => getImageStreamDisplayName(imageStream), -}); export const getImageVersionSelectOptionObject = ( imageStream: ImageStreamKind, imageVersion: ImageStreamSpecTagType, @@ -79,15 +77,10 @@ export const getImageVersionSelectOptionObject = ( toString: () => `${imageVersion.name}${checkVersionRecommended(imageVersion) ? ' (Recommended)' : ''}`, }); -export const isImageStreamSelectOptionObject = ( - object: unknown, -): object is ImageStreamSelectOptionObjectType => - (object as ImageStreamSelectOptionObjectType).imageStream !== undefined; export const isImageVersionSelectOptionObject = ( object: unknown, ): object is ImageVersionSelectOptionObjectType => (object as ImageVersionSelectOptionObjectType).imageVersion !== undefined; - /******************* Compare utils for sorting *******************/ const getBuildNumber = (build: BuildKind): number => { const buildNumber = build.metadata.annotations?.['openshift.io/build.number'] || '-1'; @@ -141,6 +134,37 @@ export const getImageStreamDescription = (imageStream: ImageStreamKind): string export const getImageSteamOrder = (imageStream: ImageStreamKind): number => parseInt(imageStream.metadata.annotations?.[IMAGE_ANNOTATIONS.IMAGE_ORDER] || '100'); +export const getCompatibleAcceleratorIdentifiers = ( + object: ImageStreamKind | K8sDSGResource, +): string[] => { + try { + const annotation = object.metadata.annotations?.['opendatahub.io/recommended-accelerators']; + // in the format of ["foo.com/gpu", "bar.com/gpu"] + if (annotation) { + const identifiers = JSON.parse(annotation); + if (Array.isArray(identifiers)) { + return identifiers; + } + } + } catch (error) { + // catch invalid json in metadata + } + return []; +}; + +export const isCompatibleWithAccelerator = ( + acceleratorIdentifier?: string, + obj?: ImageStreamKind | K8sDSGResource, +) => { + if (!obj || !acceleratorIdentifier) { + return false; + } + + return getCompatibleAcceleratorIdentifiers(obj).some( + (accelerator) => accelerator === acceleratorIdentifier, + ); +}; + /** * Parse annotation software field or dependencies field from long string to array */ diff --git a/frontend/src/pages/projects/screens/spawner/types.ts b/frontend/src/pages/projects/screens/spawner/types.ts index 9f4f8c6bc3..3aec64e612 100644 --- a/frontend/src/pages/projects/screens/spawner/types.ts +++ b/frontend/src/pages/projects/screens/spawner/types.ts @@ -1,4 +1,4 @@ -import { BUILD_PHASE, ImageStreamKind, ImageStreamSpecTagType } from '~/k8sTypes'; +import { AcceleratorKind, BUILD_PHASE, ImageStreamKind, ImageStreamSpecTagType } from '~/k8sTypes'; export enum SpawnerPageSectionID { NAME_DESCRIPTION = 'name-and-description', @@ -40,3 +40,8 @@ export type ImageVersionSelectDataType = { imageStream?: ImageStreamKind; imageVersions: ImageStreamSpecTagType[]; }; + +export type AcceleratorSelectOptionObjectType = { + accelerator: AcceleratorKind; + toString: () => string; +}; diff --git a/frontend/src/pages/projects/types.ts b/frontend/src/pages/projects/types.ts index 89de7934e3..abcf01cd16 100644 --- a/frontend/src/pages/projects/types.ts +++ b/frontend/src/pages/projects/types.ts @@ -8,6 +8,7 @@ import { import { ValueOf } from '~/typeHelpers'; import { AWSSecretKind } from '~/k8sTypes'; import { AWS_KEYS } from './dataConnections/const'; +import { AcceleratorState } from './screens/detail/notebooks/useNotebookAccelerator'; export type UpdateObjectAtPropAndValue = (propKey: keyof T, propValue: ValueOf) => void; @@ -60,7 +61,7 @@ export type StartNotebookData = { projectName: string; notebookName: string; notebookSize: NotebookSize; - gpus: number; + accelerator: AcceleratorState; image: ImageStreamAndVersion; volumes?: Volume[]; volumeMounts?: VolumeMount[]; diff --git a/frontend/src/services/acceleratorService.ts b/frontend/src/services/acceleratorService.ts new file mode 100644 index 0000000000..7e7908dbb9 --- /dev/null +++ b/frontend/src/services/acceleratorService.ts @@ -0,0 +1,12 @@ +import axios from 'axios'; +import { AcceleratorInfo } from '~/types'; + +export const getAcceleratorCounts = (): Promise => { + const url = '/api/accelerators'; + return axios + .get(url) + .then((response) => response.data) + .catch((e) => { + throw new Error(e.response.data.message); + }); +}; diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 4cfc66ad41..4f3bddcb5e 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -6,6 +6,7 @@ import { ServingRuntimeSize } from '~/pages/modelServing/screens/types'; import { EnvironmentFromVariable } from '~/pages/projects/types'; import { ImageStreamKind, ImageStreamSpecTagType } from './k8sTypes'; import { EitherNotBoth } from './typeHelpers'; +import { AcceleratorState } from './pages/projects/screens/detail/notebooks/useNotebookAccelerator'; export type PrometheusQueryResponse = { data: { @@ -103,24 +104,22 @@ export type NotebookControllerUserState = { * OdhDashboardConfig contains gpuSetting as a string value override -- proper gpus return as numbers * TODO: Look to make it just number by properly parsing the value */ -export type GPUCount = string | number; export enum ContainerResourceAttributes { CPU = 'cpu', MEMORY = 'memory', - NVIDIA_GPU = 'nvidia.com/gpu', } export type ContainerResources = { requests?: { + [key: string]: number | string | undefined; cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; limits?: { + [key: string]: number | string | undefined; cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; }; @@ -329,7 +328,8 @@ export type TrackingEventProperties = { anonymousID?: string; type?: string; term?: string; - GPU?: GPUCount; + accelerator?: string; + acceleratorCount?: number; lastSelectedSize?: string; lastSelectedImage?: string; projectName?: string; @@ -344,9 +344,11 @@ export type NotebookPort = { }; export type PodToleration = { - effect: string; key: string; - operator: string; + operator?: string; + value?: string; + effect?: string; + tolerationSeconds?: number; }; export type NotebookContainer = { @@ -376,6 +378,7 @@ export type Notebook = K8sResourceCommon & { 'opendatahub.io/username': string; // the untranslated username behind the notebook 'notebooks.opendatahub.io/last-image-selection': string; // the last image they selected 'notebooks.opendatahub.io/last-size-selection': string; // the last notebook size they selected + 'opendatahub.io/accelerator-name': string | undefined; }>; labels: Partial<{ 'opendatahub.io/user': string; // translated username -- see translateUsername @@ -702,7 +705,7 @@ export type NotebookData = { notebookSizeName: string; imageName: string; imageTagName: string; - gpus: number; + accelerator: AcceleratorState; envVars: EnvVarReducedTypeKeyValues; state: NotebookState; // only used for admin calls, regular users cannot use this field @@ -737,3 +740,10 @@ export type ContextResourceData = { export type BreadcrumbItemType = { label: string; } & EitherNotBoth<{ link: string }, { isActive: boolean }>; + +export type AcceleratorInfo = { + configured: boolean; + available: { [key: string]: number }; + total: { [key: string]: number }; + allocated: { [key: string]: number }; +}; diff --git a/frontend/src/utilities/imageUtils.ts b/frontend/src/utilities/imageUtils.ts index 5870c3ac83..1177272d2e 100644 --- a/frontend/src/utilities/imageUtils.ts +++ b/frontend/src/utilities/imageUtils.ts @@ -2,13 +2,11 @@ import compareVersions from 'compare-versions'; import { BuildStatus, BUILD_PHASE, - GPUCount, ImageInfo, ImageSoftwareType, ImageTag, ImageTagInfo, NotebookContainer, - ContainerResourceAttributes, } from '~/types'; const PENDING_PHASES = [ @@ -72,9 +70,6 @@ export const getVersion = (version?: string, prefix?: string): string => { export const getNameVersionString = (software: ImageSoftwareType): string => `${software.name}${getVersion(software.version, ' v')}`; -export const getNumGpus = (container?: NotebookContainer): GPUCount => - container?.resources?.limits?.[ContainerResourceAttributes.NVIDIA_GPU] || 0; - export const getDefaultTag = ( buildStatuses: BuildStatus[], image: ImageInfo, diff --git a/frontend/src/utilities/tolerations.ts b/frontend/src/utilities/tolerations.ts index aa3e17480d..f381e9fd5a 100644 --- a/frontend/src/utilities/tolerations.ts +++ b/frontend/src/utilities/tolerations.ts @@ -1,6 +1,6 @@ import { Patch } from '@openshift/dynamic-plugin-sdk-utils'; import { DashboardConfig, PodToleration, TolerationSettings } from '~/types'; -import { NotebookKind } from '~/k8sTypes'; +import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; export type TolerationChanges = { type: 'add' | 'remove' | 'replace' | 'nothing'; @@ -8,17 +8,13 @@ export type TolerationChanges = { }; export const determineTolerations = ( - hasGpu: boolean, tolerationSettings?: TolerationSettings, + accelerator?: AcceleratorKind, ): PodToleration[] => { const tolerations: PodToleration[] = []; - if (hasGpu) { - tolerations.push({ - effect: 'NoSchedule', - key: 'nvidia.com/gpu', - operator: 'Exists', - }); + if (accelerator?.spec.tolerations) { + tolerations.push(...accelerator.spec.tolerations); } if (tolerationSettings?.enabled) { tolerations.push({ @@ -35,15 +31,9 @@ export const computeNotebooksTolerations = ( dashboardConfig: DashboardConfig, notebook: NotebookKind, ): TolerationChanges => { - const hasGPU = !!notebook.spec.template.spec.containers.find( - (container) => - !!container.resources?.limits?.['nvidia.com/gpu'] || - !!container.resources?.requests?.['nvidia.com/gpu'], - ); const tolerations = notebook.spec.template.spec.tolerations || []; const settings = determineTolerations( - hasGPU, dashboardConfig.spec.notebookController?.notebookTolerationSettings, ); diff --git a/frontend/src/utilities/useGenericObjectState.ts b/frontend/src/utilities/useGenericObjectState.ts index 0f535c5d15..ae2808256d 100644 --- a/frontend/src/utilities/useGenericObjectState.ts +++ b/frontend/src/utilities/useGenericObjectState.ts @@ -1,7 +1,7 @@ import * as React from 'react'; import { UpdateObjectAtPropAndValue } from '~/pages/projects/types'; -type GenericObjectState = [ +export type GenericObjectState = [ data: T, setData: UpdateObjectAtPropAndValue, resetDefault: () => void, diff --git a/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml b/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml deleted file mode 100644 index 3d34c5830b..0000000000 --- a/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: acceleratorprofiles.opendatahub.io -spec: - group: opendatahub.io - scope: Namespaced - names: - plural: acceleratorprofiles - singular: acceleratorprofile - kind: AcceleratorProfile - versions: - - name: v1alpha - served: true - storage: true - schema: - openAPIV3Schema: - type: object - required: - - spec - properties: - spec: - type: object - required: - - display-name - - enabled - - identifier - properties: - display-name: - type: string - enabled: - type: boolean - identifier: - type: string - description: - type: string - modifiedDate: - type: string - \ No newline at end of file diff --git a/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml index 3d34c5830b..0b429de43d 100644 --- a/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml +++ b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml @@ -1,9 +1,9 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: - name: acceleratorprofiles.opendatahub.io + name: acceleratorprofiles.dashboard.opendatahub.io spec: - group: opendatahub.io + group: dashboard.opendatahub.io scope: Namespaced names: plural: acceleratorprofiles @@ -22,11 +22,11 @@ spec: spec: type: object required: - - display-name + - displayName - enabled - identifier properties: - display-name: + displayName: type: string enabled: type: boolean @@ -34,6 +34,21 @@ spec: type: string description: type: string - modifiedDate: - type: string + tolerations: + type: array + items: + type: object + required: + - key + properties: + key: + type: string + operator: + type: string + value: + type: string + effect: + type: string + tolerationSeconds: + type: integer \ No newline at end of file From 871ca4ea464802795551ae8be1ec545674388c50 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Mon, 7 Aug 2023 09:54:20 -0500 Subject: [PATCH 09/22] Squashed commit of the following: commit 9387956a16ec1b5fef61a759fd482d591a5a6c1d Author: Gage Krumbach Date: Fri Jun 30 14:56:37 2023 -0500 added accelerator UI user flow fixed detected accelerator count connected accelerator detection added accelerator UI user flow hide accelerator dropdown when empty switched the format of the notebook identifier added accelerator name to serving runtime resource added serving runtimes accelerators --- backend/src/routes/api/gpu/gpuUtils.ts | 7 + docs/dashboard_config.md | 1 - frontend/src/api/index.ts | 1 + frontend/src/api/k8s/accelerators.ts | 8 + frontend/src/api/k8s/notebooks.ts | 7 +- frontend/src/api/k8s/servingRuntimes.ts | 9 +- frontend/src/api/k8s/utils.ts | 47 ++-- frontend/src/api/models/openShift.ts | 7 + .../src/components/SimpleDropdownSelect.scss | 3 + .../src/components/SimpleDropdownSelect.tsx | 52 +++-- frontend/src/k8sTypes.ts | 24 ++- .../projects/ServingRuntimeDetails.tsx | 11 +- .../ManageServingRuntimeModal.tsx | 4 +- .../ServingRuntimeSizeSection.tsx | 44 ++-- .../ServingRuntimeTemplateSection.tsx | 58 ++--- .../projects/useServingRuntimeAccelerator.ts | 42 ++++ .../modelServing/screens/projects/utils.ts | 19 +- .../src/pages/modelServing/screens/types.ts | 3 +- .../screens/server/AcceleratorSelectField.tsx | 200 ++++++++++++++++++ .../screens/server/NotebookServerDetails.tsx | 14 +- .../screens/server/SpawnerPage.tsx | 17 +- .../screens/server/useAcceleratorCounts.ts | 13 ++ .../screens/server/useAccelerators.ts | 8 + .../notebook/NotebookStatusToggle.tsx | 11 +- .../notebooks/useNotebookAccelerator.ts | 49 +++++ .../detail/notebooks/useNotebookGPUNumber.ts | 14 -- .../screens/spawner/SpawnerFooter.tsx | 5 +- .../projects/screens/spawner/SpawnerPage.tsx | 40 +++- .../imageSelector/ImageSelectorField.tsx | 3 + .../imageSelector/ImageStreamSelector.tsx | 74 +++---- .../projects/screens/spawner/spawnerUtils.ts | 54 +++-- .../pages/projects/screens/spawner/types.ts | 7 +- frontend/src/pages/projects/types.ts | 3 +- frontend/src/services/acceleratorService.ts | 12 ++ frontend/src/types.ts | 26 ++- frontend/src/utilities/imageUtils.ts | 5 - frontend/src/utilities/tolerations.ts | 18 +- .../src/utilities/useGenericObjectState.ts | 2 +- ...acceleratorprofile.opendatahub.io.crd.yaml | 39 ---- ...cceleratorprofiles.opendatahub.io.crd.yaml | 27 ++- 40 files changed, 708 insertions(+), 280 deletions(-) create mode 100644 frontend/src/api/k8s/accelerators.ts create mode 100644 frontend/src/components/SimpleDropdownSelect.scss create mode 100644 frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts create mode 100644 frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx create mode 100644 frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts create mode 100644 frontend/src/pages/notebookController/screens/server/useAccelerators.ts create mode 100644 frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts delete mode 100644 frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts create mode 100644 frontend/src/services/acceleratorService.ts delete mode 100644 manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml diff --git a/backend/src/routes/api/gpu/gpuUtils.ts b/backend/src/routes/api/gpu/gpuUtils.ts index e70cad3ece..9755a21d54 100644 --- a/backend/src/routes/api/gpu/gpuUtils.ts +++ b/backend/src/routes/api/gpu/gpuUtils.ts @@ -16,6 +16,9 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = { lastFetch: 0, }; +/** + * @deprecated + */ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => { if (storage.lastFetch >= Date.now() - 30_000) { fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`); @@ -67,11 +70,15 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => + k8sListResource({ + model: AcceleratorModel, + }).then((listResource) => listResource.items); diff --git a/frontend/src/api/k8s/notebooks.ts b/frontend/src/api/k8s/notebooks.ts index 80a87d17fb..15f825a846 100644 --- a/frontend/src/api/k8s/notebooks.ts +++ b/frontend/src/api/k8s/notebooks.ts @@ -39,7 +39,7 @@ const assembleNotebook = ( description, notebookSize, envFrom, - gpus, + accelerator, image, volumes: formVolumes, volumeMounts: formVolumeMounts, @@ -51,7 +51,7 @@ const assembleNotebook = ( const { affinity, tolerations, resources } = assemblePodSpecOptions( notebookSize.resources, - gpus, + accelerator, tolerationSettings, ); @@ -88,6 +88,7 @@ const assembleNotebook = ( 'notebooks.opendatahub.io/last-image-selection': imageSelection, 'notebooks.opendatahub.io/inject-oauth': 'true', 'opendatahub.io/username': username, + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }, name: notebookId, namespace: projectName, @@ -260,7 +261,7 @@ export const updateNotebook = ( // clean the envFrom array in case of merging the old value again container.envFrom = []; - // clean the resources, affinity and tolerations for GPU + // clean the resources, affinity and tolerations for accelerator oldNotebook.spec.template.spec.tolerations = []; oldNotebook.spec.template.spec.affinity = {}; container.resources = {}; diff --git a/frontend/src/api/k8s/servingRuntimes.ts b/frontend/src/api/k8s/servingRuntimes.ts index 07a0b2cc15..9b3998f788 100644 --- a/frontend/src/api/k8s/servingRuntimes.ts +++ b/frontend/src/api/k8s/servingRuntimes.ts @@ -23,7 +23,7 @@ const assembleServingRuntime = ( isCustomServingRuntimesEnabled: boolean, isEditing?: boolean, ): ServingRuntimeKind => { - const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth, gpus } = data; + const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth, accelerator } = data; const createName = isCustomServingRuntimesEnabled ? translateDisplayNameForK8s(displayName) : getModelServingRuntimeName(namespace); @@ -50,6 +50,7 @@ const assembleServingRuntime = ( }), ...(isCustomServingRuntimesEnabled && { 'opendatahub.io/template-display-name': getDisplayNameFromK8sResource(servingRuntime), + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }), }, }; @@ -60,6 +61,7 @@ const assembleServingRuntime = ( ...updatedServingRuntime.metadata.annotations, 'enable-route': externalRoute ? 'true' : 'false', 'enable-auth': tokenAuth ? 'true' : 'false', + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', ...(isCustomServingRuntimesEnabled && { 'openshift.io/display-name': displayName }), }, }; @@ -77,7 +79,10 @@ const assembleServingRuntime = ( }, }; - const { affinity, tolerations, resources } = assemblePodSpecOptions(resourceSettings, gpus); + const { affinity, tolerations, resources } = assemblePodSpecOptions( + resourceSettings, + accelerator, + ); updatedServingRuntime.spec.containers = servingRuntime.spec.containers.map((container) => ({ ...container, diff --git a/frontend/src/api/k8s/utils.ts b/frontend/src/api/k8s/utils.ts index 920415d757..883df66e5c 100644 --- a/frontend/src/api/k8s/utils.ts +++ b/frontend/src/api/k8s/utils.ts @@ -1,3 +1,4 @@ +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { PodAffinity, ContainerResources, @@ -9,7 +10,7 @@ import { determineTolerations } from '~/utilities/tolerations'; export const assemblePodSpecOptions = ( resourceSettings: ContainerResources, - gpus: number, + accelerator: AcceleratorState, tolerationSettings?: TolerationSettings, affinitySettings?: PodAffinity, ): { @@ -17,40 +18,34 @@ export const assemblePodSpecOptions = ( tolerations: PodToleration[]; resources: ContainerResources; } => { - let affinity: PodAffinity = structuredClone(affinitySettings || {}); + const affinity: PodAffinity = structuredClone(affinitySettings || {}); const resources = structuredClone(resourceSettings); - if (gpus > 0) { + if (accelerator.count > 0 && accelerator.accelerator) { if (!resources.limits) { resources.limits = {}; } if (!resources.requests) { resources.requests = {}; } - resources.limits[ContainerResourceAttributes.NVIDIA_GPU] = gpus; - resources.requests[ContainerResourceAttributes.NVIDIA_GPU] = gpus; + resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; + resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { - delete resources.limits?.[ContainerResourceAttributes.NVIDIA_GPU]; - delete resources.requests?.[ContainerResourceAttributes.NVIDIA_GPU]; - affinity = { - nodeAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - { - preference: { - matchExpressions: [ - { - key: 'nvidia.com/gpu.present', - operator: 'NotIn', - values: ['true'], - }, - ], - }, - weight: 1, - }, - ], - }, - }; + // step type down to string to avoid type errors + const containerResourceKeys: string[] = Object.keys(ContainerResourceAttributes); + + Object.keys(resources.limits || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.limits?.[key]; + } + }); + + Object.keys(resources.requests || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.requests?.[key]; + } + }); } - const tolerations = determineTolerations(gpus > 0, tolerationSettings); + const tolerations = determineTolerations(tolerationSettings, accelerator.accelerator); return { affinity, tolerations, resources }; }; diff --git a/frontend/src/api/models/openShift.ts b/frontend/src/api/models/openShift.ts index 543ff77a38..3d4ecf4735 100644 --- a/frontend/src/api/models/openShift.ts +++ b/frontend/src/api/models/openShift.ts @@ -55,3 +55,10 @@ export const TemplateModel: K8sModelCommon = { kind: 'Template', plural: 'templates', }; + +export const AcceleratorModel: K8sModelCommon = { + apiVersion: 'v1alpha', + apiGroup: 'dashboard.opendatahub.io', + kind: 'AcceleratorProfile', + plural: 'acceleratorprofiles', +}; diff --git a/frontend/src/components/SimpleDropdownSelect.scss b/frontend/src/components/SimpleDropdownSelect.scss new file mode 100644 index 0000000000..bcb8baf49f --- /dev/null +++ b/frontend/src/components/SimpleDropdownSelect.scss @@ -0,0 +1,3 @@ +.full-width { + width: 100%; +} \ No newline at end of file diff --git a/frontend/src/components/SimpleDropdownSelect.tsx b/frontend/src/components/SimpleDropdownSelect.tsx index c1d0775549..a987e0d270 100644 --- a/frontend/src/components/SimpleDropdownSelect.tsx +++ b/frontend/src/components/SimpleDropdownSelect.tsx @@ -1,11 +1,20 @@ import * as React from 'react'; import { Dropdown, DropdownItem, DropdownToggle } from '@patternfly/react-core'; +import './SimpleDropdownSelect.scss'; type SimpleDropdownProps = { - options: { key: string; label: React.ReactNode }[]; + options: { + key: string; + label: React.ReactNode; + description?: React.ReactNode; + selectedLabel?: React.ReactNode; + isPlaceholder?: boolean; + }[]; value: string; placeholder?: string; - onChange: (key: string) => void; + onChange: (key: string, isPlaceholder: boolean) => void; + isFullWidth?: boolean; + isDisabled?: boolean; } & Omit, 'isOpen' | 'toggle' | 'dropdownItems' | 'onChange'>; const SimpleDropdownSelect: React.FC = ({ @@ -13,30 +22,43 @@ const SimpleDropdownSelect: React.FC = ({ options, placeholder = 'Select...', value, + isFullWidth, + isDisabled, ...props }) => { const [open, setOpen] = React.useState(false); + const selectedOption = options.find(({ key }) => key === value); + const selectedLabel = selectedOption?.selectedLabel ?? selectedOption?.label ?? placeholder; + return ( setOpen(!open)}> - <>{options.find(({ key }) => key === value)?.label ?? placeholder} + setOpen(!open)} + > + <>{selectedLabel} } - dropdownItems={options.map(({ key, label }) => ( - { - onChange(key); - setOpen(false); - }} - > - {label} - - ))} + dropdownItems={options + .sort((a, b) => (a.isPlaceholder === b.isPlaceholder ? 0 : a.isPlaceholder ? -1 : 1)) + .map(({ key, label, description, isPlaceholder }) => ( + { + onChange(key, !!isPlaceholder); + setOpen(false); + }} + > + {isPlaceholder ? {label} : label} + + ))} /> ); }; diff --git a/frontend/src/k8sTypes.ts b/frontend/src/k8sTypes.ts index dc2b3f209b..00774fbedc 100644 --- a/frontend/src/k8sTypes.ts +++ b/frontend/src/k8sTypes.ts @@ -43,7 +43,10 @@ type DisplayNameAnnotations = Partial<{ export type K8sDSGResource = K8sResourceCommon & { metadata: { - annotations?: DisplayNameAnnotations; + annotations?: DisplayNameAnnotations & + Partial<{ + 'opendatahub.io/recommended-accelerators': string; + }>; name: string; }; }; @@ -69,6 +72,7 @@ export type NotebookAnnotations = Partial<{ 'opendatahub.io/username': string; // the untranslated username behind the notebook 'notebooks.opendatahub.io/last-image-selection': string; // the last image they selected 'notebooks.opendatahub.io/last-size-selection': string; // the last notebook size they selected + 'opendatahub.io/accelerator-name': string; // the accelerator attached to the notebook }>; export type DashboardLabels = { @@ -91,6 +95,8 @@ export type ServingRuntimeAnnotations = Partial<{ 'opendatahub.io/template-name': string; 'opendatahub.io/template-display-name': string; 'opendatahub.io/disable-gpu': string; + 'opendatahub.io/recommended-accelerators': string; + 'opendatahub.io/accelerator-name': string; 'enable-route': string; 'enable-auth': string; }>; @@ -715,3 +721,19 @@ export type DashboardConfigKind = K8sResourceCommon & { templateOrder?: string[]; }; }; + +export type AcceleratorKind = K8sResourceCommon & { + metadata: { + name: string; + annotations?: Partial<{ + 'opendatahub.io/modified-date': string; + }>; + }; + spec: { + displayName: string; + enabled: boolean; + identifier: string; + description?: string; + tolerations?: PodToleration[]; + }; +}; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx index f50ab5d8ce..d80cfd7ad7 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx @@ -10,8 +10,8 @@ import { } from '@patternfly/react-core'; import { ServingRuntimeKind } from '~/k8sTypes'; import { AppContext } from '~/app/AppContext'; -import { ContainerResourceAttributes } from '~/types'; import { getServingRuntimeSizes } from './utils'; +import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; type ServingRuntimeDetailsProps = { obj: ServingRuntimeKind; @@ -22,6 +22,7 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => const container = obj.spec.containers[0]; // can we assume the first container? const sizes = getServingRuntimeSizes(dashboardConfig); const size = sizes.find((size) => _.isEqual(size.resources, container.resources)); + const [accelerator] = useServingRuntimeAccelerator(obj); return ( @@ -44,11 +45,15 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => - Number of GPUs + Accelerator - {container.resources.limits?.[ContainerResourceAttributes.NVIDIA_GPU] || 0} + {accelerator.accelerator?.spec.displayName || 'unknown'} + + Number of accelerators + {accelerator.count} + ); }; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx index a9ae72648b..393bae629d 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx @@ -127,7 +127,9 @@ const ManageServingRuntimeModal: React.FC = ({ } const servingRuntimeData = { ...createData, - gpus: isGpuDisabled(servingRuntimeSelected) ? 0 : createData.gpus, + accelerator: isGpuDisabled(servingRuntimeSelected) + ? { accelerator: undefined, count: 0 } + : createData.accelerator, }; const servingRuntimeName = translateDisplayNameForK8s(servingRuntimeData.name); const createRolebinding = servingRuntimeData.tokenAuth && allowCreate; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx index bd04dad8d4..26432ffca8 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx @@ -2,7 +2,6 @@ import * as React from 'react'; import { FormGroup, FormSection, - NumberInput, Select, SelectOption, Stack, @@ -13,9 +12,10 @@ import { CreatingServingRuntimeObject, ServingRuntimeSize, } from '~/pages/modelServing/screens/types'; -import useGPUSetting from '~/pages/notebookController/screens/server/useGPUSetting'; import { ServingRuntimeKind } from '~/k8sTypes'; import { isGpuDisabled } from '~/pages/modelServing/screens/projects/utils'; +import AcceleratorSelectField from '~/pages/notebookController/screens/server/AcceleratorSelectField'; +import { getCompatibleAcceleratorIdentifiers } from '~/pages/projects/screens/spawner/spawnerUtils'; import ServingRuntimeSizeExpandedField from './ServingRuntimeSizeExpandedField'; type ServingRuntimeSizeSectionProps = { @@ -32,7 +32,15 @@ const ServingRuntimeSizeSection: React.FC = ({ servingRuntimeSelected, }) => { const [sizeDropdownOpen, setSizeDropdownOpen] = React.useState(false); - const { available: gpuAvailable, count: gpuCount } = useGPUSetting('autodetect'); + const [supportedAccelerators, setSupportedAccelerators] = React.useState(); + + React.useEffect(() => { + if (servingRuntimeSelected) { + setSupportedAccelerators(getCompatibleAcceleratorIdentifiers(servingRuntimeSelected)); + } else { + setSupportedAccelerators(undefined); + } + }, [servingRuntimeSelected]); const gpuDisabled = servingRuntimeSelected ? isGpuDisabled(servingRuntimeSelected) : false; @@ -88,25 +96,17 @@ const ServingRuntimeSizeSection: React.FC = ({ )} - {gpuAvailable && !gpuDisabled && ( - - ) => { - const target = event.currentTarget; - setData('gpus', parseInt(target.value) || 0); - }} - onBlur={(event: React.FormEvent) => { - const target = event.currentTarget; - const gpuInput = parseInt(target.value) || 0; - setData('gpus', Math.max(0, Math.min(gpuCount, gpuInput))); - }} - onMinus={() => setData('gpus', data.gpus - 1)} - onPlus={() => setData('gpus', data.gpus + 1)} + {!gpuDisabled && ( + + + setData('accelerator', { ...data.accelerator, accelerator }) + } + setAcceleratorCount={(count) => setData('accelerator', { ...data.accelerator, count })} + supportedAccelerators={supportedAccelerators} + supportedText="Compatible with serving runtime" /> )} diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx index 0b8f9ddd46..fddc781f66 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx @@ -1,5 +1,5 @@ import * as React from 'react'; -import { FormGroup, Select, SelectOption, StackItem, TextInput } from '@patternfly/react-core'; +import { FormGroup, Label, Split, SplitItem, StackItem, TextInput } from '@patternfly/react-core'; import { UpdateObjectAtPropAndValue } from '~/pages/projects/types'; import { CreatingServingRuntimeObject } from '~/pages/modelServing/screens/types'; import { TemplateKind } from '~/k8sTypes'; @@ -7,6 +7,8 @@ import { getServingRuntimeDisplayNameFromTemplate, getServingRuntimeNameFromTemplate, } from '~/pages/modelServing/customServingRuntimes/utils'; +import { isCompatibleWithAccelerator } from '~/pages/projects/screens/spawner/spawnerUtils'; +import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; type ServingRuntimeTemplateSectionProps = { data: CreatingServingRuntimeObject; @@ -21,16 +23,22 @@ const ServingRuntimeTemplateSection: React.FC { - const [isOpen, setOpen] = React.useState(false); - - const options = templates.map((template) => ( - - {getServingRuntimeDisplayNameFromTemplate(template)} - - )); + const options = templates.map((template) => ({ + key: getServingRuntimeNameFromTemplate(template), + selectedLabel: getServingRuntimeDisplayNameFromTemplate(template), + label: ( + + {getServingRuntimeDisplayNameFromTemplate(template)} + + + {isCompatibleWithAccelerator( + data.accelerator.accelerator?.spec.identifier, + template.objects[0], + ) && } + + + ), + })); return ( <> @@ -46,22 +54,20 @@ const ServingRuntimeTemplateSection: React.FC - + id="serving-runtime-template-selection" + aria-label="Select a template" + options={options} + placeholder={ + isEditing || templates.length === 0 ? data.servingRuntimeTemplateName : 'Select one' + } + value={data.servingRuntimeTemplateName ?? ''} + onChange={(name) => { + setData('servingRuntimeTemplateName', name); + }} + /> diff --git a/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts b/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts new file mode 100644 index 0000000000..fae8b158b9 --- /dev/null +++ b/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts @@ -0,0 +1,42 @@ +import React, { useRef } from 'react'; +import { ServingRuntimeKind } from '~/k8sTypes'; +import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; +import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; + +const useServingRuntimeAccelerator = ( + servingRuntime?: ServingRuntimeKind, +): GenericObjectState => { + const [acceleratorState, setData, resetData] = useGenericObjectState({ + accelerator: undefined, + count: 0, + }); + + const hasSet = useRef(false); + + const [accelerators, loaded, loadError] = useAccelerators(); + + React.useEffect(() => { + if ( + loaded && + !loadError && + servingRuntime && + servingRuntime?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && + !hasSet.current + ) { + const name = servingRuntime.metadata.annotations['opendatahub.io/accelerator-name']; + const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); + const container = servingRuntime?.spec.containers[0]; + + if (accelerator && container) { + hasSet.current = true; + setData('accelerator', accelerator); + setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier]) ?? 0); + } + } + }, [accelerators, loaded, loadError, servingRuntime, setData]); + + return [acceleratorState, setData, resetData]; +}; + +export default useServingRuntimeAccelerator; diff --git a/frontend/src/pages/modelServing/screens/projects/utils.ts b/frontend/src/pages/modelServing/screens/projects/utils.ts index 616789f8c1..54ef4e6d20 100644 --- a/frontend/src/pages/modelServing/screens/projects/utils.ts +++ b/frontend/src/pages/modelServing/screens/projects/utils.ts @@ -8,7 +8,7 @@ import { InferenceServiceStorageType, ServingRuntimeSize, } from '~/pages/modelServing/screens/types'; -import { ContainerResourceAttributes, DashboardConfig } from '~/types'; +import { DashboardConfig } from '~/types'; import { DEFAULT_MODEL_SERVER_SIZES } from '~/pages/modelServing/screens/const'; import { useAppContext } from '~/app/AppContext'; import { useDeepCompareMemoize } from '~/utilities/useDeepCompareMemoize'; @@ -16,6 +16,7 @@ import { EMPTY_AWS_SECRET_DATA } from '~/pages/projects/dataConnections/const'; import { getDisplayNameFromK8sResource } from '~/pages/projects/utils'; import { getDisplayNameFromServingRuntimeTemplate } from '~/pages/modelServing/customServingRuntimes/utils'; import { isCpuLimitEqual, isMemoryLimitEqual } from '~/utilities/valueUnits'; +import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; export const getServingRuntimeSizes = (config: DashboardConfig): ServingRuntimeSize[] => { let sizes = config.spec.modelServerSizes || []; @@ -54,6 +55,8 @@ export const useCreateServingRuntimeObject = (existingData?: { ] => { const { dashboardConfig } = useAppContext(); + const [existingAccelerator] = useServingRuntimeAccelerator(existingData?.servingRuntime); + const sizes = useDeepCompareMemoize(getServingRuntimeSizes(dashboardConfig)); const createModelState = useGenericObjectState({ @@ -61,7 +64,7 @@ export const useCreateServingRuntimeObject = (existingData?: { servingRuntimeTemplateName: '', numReplicas: 1, modelSize: sizes[0], - gpus: 0, + accelerator: existingAccelerator, externalRoute: false, tokenAuth: false, tokens: [], @@ -82,11 +85,6 @@ export const useCreateServingRuntimeObject = (existingData?: { const existingResources = existingData?.servingRuntime?.spec?.containers[0]?.resources || sizes[0].resources; - const existingGpus = - existingData?.servingRuntime?.spec?.containers[0]?.resources?.requests?.[ - ContainerResourceAttributes.NVIDIA_GPU - ] || 0; - const existingExternalRoute = existingData?.servingRuntime?.metadata.annotations?.['enable-route'] === 'true'; const existingTokenAuth = @@ -118,10 +116,7 @@ export const useCreateServingRuntimeObject = (existingData?: { resources: existingResources, }, ); - setCreateData( - 'gpus', - typeof existingGpus == 'string' ? parseInt(existingGpus) : existingGpus, - ); + setCreateData('accelerator', existingAccelerator); setCreateData('externalRoute', existingExternalRoute); setCreateData('tokenAuth', existingTokenAuth); setCreateData('tokens', existingTokens); @@ -131,7 +126,7 @@ export const useCreateServingRuntimeObject = (existingData?: { existingServingRuntimeTemplateName, existingNumReplicas, existingResources, - existingGpus, + existingAccelerator, existingExternalRoute, existingTokenAuth, existingTokens, diff --git a/frontend/src/pages/modelServing/screens/types.ts b/frontend/src/pages/modelServing/screens/types.ts index 7b66c456c5..557f7bda26 100644 --- a/frontend/src/pages/modelServing/screens/types.ts +++ b/frontend/src/pages/modelServing/screens/types.ts @@ -1,3 +1,4 @@ +import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { EnvVariableDataEntry } from '~/pages/projects/types'; import { ContainerResources } from '~/types'; @@ -34,7 +35,7 @@ export type CreatingServingRuntimeObject = { servingRuntimeTemplateName: string; numReplicas: number; modelSize: ServingRuntimeSize; - gpus: number; + accelerator: AcceleratorState; externalRoute: boolean; tokenAuth: boolean; tokens: ServingRuntimeToken[]; diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx new file mode 100644 index 0000000000..dcb0d45f62 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -0,0 +1,200 @@ +import * as React from 'react'; +import { + Alert, + AlertVariant, + FormGroup, + InputGroup, + Label, + NumberInput, + Split, + SplitItem, + Stack, + StackItem, +} from '@patternfly/react-core'; +import { isHTMLInputElement } from '~/utilities/utils'; +import { AcceleratorKind } from '~/k8sTypes'; +import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; +import useAccelerators from './useAccelerators'; +import useAcceleratorCounts from './useAcceleratorCounts'; + +type AcceleratorSelectFieldProps = { + accelerator?: AcceleratorKind; + setAccelerator: (accelerator?: AcceleratorKind) => void; + acceleratorCount?: number; + setAcceleratorCount: (size: number) => void; + supportedAccelerators?: string[]; + supportedText?: string; +}; + +const AcceleratorSelectField: React.FC = ({ + accelerator, + setAccelerator, + acceleratorCount = 0, + setAcceleratorCount, + supportedAccelerators, + supportedText, +}) => { + const [accelerators, loaded, loadError] = useAccelerators(); + const [detectedAcceleratorInfo] = useAcceleratorCounts(); + + const validateAcceleratorCount = React.useCallback( + (newSize: number) => { + if (!accelerator) { + return ''; + } + + const detectedAcceleratorCount = Object.entries(detectedAcceleratorInfo.available).find( + ([identifier]) => accelerator?.spec.identifier === identifier, + )?.[1]; + + if (detectedAcceleratorCount === undefined) { + return `No accelerator detected with the identifier ${accelerator?.spec.identifier} detected.`; + } else if (newSize > detectedAcceleratorCount) { + return `Only ${detectedAcceleratorCount} accelerator${ + detectedAcceleratorCount > 1 ? 's' : '' + } detected.`; + } + + return ''; + }, + [accelerator, detectedAcceleratorInfo.available], + ); + + React.useEffect(() => { + if (acceleratorCount > 0) { + setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); + } + }, [acceleratorCount, validateAcceleratorCount]); + + const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState( + validateAcceleratorCount(acceleratorCount), + ); + + const isAcceleratorSupported = (accelerator: AcceleratorKind) => + supportedAccelerators?.includes(accelerator.spec.identifier); + + const enabledAccelerators = accelerators.filter((ac) => ac.spec.enabled); + + const options = enabledAccelerators + .sort((a, b) => { + const aSupported = isAcceleratorSupported(a); + const bSupported = isAcceleratorSupported(b); + if (aSupported && !bSupported) { + return -1; + } + if (!aSupported && bSupported) { + return 1; + } + return 0; + }) + .map((ac) => ({ + key: ac.metadata.name, + selectedLabel: ac.spec.displayName, + description: ac.spec.description, + label: ( + + {ac.spec.displayName} + + + {isAcceleratorSupported(ac) && ( + + )} + + + ), + })); + + let acceleratorAlertMessage: { title: string; variant: AlertVariant } | null = null; + if (accelerator && supportedAccelerators !== undefined) { + if (supportedAccelerators?.length === 0) { + acceleratorAlertMessage = { + title: + "The image you have selected doesn't support the selected accelerator. It is recommended to use a compatible image for optimal performance.", + variant: AlertVariant.info, + }; + } else if (!isAcceleratorSupported(accelerator)) { + acceleratorAlertMessage = { + title: 'The image you have selected is not compatible with the selected accelerator', + variant: AlertVariant.warning, + }; + } + } + + const onStep = (step: number) => { + setAcceleratorCount(Math.max(acceleratorCount + step, 0)); + }; + + if (!loaded || loadError || enabledAccelerators.length === 0) { + return <>; + } + + return ( + + + + { + if (isPlaceholder) { + setAccelerator(undefined); + setAcceleratorCount(0); + } else { + setAccelerator(accelerators.find((ac) => ac.metadata.name === key)); + } + }} + > + + + {acceleratorAlertMessage && ( + + + + )} + {accelerator && ( + + + + onStep(1)} + onMinus={() => onStep(-1)} + onChange={(event) => { + if (isHTMLInputElement(event.target)) { + const newSize = Number(event.target.value); + setAcceleratorCount(newSize); + } + }} + /> + + + + )} + {acceleratorCountWarning && ( + + + + )} + + ); +}; + +export default AcceleratorSelectField; diff --git a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx index e111770c84..1dfb12a76a 100644 --- a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx +++ b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx @@ -16,11 +16,11 @@ import { getDescriptionForTag, getImageTagByContainer, getNameVersionString, - getNumGpus, } from '~/utilities/imageUtils'; import { useAppContext } from '~/app/AppContext'; import { useWatchImages } from '~/utilities/useWatchImages'; import { NotebookControllerContext } from '~/pages/notebookController/NotebookControllerContext'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { getNotebookSizes } from './usePreferredNotebookSize'; const NotebookServerDetails: React.FC = () => { @@ -28,6 +28,7 @@ const NotebookServerDetails: React.FC = () => { const { images, loaded } = useWatchImages(); const [isExpanded, setExpanded] = React.useState(false); const { dashboardConfig } = useAppContext(); + const [accelerator] = useNotebookAccelerator(notebook); const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( (container) => container.name === notebook.metadata.name, @@ -45,7 +46,6 @@ const NotebookServerDetails: React.FC = () => { const tagSoftware = getDescriptionForTag(tag); const tagDependencies = tag?.content.dependencies ?? []; - const numGpus = getNumGpus(container); const sizes = getNotebookSizes(dashboardConfig); const size = sizes.find((size) => _.isEqual(size.resources.limits, container.resources?.limits)); @@ -106,8 +106,14 @@ const NotebookServerDetails: React.FC = () => { {`${container.resources?.requests?.cpu} CPU, ${container.resources?.requests?.memory} Memory`} - Number of GPUs - {numGpus} + Accelerator + + {accelerator.accelerator?.spec.displayName || 'unknown'} + + + + Number of accelerators + {accelerator.count} diff --git a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx index 71a8e6ade5..985773e1e3 100644 --- a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx +++ b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx @@ -39,7 +39,7 @@ import ImpersonateAlert from '~/pages/notebookController/screens/admin/Impersona import useNamespaces from '~/pages/notebookController/useNamespaces'; import { fireTrackingEvent } from '~/utilities/segmentIOUtils'; import { getEnvConfigMap, getEnvSecret } from '~/services/envService'; -import GPUSelectField from './GPUSelectField'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import SizeSelectField from './SizeSelectField'; import useSpawnerNotebookModalState from './useSpawnerNotebookModalState'; import BrowserTabPreferenceCheckbox from './BrowserTabPreferenceCheckbox'; @@ -49,6 +49,7 @@ import { usePreferredNotebookSize } from './usePreferredNotebookSize'; import StartServerModal from './StartServerModal'; import '~/pages/notebookController/NotebookController.scss'; +import AcceleratorSelectField from './AcceleratorSelectField'; const SpawnerPage: React.FC = () => { const navigate = useNavigate(); @@ -68,7 +69,7 @@ const SpawnerPage: React.FC = () => { tag: undefined, }); const { selectedSize, setSelectedSize, sizes } = usePreferredNotebookSize(); - const [selectedGpu, setSelectedGpu] = React.useState('0'); + const [accelerator, setAccelerator] = useNotebookAccelerator(currentUserNotebook); const [variableRows, setVariableRows] = React.useState([]); const [submitError, setSubmitError] = React.useState(null); @@ -231,7 +232,8 @@ const SpawnerPage: React.FC = () => { const fireStartServerEvent = () => { fireTrackingEvent('Notebook Server Started', { - GPU: parseInt(selectedGpu), + accelerator: accelerator.accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', + acceleratorCount: accelerator.count, lastSelectedSize: selectedSize.name, lastSelectedImage: `${selectedImageTag.image?.name}:${selectedImageTag.tag?.name}`, }); @@ -246,7 +248,7 @@ const SpawnerPage: React.FC = () => { notebookSizeName: selectedSize.name, imageName: selectedImageTag.image?.name || '', imageTagName: selectedImageTag.tag?.name || '', - gpus: parseInt(selectedGpu), + accelerator: accelerator, envVars: envVars, state: NotebookState.Started, username: impersonatedUsername || undefined, @@ -307,7 +309,12 @@ const SpawnerPage: React.FC = () => { setValue={(size) => setSelectedSize(size)} sizes={sizes} /> - setSelectedGpu(size)} /> + setAccelerator('accelerator', accelerator)} + acceleratorCount={accelerator.count} + setAcceleratorCount={(acceleratorCount) => setAccelerator('count', acceleratorCount)} + /> {renderEnvironmentVariableRows()} diff --git a/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts b/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts new file mode 100644 index 0000000000..1b5c879327 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/useAcceleratorCounts.ts @@ -0,0 +1,13 @@ +import useFetchState, { FetchState } from '~/utilities/useFetchState'; +import { getAcceleratorCounts } from '~/services/acceleratorService'; +import { AcceleratorInfo } from '~/types'; + +const useAcceleratorCounts = (): FetchState => + useFetchState(getAcceleratorCounts, { + available: {}, + total: {}, + allocated: {}, + configured: false, + }); + +export default useAcceleratorCounts; diff --git a/frontend/src/pages/notebookController/screens/server/useAccelerators.ts b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts new file mode 100644 index 0000000000..059349a650 --- /dev/null +++ b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts @@ -0,0 +1,8 @@ +import useFetchState, { FetchState } from '~/utilities/useFetchState'; +import { AcceleratorKind } from '~/k8sTypes'; +import { listAccelerators } from '~/api'; + +const useAccelerators = (): FetchState => + useFetchState(listAccelerators, []); + +export default useAccelerators; diff --git a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx index 83b54db070..c950fdbbd9 100644 --- a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx +++ b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx @@ -2,7 +2,7 @@ import * as React from 'react'; import { Flex, FlexItem, Switch } from '@patternfly/react-core'; import { startNotebook, stopNotebook } from '~/api'; import { fireTrackingEvent } from '~/utilities/segmentIOUtils'; -import useNotebookGPUNumber from '~/pages/projects/screens/detail/notebooks/useNotebookGPUNumber'; +import useNotebookAccelerators from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import useNotebookDeploymentSize from '~/pages/projects/screens/detail/notebooks/useNotebookDeploymentSize'; import { computeNotebooksTolerations } from '~/utilities/tolerations'; import { useAppContext } from '~/app/AppContext'; @@ -25,7 +25,7 @@ const NotebookStatusToggle: React.FC = ({ enablePipelines, }) => { const { notebook, isStarting, isRunning, refresh } = notebookState; - const gpuNumber = useNotebookGPUNumber(notebook); + const [acceleratorData] = useNotebookAccelerators(notebook); const { size } = useNotebookDeploymentSize(notebook); const [isOpenConfirm, setOpenConfirm] = React.useState(false); const [inProgress, setInProgress] = React.useState(false); @@ -51,7 +51,10 @@ const NotebookStatusToggle: React.FC = ({ const fireNotebookTrackingEvent = React.useCallback( (action: 'started' | 'stopped') => { fireTrackingEvent(`Workbench ${action}`, { - GPU: gpuNumber, + acceleratorCount: acceleratorData.count, + accelerator: acceleratorData.accelerator + ? JSON.stringify(acceleratorData.accelerator) + : 'unknown', lastSelectedSize: size?.name || notebook.metadata.annotations?.['notebooks.opendatahub.io/last-size-selection'], @@ -64,7 +67,7 @@ const NotebookStatusToggle: React.FC = ({ }), }); }, - [gpuNumber, notebook, size], + [acceleratorData, notebook, size], ); const handleStop = React.useCallback(() => { diff --git a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts new file mode 100644 index 0000000000..14abb51958 --- /dev/null +++ b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts @@ -0,0 +1,49 @@ +import React, { useRef } from 'react'; +import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; +import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; +import { Notebook, NotebookContainer } from '~/types'; +import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; + +export type AcceleratorState = { + accelerator?: AcceleratorKind; + count: number; +}; + +const useNotebookAccelerator = ( + notebook?: NotebookKind | Notebook | null, +): GenericObjectState => { + const [acceleratorState, setData, resetData] = useGenericObjectState({ + accelerator: undefined, + count: 0, + }); + + const hasSet = useRef(false); + + const [accelerators, loaded, loadError] = useAccelerators(); + + React.useEffect(() => { + if ( + loaded && + !loadError && + notebook && + notebook?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && + !hasSet.current + ) { + notebook.spec.template; + const name = notebook.metadata.annotations['opendatahub.io/accelerator-name']; + const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); + const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( + (container) => container.name === notebook.metadata.name, + ); + if (accelerator && container) { + hasSet.current = true; + setData('accelerator', accelerator); + setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier] ?? 0)); + } + } + }, [accelerators, loaded, loadError, notebook, setData]); + + return [acceleratorState, setData, resetData]; +}; + +export default useNotebookAccelerator; diff --git a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts deleted file mode 100644 index d980f91009..0000000000 --- a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookGPUNumber.ts +++ /dev/null @@ -1,14 +0,0 @@ -import { NotebookKind } from '~/k8sTypes'; -import { ContainerResourceAttributes, GPUCount, NotebookContainer } from '~/types'; - -const useNotebookGPUNumber = (notebook?: NotebookKind): GPUCount => { - const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( - (container) => container.name === notebook.metadata.name, - ); - - const gpuNumbers = container?.resources?.limits?.[ContainerResourceAttributes.NVIDIA_GPU]; - - return gpuNumbers || 0; -}; - -export default useNotebookGPUNumber; diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx index df82b5cb05..be3ebfb604 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx @@ -78,9 +78,10 @@ const SpawnerFooter: React.FC = ({ ); const afterStart = (name: string, type: 'created' | 'updated') => { - const { gpus, notebookSize, image } = startNotebookData; + const { accelerator, notebookSize, image } = startNotebookData; fireTrackingEvent(`Workbench ${type}`, { - GPU: gpus, + acceleratorCount: accelerator.count, + accelerator: accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', lastSelectedSize: notebookSize.name, lastSelectedImage: image.imageVersion?.from ? `${image.imageVersion.from.name}` diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx index ee908b2a5a..1d361873c4 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx @@ -21,14 +21,14 @@ import { getNotebookDisplayName, getProjectDisplayName, } from '~/pages/projects/utils'; -import GPUSelectField from '~/pages/notebookController/screens/server/GPUSelectField'; import { NotebookKind } from '~/k8sTypes'; import useNotebookImageData from '~/pages/projects/screens/detail/notebooks/useNotebookImageData'; import useNotebookDeploymentSize from '~/pages/projects/screens/detail/notebooks/useNotebookDeploymentSize'; -import useNotebookGPUNumber from '~/pages/projects/screens/detail/notebooks/useNotebookGPUNumber'; import NotebookRestartAlert from '~/pages/projects/components/NotebookRestartAlert'; import useWillNotebooksRestart from '~/pages/projects/notebook/useWillNotebooksRestart'; import CanEnableElyraPipelinesCheck from '~/concepts/pipelines/elyra/CanEnableElyraPipelinesCheck'; +import AcceleratorSelectField from '~/pages/notebookController/screens/server/AcceleratorSelectField'; +import useNotebookAccelerator from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { SpawnerPageSectionID } from './types'; import { ScrollableSelectorID, SpawnerPageSectionTitles } from './const'; import SpawnerFooter from './SpawnerFooter'; @@ -38,7 +38,11 @@ import { useNotebookSize } from './useNotebookSize'; import StorageField from './storage/StorageField'; import EnvironmentVariables from './environmentVariables/EnvironmentVariables'; import { useStorageDataObject } from './storage/utils'; -import { getRootVolumeName, useMergeDefaultPVCName } from './spawnerUtils'; +import { + getCompatibleAcceleratorIdentifiers, + getRootVolumeName, + useMergeDefaultPVCName, +} from './spawnerUtils'; import { useNotebookEnvVariables } from './environmentVariables/useNotebookEnvVariables'; import DataConnectionField from './dataConnection/DataConnectionField'; import { useNotebookDataConnection } from './dataConnection/useNotebookDataConnection'; @@ -61,7 +65,7 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { imageVersion: undefined, }); const { selectedSize, setSelectedSize, sizes } = useNotebookSize(); - const [selectedGpu, setSelectedGpu] = React.useState('0'); + const [supportedAccelerators, setSupportedAccelerators] = React.useState(); const [storageDataWithoutDefault, setStorageData] = useStorageDataObject(existingNotebook); const storageData = useMergeDefaultPVCName(storageDataWithoutDefault, nameDesc.name); const [envVariables, setEnvVariables] = useNotebookEnvVariables(existingNotebook); @@ -97,10 +101,16 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { } }, [notebookSize, setSelectedSize]); - const notebookGPU = useNotebookGPUNumber(existingNotebook); + const [notebookAcceleratorState, setNotebookAcceleratorState] = + useNotebookAccelerator(existingNotebook); + React.useEffect(() => { - setSelectedGpu(notebookGPU.toString()); - }, [notebookGPU, setSelectedGpu]); + if (selectedImage.imageStream) { + setSupportedAccelerators(getCompatibleAcceleratorIdentifiers(selectedImage.imageStream)); + } else { + setSupportedAccelerators(undefined); + } + }, [selectedImage.imageStream]); const editNotebookDisplayName = existingNotebook ? getNotebookDisplayName(existingNotebook) : ''; @@ -162,6 +172,7 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { = ({ existingNotebook }) => { setValue={setSelectedSize} value={selectedSize} /> - setSelectedGpu(value)} + + setNotebookAcceleratorState('accelerator', accelerator) + } + acceleratorCount={notebookAcceleratorState.count} + setAcceleratorCount={(acceleratorCount) => + setNotebookAcceleratorState('count', acceleratorCount) + } + supportedAccelerators={supportedAccelerators} /> = ({ existingNotebook }) => { projectName: currentProject.metadata.name, image: selectedImage, notebookSize: selectedSize, - gpus: parseInt(selectedGpu), + accelerator: notebookAcceleratorState, volumes: [], volumeMounts: [], }} diff --git a/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx b/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx index e7f6d6bf35..883908472b 100644 --- a/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx +++ b/frontend/src/pages/projects/screens/spawner/imageSelector/ImageSelectorField.tsx @@ -17,11 +17,13 @@ import ImageStreamSelector from './ImageStreamSelector'; type ImageSelectorFieldProps = { selectedImage: ImageStreamAndVersion; setSelectedImage: React.Dispatch>; + compatibleAccelerator?: string; }; const ImageSelectorField: React.FC = ({ selectedImage, setSelectedImage, + compatibleAccelerator, }) => { const { dashboardNamespace } = useDashboardNamespace(); const buildStatuses = useBuildStatuses(dashboardNamespace); @@ -69,6 +71,7 @@ const ImageSelectorField: React.FC = ({ buildStatuses={buildStatuses} onImageStreamSelect={onImageStreamSelect} selectedImageStream={selectedImage.imageStream} + compatibleAccelerator={compatibleAccelerator} /> void; + compatibleAccelerator?: string; }; const ImageStreamSelector: React.FC = ({ @@ -22,49 +24,47 @@ const ImageStreamSelector: React.FC = ({ selectedImageStream, onImageStreamSelect, buildStatuses, + compatibleAccelerator, }) => { - const [imageSelectionOpen, setImageSelectionOpen] = React.useState(false); - - const selectOptionObjects = [...imageStreams] - .sort(compareImageStreamOrder) - .map((imageStream) => getImageStreamSelectOptionObject(imageStream)); - - const options = selectOptionObjects.map((optionObject) => { - const imageStream = optionObject.imageStream; + const options = [...imageStreams].sort(compareImageStreamOrder).map((imageStream) => { const description = getRelatedVersionDescription(imageStream); - return ( - - ); + const displayName = getImageStreamDisplayName(imageStream); + + return { + key: imageStream.metadata.name, + selectedLabel: displayName, + description: description, + disabled: !checkImageStreamAvailability(imageStream, buildStatuses), + label: ( + + {displayName} + + + {isCompatibleWithAccelerator(compatibleAccelerator, imageStream) && ( + + )} + + + ), + }; }); return ( - + /> ); }; diff --git a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts index dddafdac94..1fcef275b0 100644 --- a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts +++ b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts @@ -1,8 +1,13 @@ import * as React from 'react'; import compareVersions from 'compare-versions'; -import { K8sResourceCommon } from '@openshift/dynamic-plugin-sdk-utils'; -import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types'; -import { BuildKind, ImageStreamKind, ImageStreamSpecTagType, NotebookKind } from '~/k8sTypes'; +import { BYONImage, K8sResourceCommon, NotebookSize, Volume, VolumeMount } from '~/types'; +import { + BuildKind, + ImageStreamKind, + ImageStreamSpecTagType, + K8sDSGResource, + NotebookKind, +} from '~/k8sTypes'; import { ConfigMapCategory, DataConnectionData, @@ -17,7 +22,6 @@ import { ROOT_MOUNT_PATH } from '~/pages/projects/pvc/const'; import { AWS_FIELDS } from '~/pages/projects/dataConnections/const'; import { BuildStatus, - ImageStreamSelectOptionObjectType, ImageVersionDependencyType, ImageVersionSelectOptionObjectType, } from './types'; @@ -65,12 +69,6 @@ export const getNameVersionString = (software: ImageVersionDependencyType): stri * Create object for PF Select component to use * `toString` decides the text shown for the select option */ -export const getImageStreamSelectOptionObject = ( - imageStream: ImageStreamKind, -): ImageStreamSelectOptionObjectType => ({ - imageStream, - toString: () => getImageStreamDisplayName(imageStream), -}); export const getImageVersionSelectOptionObject = ( imageStream: ImageStreamKind, imageVersion: ImageStreamSpecTagType, @@ -79,15 +77,10 @@ export const getImageVersionSelectOptionObject = ( toString: () => `${imageVersion.name}${checkVersionRecommended(imageVersion) ? ' (Recommended)' : ''}`, }); -export const isImageStreamSelectOptionObject = ( - object: unknown, -): object is ImageStreamSelectOptionObjectType => - (object as ImageStreamSelectOptionObjectType).imageStream !== undefined; export const isImageVersionSelectOptionObject = ( object: unknown, ): object is ImageVersionSelectOptionObjectType => (object as ImageVersionSelectOptionObjectType).imageVersion !== undefined; - /******************* Compare utils for sorting *******************/ const getBuildNumber = (build: BuildKind): number => { const buildNumber = build.metadata.annotations?.['openshift.io/build.number'] || '-1'; @@ -141,6 +134,37 @@ export const getImageStreamDescription = (imageStream: ImageStreamKind): string export const getImageSteamOrder = (imageStream: ImageStreamKind): number => parseInt(imageStream.metadata.annotations?.[IMAGE_ANNOTATIONS.IMAGE_ORDER] || '100'); +export const getCompatibleAcceleratorIdentifiers = ( + object: ImageStreamKind | K8sDSGResource, +): string[] => { + try { + const annotation = object.metadata.annotations?.['opendatahub.io/recommended-accelerators']; + // in the format of ["foo.com/gpu", "bar.com/gpu"] + if (annotation) { + const identifiers = JSON.parse(annotation); + if (Array.isArray(identifiers)) { + return identifiers; + } + } + } catch (error) { + // catch invalid json in metadata + } + return []; +}; + +export const isCompatibleWithAccelerator = ( + acceleratorIdentifier?: string, + obj?: ImageStreamKind | K8sDSGResource, +) => { + if (!obj || !acceleratorIdentifier) { + return false; + } + + return getCompatibleAcceleratorIdentifiers(obj).some( + (accelerator) => accelerator === acceleratorIdentifier, + ); +}; + /** * Parse annotation software field or dependencies field from long string to array */ diff --git a/frontend/src/pages/projects/screens/spawner/types.ts b/frontend/src/pages/projects/screens/spawner/types.ts index 9f4f8c6bc3..3aec64e612 100644 --- a/frontend/src/pages/projects/screens/spawner/types.ts +++ b/frontend/src/pages/projects/screens/spawner/types.ts @@ -1,4 +1,4 @@ -import { BUILD_PHASE, ImageStreamKind, ImageStreamSpecTagType } from '~/k8sTypes'; +import { AcceleratorKind, BUILD_PHASE, ImageStreamKind, ImageStreamSpecTagType } from '~/k8sTypes'; export enum SpawnerPageSectionID { NAME_DESCRIPTION = 'name-and-description', @@ -40,3 +40,8 @@ export type ImageVersionSelectDataType = { imageStream?: ImageStreamKind; imageVersions: ImageStreamSpecTagType[]; }; + +export type AcceleratorSelectOptionObjectType = { + accelerator: AcceleratorKind; + toString: () => string; +}; diff --git a/frontend/src/pages/projects/types.ts b/frontend/src/pages/projects/types.ts index 89de7934e3..abcf01cd16 100644 --- a/frontend/src/pages/projects/types.ts +++ b/frontend/src/pages/projects/types.ts @@ -8,6 +8,7 @@ import { import { ValueOf } from '~/typeHelpers'; import { AWSSecretKind } from '~/k8sTypes'; import { AWS_KEYS } from './dataConnections/const'; +import { AcceleratorState } from './screens/detail/notebooks/useNotebookAccelerator'; export type UpdateObjectAtPropAndValue = (propKey: keyof T, propValue: ValueOf) => void; @@ -60,7 +61,7 @@ export type StartNotebookData = { projectName: string; notebookName: string; notebookSize: NotebookSize; - gpus: number; + accelerator: AcceleratorState; image: ImageStreamAndVersion; volumes?: Volume[]; volumeMounts?: VolumeMount[]; diff --git a/frontend/src/services/acceleratorService.ts b/frontend/src/services/acceleratorService.ts new file mode 100644 index 0000000000..7e7908dbb9 --- /dev/null +++ b/frontend/src/services/acceleratorService.ts @@ -0,0 +1,12 @@ +import axios from 'axios'; +import { AcceleratorInfo } from '~/types'; + +export const getAcceleratorCounts = (): Promise => { + const url = '/api/accelerators'; + return axios + .get(url) + .then((response) => response.data) + .catch((e) => { + throw new Error(e.response.data.message); + }); +}; diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 4cfc66ad41..4f3bddcb5e 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -6,6 +6,7 @@ import { ServingRuntimeSize } from '~/pages/modelServing/screens/types'; import { EnvironmentFromVariable } from '~/pages/projects/types'; import { ImageStreamKind, ImageStreamSpecTagType } from './k8sTypes'; import { EitherNotBoth } from './typeHelpers'; +import { AcceleratorState } from './pages/projects/screens/detail/notebooks/useNotebookAccelerator'; export type PrometheusQueryResponse = { data: { @@ -103,24 +104,22 @@ export type NotebookControllerUserState = { * OdhDashboardConfig contains gpuSetting as a string value override -- proper gpus return as numbers * TODO: Look to make it just number by properly parsing the value */ -export type GPUCount = string | number; export enum ContainerResourceAttributes { CPU = 'cpu', MEMORY = 'memory', - NVIDIA_GPU = 'nvidia.com/gpu', } export type ContainerResources = { requests?: { + [key: string]: number | string | undefined; cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; limits?: { + [key: string]: number | string | undefined; cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; }; @@ -329,7 +328,8 @@ export type TrackingEventProperties = { anonymousID?: string; type?: string; term?: string; - GPU?: GPUCount; + accelerator?: string; + acceleratorCount?: number; lastSelectedSize?: string; lastSelectedImage?: string; projectName?: string; @@ -344,9 +344,11 @@ export type NotebookPort = { }; export type PodToleration = { - effect: string; key: string; - operator: string; + operator?: string; + value?: string; + effect?: string; + tolerationSeconds?: number; }; export type NotebookContainer = { @@ -376,6 +378,7 @@ export type Notebook = K8sResourceCommon & { 'opendatahub.io/username': string; // the untranslated username behind the notebook 'notebooks.opendatahub.io/last-image-selection': string; // the last image they selected 'notebooks.opendatahub.io/last-size-selection': string; // the last notebook size they selected + 'opendatahub.io/accelerator-name': string | undefined; }>; labels: Partial<{ 'opendatahub.io/user': string; // translated username -- see translateUsername @@ -702,7 +705,7 @@ export type NotebookData = { notebookSizeName: string; imageName: string; imageTagName: string; - gpus: number; + accelerator: AcceleratorState; envVars: EnvVarReducedTypeKeyValues; state: NotebookState; // only used for admin calls, regular users cannot use this field @@ -737,3 +740,10 @@ export type ContextResourceData = { export type BreadcrumbItemType = { label: string; } & EitherNotBoth<{ link: string }, { isActive: boolean }>; + +export type AcceleratorInfo = { + configured: boolean; + available: { [key: string]: number }; + total: { [key: string]: number }; + allocated: { [key: string]: number }; +}; diff --git a/frontend/src/utilities/imageUtils.ts b/frontend/src/utilities/imageUtils.ts index 5870c3ac83..1177272d2e 100644 --- a/frontend/src/utilities/imageUtils.ts +++ b/frontend/src/utilities/imageUtils.ts @@ -2,13 +2,11 @@ import compareVersions from 'compare-versions'; import { BuildStatus, BUILD_PHASE, - GPUCount, ImageInfo, ImageSoftwareType, ImageTag, ImageTagInfo, NotebookContainer, - ContainerResourceAttributes, } from '~/types'; const PENDING_PHASES = [ @@ -72,9 +70,6 @@ export const getVersion = (version?: string, prefix?: string): string => { export const getNameVersionString = (software: ImageSoftwareType): string => `${software.name}${getVersion(software.version, ' v')}`; -export const getNumGpus = (container?: NotebookContainer): GPUCount => - container?.resources?.limits?.[ContainerResourceAttributes.NVIDIA_GPU] || 0; - export const getDefaultTag = ( buildStatuses: BuildStatus[], image: ImageInfo, diff --git a/frontend/src/utilities/tolerations.ts b/frontend/src/utilities/tolerations.ts index aa3e17480d..f381e9fd5a 100644 --- a/frontend/src/utilities/tolerations.ts +++ b/frontend/src/utilities/tolerations.ts @@ -1,6 +1,6 @@ import { Patch } from '@openshift/dynamic-plugin-sdk-utils'; import { DashboardConfig, PodToleration, TolerationSettings } from '~/types'; -import { NotebookKind } from '~/k8sTypes'; +import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; export type TolerationChanges = { type: 'add' | 'remove' | 'replace' | 'nothing'; @@ -8,17 +8,13 @@ export type TolerationChanges = { }; export const determineTolerations = ( - hasGpu: boolean, tolerationSettings?: TolerationSettings, + accelerator?: AcceleratorKind, ): PodToleration[] => { const tolerations: PodToleration[] = []; - if (hasGpu) { - tolerations.push({ - effect: 'NoSchedule', - key: 'nvidia.com/gpu', - operator: 'Exists', - }); + if (accelerator?.spec.tolerations) { + tolerations.push(...accelerator.spec.tolerations); } if (tolerationSettings?.enabled) { tolerations.push({ @@ -35,15 +31,9 @@ export const computeNotebooksTolerations = ( dashboardConfig: DashboardConfig, notebook: NotebookKind, ): TolerationChanges => { - const hasGPU = !!notebook.spec.template.spec.containers.find( - (container) => - !!container.resources?.limits?.['nvidia.com/gpu'] || - !!container.resources?.requests?.['nvidia.com/gpu'], - ); const tolerations = notebook.spec.template.spec.tolerations || []; const settings = determineTolerations( - hasGPU, dashboardConfig.spec.notebookController?.notebookTolerationSettings, ); diff --git a/frontend/src/utilities/useGenericObjectState.ts b/frontend/src/utilities/useGenericObjectState.ts index 0f535c5d15..ae2808256d 100644 --- a/frontend/src/utilities/useGenericObjectState.ts +++ b/frontend/src/utilities/useGenericObjectState.ts @@ -1,7 +1,7 @@ import * as React from 'react'; import { UpdateObjectAtPropAndValue } from '~/pages/projects/types'; -type GenericObjectState = [ +export type GenericObjectState = [ data: T, setData: UpdateObjectAtPropAndValue, resetDefault: () => void, diff --git a/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml b/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml deleted file mode 100644 index 3d34c5830b..0000000000 --- a/manifests/crd/acceleratorprofile.opendatahub.io.crd.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: acceleratorprofiles.opendatahub.io -spec: - group: opendatahub.io - scope: Namespaced - names: - plural: acceleratorprofiles - singular: acceleratorprofile - kind: AcceleratorProfile - versions: - - name: v1alpha - served: true - storage: true - schema: - openAPIV3Schema: - type: object - required: - - spec - properties: - spec: - type: object - required: - - display-name - - enabled - - identifier - properties: - display-name: - type: string - enabled: - type: boolean - identifier: - type: string - description: - type: string - modifiedDate: - type: string - \ No newline at end of file diff --git a/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml index 3d34c5830b..0b429de43d 100644 --- a/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml +++ b/manifests/crd/acceleratorprofiles.opendatahub.io.crd.yaml @@ -1,9 +1,9 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: - name: acceleratorprofiles.opendatahub.io + name: acceleratorprofiles.dashboard.opendatahub.io spec: - group: opendatahub.io + group: dashboard.opendatahub.io scope: Namespaced names: plural: acceleratorprofiles @@ -22,11 +22,11 @@ spec: spec: type: object required: - - display-name + - displayName - enabled - identifier properties: - display-name: + displayName: type: string enabled: type: boolean @@ -34,6 +34,21 @@ spec: type: string description: type: string - modifiedDate: - type: string + tolerations: + type: array + items: + type: object + required: + - key + properties: + key: + type: string + operator: + type: string + value: + type: string + effect: + type: string + tolerationSeconds: + type: integer \ No newline at end of file From 34a2f1ce0f6300cf355cc89e1a0842ba2364a1c1 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Mon, 7 Aug 2023 09:56:09 -0500 Subject: [PATCH 10/22] sqush --- .../api/accelerators/acceleratorUtils.ts | 46 +++++++++++++++++++ backend/src/routes/api/accelerators/index.ts | 11 +++++ backend/src/routes/api/gpu/gpuUtils.ts | 2 +- backend/src/routes/api/gpu/index.ts | 3 ++ backend/src/types.ts | 8 ++++ 5 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 backend/src/routes/api/accelerators/acceleratorUtils.ts create mode 100644 backend/src/routes/api/accelerators/index.ts diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts new file mode 100644 index 0000000000..09ae2ddba6 --- /dev/null +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -0,0 +1,46 @@ +import { AcceleratorInfo, KubeFastifyInstance } from "../../../types" + +const RESOURCE_TYPES = ["cpu", "memory", "pods", "ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "attachable-volumes-aws-ebs"] + +const getIdentifiersFromResources = (resources: {[key: string]: string} = {}) => { + return Object.entries(resources) + .filter(([key,]) => !RESOURCE_TYPES.includes(key)) + .reduce<{[key: string]: number}>((identifiers, [key, value]) => { + identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value) + return identifiers + }, {}) +} + +export const getAcceleratorNumbers = async (fastify: KubeFastifyInstance): Promise => ( + fastify.kube.coreV1Api.listNode() + .then((res) => res.body.items.reduce((info, node) => { + // reduce resources down to just the accelerators and their counts + const allocatable = getIdentifiersFromResources(node.status.allocatable) + const capacity = getIdentifiersFromResources(node.status.capacity) + + // update the max count for each accelerator + Object.entries(allocatable).forEach(([key, value]) => ( + info.available[key] = Math.max((info.available[key] || 0), value) + )) + + // update the total count for each accelerator + Object.entries(capacity).forEach(([key, value]) => ( + info.total[key] = (info.total[key] || 0) + value + )) + + + // update the allocated count for each accelerator + Object.entries(capacity).forEach(([key, value]) => ( + info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0) + )) + + // if any accelerators are available, the cluster is configured + const configured = info.configured || Object.values(info.available).some((value) => value > 0) + + return {total: info.total, available: info.available, allocated: info.allocated, configured} + }, {configured: false, available: {}, total: {}, allocated: {}})) + .catch((e) => { + fastify.log.error(`Exception when listing cluster nodes: ${e}`); + return {configured: false, available: {}, total: {}, allocated: {}} + }) +) diff --git a/backend/src/routes/api/accelerators/index.ts b/backend/src/routes/api/accelerators/index.ts new file mode 100644 index 0000000000..16d651ad6d --- /dev/null +++ b/backend/src/routes/api/accelerators/index.ts @@ -0,0 +1,11 @@ +import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types'; +import { getAcceleratorNumbers } from './acceleratorUtils'; +import { logRequestDetails } from '../../../utils/fileUtils'; + +export default async (fastify: KubeFastifyInstance): Promise => { + fastify.get('/', async (request: OauthFastifyRequest) => { + logRequestDetails(fastify, request); + + return getAcceleratorNumbers(fastify); + }); +}; diff --git a/backend/src/routes/api/gpu/gpuUtils.ts b/backend/src/routes/api/gpu/gpuUtils.ts index 9755a21d54..c3726436fd 100644 --- a/backend/src/routes/api/gpu/gpuUtils.ts +++ b/backend/src/routes/api/gpu/gpuUtils.ts @@ -17,7 +17,7 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = { }; /** - * @deprecated + * @deprecated - use getAcceleratorNumbers instead */ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise => { if (storage.lastFetch >= Date.now() - 30_000) { diff --git a/backend/src/routes/api/gpu/index.ts b/backend/src/routes/api/gpu/index.ts index 5d91bb04c3..dc7068851c 100644 --- a/backend/src/routes/api/gpu/index.ts +++ b/backend/src/routes/api/gpu/index.ts @@ -2,6 +2,9 @@ import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types'; import { getGPUNumber } from './gpuUtils'; import { logRequestDetails } from '../../../utils/fileUtils'; +/** + * @deprecated - use accelerators instead + */ export default async (fastify: KubeFastifyInstance): Promise => { fastify.get('/', async (request: OauthFastifyRequest) => { logRequestDetails(fastify, request); diff --git a/backend/src/types.ts b/backend/src/types.ts index 3c85ad8afa..57ae0bd50e 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -748,6 +748,14 @@ export type GPUInfo = { available: number; autoscalers: gpuScale[]; }; + +export type AcceleratorInfo = { + configured: boolean; + available: {[key: string]: number}; + total: {[key: string]: number}; + allocated: {[key: string]: number}; +} + export type EnvironmentVariable = EitherNotBoth< { value: string | number }, { valueFrom: Record } From 3694f9e7270a0574396d0aac4207d76bc7376200 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Mon, 7 Aug 2023 09:58:15 -0500 Subject: [PATCH 11/22] Squashed commit of the following: commit 26da28943967a71c31b53418866eda37041861ae Author: Gage Krumbach Date: Tue Aug 1 16:40:25 2023 -0500 fix error state in migration commit 391cbca5d1a807281cef3bd5d6f5269570cc6011 Author: Gage Krumbach Date: Tue Aug 1 15:09:25 2023 -0500 added accelerator detection line commit 50839ac5ff344f2760655f3604bfb5774ecb017b Author: Gage Krumbach Date: Thu Jul 27 13:52:24 2023 -0500 added gpu migration --- backend/src/plugins/kube.ts | 13 ++-- backend/src/types.ts | 16 +++++ backend/src/utils/resourceUtils.ts | 108 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/backend/src/plugins/kube.ts b/backend/src/plugins/kube.ts index ee1d78c87f..232651aa5f 100644 --- a/backend/src/plugins/kube.ts +++ b/backend/src/plugins/kube.ts @@ -4,7 +4,7 @@ import { FastifyInstance } from 'fastify'; import * as jsYaml from 'js-yaml'; import * as k8s from '@kubernetes/client-node'; import { DEV_MODE } from '../utils/constants'; -import { cleanupDSPSuffix, initializeWatchedResources } from '../utils/resourceUtils'; +import { cleanupGPU, initializeWatchedResources } from '../utils/resourceUtils'; import { User } from '@kubernetes/client-node/dist/config_types'; const CONSOLE_CONFIG_YAML_FIELD = 'console-config.yaml'; @@ -82,17 +82,16 @@ export default fp(async (fastify: FastifyInstance) => { // Initialize the watching of resources initializeWatchedResources(fastify); - // TODO: Delete this code in the future once we have no customers using RHODS 1.19 / ODH 2.4.0 - // Cleanup for display name suffix of [DSP] - cleanupDSPSuffix(fastify).catch((e) => + cleanupGPU(fastify).catch((e) => fastify.log.error( - `Unable to fully cleanup project display name suffixes - Some projects may not appear in the dashboard UI. ${ + `Unable to fully convert GPU to use accelerator profiles. ${ e.response?.body?.message || e.message }`, - ), - ); + ) + ) }); + const getCurrentNamespace = async () => { return new Promise((resolve, reject) => { if (currentContext === 'inClusterContext') { diff --git a/backend/src/types.ts b/backend/src/types.ts index 57ae0bd50e..fecdc53fff 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -901,4 +901,20 @@ export type ServingRuntime = K8sResourceCommon & { supportedModelFormats: SupportedModelFormats[]; replicas: number; }; +}; + +export type AcceleratorKind = K8sResourceCommon & { + metadata: { + name: string; + annotations?: Partial<{ + 'opendatahub.io/modified-date': string; + }>; + }; + spec: { + displayName: string; + enabled: boolean; + identifier: string; + description?: string; + tolerations?: NotebookToleration[]; + }; }; \ No newline at end of file diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index ee59a38b2e..b5d83ac450 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -2,6 +2,7 @@ import * as _ from 'lodash'; import createError from 'http-errors'; import { PatchUtils, V1ConfigMap, V1Namespace, V1NamespaceList } from '@kubernetes/client-node'; import { + AcceleratorKind, BUILD_PHASE, BuildKind, BuildStatus, @@ -31,6 +32,7 @@ import { getRouteForClusterId, } from './componentUtils'; import { createCustomError } from './requestUtils'; +import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils'; const dashboardConfigMapName = 'odh-dashboard-config'; const consoleLinksGroup = 'console.openshift.io'; @@ -606,6 +608,112 @@ export const getConsoleLinks = (): ConsoleLinkKind[] => { return consoleLinksWatcher.getResources(); }; +/** + * Converts GPU usage to use accelerator by adding an accelerator profile CRD to the cluster if GPU usage is detected + */ +export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => { + // When we startup — in kube.ts we can handle a migration (catch ALL promise errors — exit gracefully and use fastify logging) + // Check for migration-gpu-status configmap in dashboard namespace — if found, exit early + const CONFIG_MAP_NAME = 'migration-gpu-status'; + + const continueProcessing = await fastify.kube.coreV1Api + .readNamespacedConfigMap(CONFIG_MAP_NAME, fastify.kube.namespace) + .then(() => { + // Found configmap, not continuing + return false; + }) + .catch((e) => { + if (e.statusCode === 404) { + // No config saying we have already migrated gpus, continue + return true; + } + }); + + + if (continueProcessing) { + // Read existing AcceleratorProfiles + const acceleratorProfilesResponse = await fastify.kube.customObjectsApi + .listNamespacedCustomObject( + 'dashboard.opendatahub.io', + 'v1alpha', + fastify.kube.namespace, + 'acceleratorprofiles' + ).catch((e) => { + // If 404 shows up — CRD may not be installed, exit early + throw 'Unable to fetch accelerator profiles: ' + e.toString() + }); + + const acceleratorProfiles = ( + acceleratorProfilesResponse?.body as { + items: AcceleratorKind[] + } + )?.items; + + // If not error and no profiles detected: + if (acceleratorProfiles && Array.isArray(acceleratorProfiles) && acceleratorProfiles.length === 0) { + // if gpu detected on cluster, create our default migrated-gpu + // TODO GPU detection + const acceleratorDetected = await getAcceleratorNumbers(fastify) + + if (acceleratorDetected.configured) { + const payload: AcceleratorKind = { + kind: 'AcceleratorProfile', + apiVersion: 'dashboard.opendatahub.io/v1alpha', + metadata: { + name: 'migrated-gpu', + namespace: fastify.kube.namespace, + }, + spec: { + displayName: 'Nvidia GPU', + identifier: 'nvidia.com/gpu', + enabled: true, + tolerations: [ + { + effect: 'NoSchedule', + key: 'nvidia.com/gpu', + operator: 'Exists', + } + ] + }, + }; + + try { + await await fastify.kube.customObjectsApi.createNamespacedCustomObject( + 'dashboard.opendatahub.io', + 'v1alpha', + fastify.kube.namespace, + 'acceleratorprofiles', + payload + ) + } catch (e) { + // If bad detection — exit early and dont create config + throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString() + } + }; + } + + // Create configmap to flag operation as successful + const configMap = { + metadata: { + name: CONFIG_MAP_NAME, + namespace: fastify.kube.namespace, + }, + data: { + migratedCompleted: 'true', + }, + } + + await fastify.kube.coreV1Api + .createNamespacedConfigMap(fastify.kube.namespace, configMap) + .then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles')) + .catch((e) => { + throw createCustomError( + 'Unable to create gpu migration configmap', + e.response?.body?.message || e.message, + ); + }); + } +} /** * @deprecated - Look to remove asap (see comments below) * Converts namespaces that have a display-name annotation suffixed with `[DSP]` over to using a label. From e5717c3be302dbbe19fb87309e09c2bdddb233d9 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 8 Aug 2023 12:36:05 -0500 Subject: [PATCH 12/22] bug fixes --- backend/src/types.ts | 25 +++++++-- backend/src/utils/constants.ts | 4 +- backend/src/utils/notebookUtils.ts | 52 +++++++++---------- backend/src/utils/resourceUtils.ts | 32 +++++++++++- frontend/src/api/k8s/utils.ts | 2 +- .../screens/server/AcceleratorSelectField.tsx | 4 +- .../projects/screens/spawner/spawnerUtils.ts | 4 +- 7 files changed, 81 insertions(+), 42 deletions(-) diff --git a/backend/src/types.ts b/backend/src/types.ts index fecdc53fff..1674dccf37 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -247,6 +247,7 @@ export type KubeDecorator = KubeStatus & { customObjectsApi: k8s.CustomObjectsApi; rbac: k8s.RbacAuthorizationV1Api; currentToken: string; + }; export type KubeFastifyInstance = FastifyInstance & { @@ -806,12 +807,17 @@ export type NotebookData = { notebookSizeName: string; imageName: string; imageTagName: string; - gpus: number; + accelerator: AcceleratorState; envVars: EnvVarReducedTypeKeyValues; state: NotebookState; username?: string; }; +export type AcceleratorState = { + accelerator?: AcceleratorKind; + count: number; +}; + export const LIMIT_NOTEBOOK_IMAGE_GPU = 'nvidia.com/gpu'; type DisplayNameAnnotations = Partial<{ @@ -863,18 +869,20 @@ export type SupportedModelFormats = { autoSelect?: boolean; }; -export type GPUCount = string | number; + +export enum ContainerResourceAttributes { + CPU = 'cpu', + MEMORY = 'memory', +} export type ContainerResources = { requests?: { cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; limits?: { cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; }; @@ -917,4 +925,11 @@ export type AcceleratorKind = K8sResourceCommon & { description?: string; tolerations?: NotebookToleration[]; }; -}; \ No newline at end of file +}; + +export enum KnownLabels { + DASHBOARD_RESOURCE = 'opendatahub.io/dashboard', + PROJECT_SHARING = 'opendatahub.io/project-sharing', + MODEL_SERVING_PROJECT = 'modelmesh-enabled', + DATA_CONNECTION_AWS = 'opendatahub.io/managed', +} \ No newline at end of file diff --git a/backend/src/utils/constants.ts b/backend/src/utils/constants.ts index 2ac758bde0..b69bae89d2 100644 --- a/backend/src/utils/constants.ts +++ b/backend/src/utils/constants.ts @@ -1,6 +1,6 @@ import * as path from 'path'; import './dotenv'; -import { DashboardConfig, NotebookSize } from '../types'; +import { DashboardConfig, KnownLabels, NotebookSize } from '../types'; export const PORT = Number(process.env.PORT) || Number(process.env.BACKEND_PORT) || 8080; export const IP = process.env.IP || '0.0.0.0'; @@ -133,3 +133,5 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [ export const imageUrlRegex = /^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/; + + export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`; diff --git a/backend/src/utils/notebookUtils.ts b/backend/src/utils/notebookUtils.ts index 32512311bd..f87bceddf9 100644 --- a/backend/src/utils/notebookUtils.ts +++ b/backend/src/utils/notebookUtils.ts @@ -1,10 +1,10 @@ import { getDashboardConfig } from './resourceUtils'; import { + ContainerResourceAttributes, EnvironmentVariable, ImageInfo, ImageTag, KubeFastifyInstance, - LIMIT_NOTEBOOK_IMAGE_GPU, Notebook, NotebookAffinity, NotebookData, @@ -156,7 +156,7 @@ export const assembleNotebook = async ( envName: string, tolerationSettings: NotebookTolerationSettings, ): Promise => { - const { notebookSizeName, imageName, imageTagName, gpus, envVars } = data; + const { notebookSizeName, imageName, imageTagName, accelerator, envVars } = data; const notebookSize = getNotebookSize(notebookSizeName); @@ -186,39 +186,34 @@ export const assembleNotebook = async ( const tolerations: NotebookToleration[] = []; let affinity: NotebookAffinity = {}; - if (gpus > 0) { + if (accelerator.count > 0 && accelerator.accelerator) { if (!resources.limits) { resources.limits = {}; } if (!resources.requests) { resources.requests = {}; } - resources.limits[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus; - resources.requests[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus; - tolerations.push({ - effect: 'NoSchedule', - key: LIMIT_NOTEBOOK_IMAGE_GPU, - operator: 'Exists', - }); + resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; + resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { - affinity = { - nodeAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - { - preference: { - matchExpressions: [ - { - key: 'nvidia.com/gpu.present', - operator: 'NotIn', - values: ['true'], - }, - ], - }, - weight: 1, - }, - ], - }, - }; + // step type down to string to avoid type errors + const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes); + + Object.keys(resources.limits || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.limits?.[key]; + } + }); + + Object.keys(resources.requests || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.requests?.[key]; + } + }); + } + + if (accelerator.accelerator?.spec.tolerations) { + tolerations.push(...accelerator.accelerator.spec.tolerations); } if (tolerationSettings?.enabled) { @@ -266,6 +261,7 @@ export const assembleNotebook = async ( 'notebooks.opendatahub.io/last-image-selection': imageSelection, 'opendatahub.io/username': username, 'kubeflow-resource-stopped': null, + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }, name: name, namespace: namespace, diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index b5d83ac450..40101f1e82 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -33,6 +33,7 @@ import { } from './componentUtils'; import { createCustomError } from './requestUtils'; import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils'; +import { getNotebooks } from './notebookUtils'; const dashboardConfigMapName = 'odh-dashboard-config'; const consoleLinksGroup = 'console.openshift.io'; @@ -678,7 +679,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => }; try { - await await fastify.kube.customObjectsApi.createNamespacedCustomObject( + await fastify.kube.customObjectsApi.createNamespacedCustomObject( 'dashboard.opendatahub.io', 'v1alpha', fastify.kube.namespace, @@ -688,7 +689,34 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => } catch (e) { // If bad detection — exit early and dont create config throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString() - } + } + + // update already running notebooks to use the new profile + const notebooks = await getNotebooks(fastify, fastify.kube.namespace) + notebooks.items.forEach(async (notebook) => { + const gpuCount = notebook.spec.template.spec.containers[0].resources?.limits?.['nvidia.com/gpu'] + if (gpuCount) { + notebook.metadata.annotations = { + ...notebook.metadata.annotations, + 'opendatahub.io/recommended-accelerators' : 'migrated-gpu' + } + await fastify.kube.customObjectsApi.patchNamespacedCustomObject( + 'kubeflow.org', + 'v1', + fastify.kube.namespace, + 'notebooks', + notebook.metadata.name, + notebook, + undefined, + undefined, + undefined, + { + headers: { 'Content-type': PatchUtils.PATCH_FORMAT_JSON_MERGE_PATCH }, + }, + ) + } + } + ) }; } diff --git a/frontend/src/api/k8s/utils.ts b/frontend/src/api/k8s/utils.ts index 883df66e5c..0b9498749f 100644 --- a/frontend/src/api/k8s/utils.ts +++ b/frontend/src/api/k8s/utils.ts @@ -31,7 +31,7 @@ export const assemblePodSpecOptions = ( resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { // step type down to string to avoid type errors - const containerResourceKeys: string[] = Object.keys(ContainerResourceAttributes); + const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes); Object.keys(resources.limits || {}).forEach((key) => { if (!containerResourceKeys.includes(key)) { diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx index dcb0d45f62..23b3c2c4cf 100644 --- a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -61,9 +61,7 @@ const AcceleratorSelectField: React.FC = ({ ); React.useEffect(() => { - if (acceleratorCount > 0) { - setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); - } + setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); }, [acceleratorCount, validateAcceleratorCount]); const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState( diff --git a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts index 1fcef275b0..57f3bbc048 100644 --- a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts +++ b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts @@ -1,6 +1,6 @@ import * as React from 'react'; import compareVersions from 'compare-versions'; -import { BYONImage, K8sResourceCommon, NotebookSize, Volume, VolumeMount } from '~/types'; +import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types'; import { BuildKind, ImageStreamKind, @@ -414,7 +414,7 @@ export const isInvalidBYONImageStream = (imageStream: ImageStreamKind) => { ); }; -export const convertBYONImageToK8sResource = (image: BYONImage): K8sResourceCommon => ({ +export const convertBYONImageToK8sResource = (image: BYONImage) => ({ kind: 'ImageStream', apiVersion: 'image.openshift.io/v1', metadata: { From 607fe2678447958b5672ec3f73b1163f526b0c1a Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 10 Aug 2023 07:22:29 -0500 Subject: [PATCH 13/22] update wording --- .../screens/server/AcceleratorSelectField.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx index 23b3c2c4cf..003dc0412c 100644 --- a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -48,7 +48,7 @@ const AcceleratorSelectField: React.FC = ({ )?.[1]; if (detectedAcceleratorCount === undefined) { - return `No accelerator detected with the identifier ${accelerator?.spec.identifier} detected.`; + return `No accelerator detected with the identifier ${accelerator?.spec.identifier}.`; } else if (newSize > detectedAcceleratorCount) { return `Only ${detectedAcceleratorCount} accelerator${ detectedAcceleratorCount > 1 ? 's' : '' From c8f276737629d8ae0e2257055cb75a2b459c04ee Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 10 Aug 2023 13:25:10 -0500 Subject: [PATCH 14/22] fix lint errors --- backend/src/plugins/kube.ts | 5 +- .../api/accelerators/acceleratorUtils.ts | 113 +++++++++++------- backend/src/utils/constants.ts | 2 +- backend/src/utils/notebookUtils.ts | 2 +- backend/src/utils/resourceUtils.ts | 64 +++++----- 5 files changed, 105 insertions(+), 81 deletions(-) diff --git a/backend/src/plugins/kube.ts b/backend/src/plugins/kube.ts index 232651aa5f..1a16523676 100644 --- a/backend/src/plugins/kube.ts +++ b/backend/src/plugins/kube.ts @@ -87,11 +87,10 @@ export default fp(async (fastify: FastifyInstance) => { `Unable to fully convert GPU to use accelerator profiles. ${ e.response?.body?.message || e.message }`, - ) - ) + ), + ); }); - const getCurrentNamespace = async () => { return new Promise((resolve, reject) => { if (currentContext === 'inClusterContext') { diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts index 09ae2ddba6..2988450f93 100644 --- a/backend/src/routes/api/accelerators/acceleratorUtils.ts +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -1,46 +1,67 @@ -import { AcceleratorInfo, KubeFastifyInstance } from "../../../types" - -const RESOURCE_TYPES = ["cpu", "memory", "pods", "ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "attachable-volumes-aws-ebs"] - -const getIdentifiersFromResources = (resources: {[key: string]: string} = {}) => { - return Object.entries(resources) - .filter(([key,]) => !RESOURCE_TYPES.includes(key)) - .reduce<{[key: string]: number}>((identifiers, [key, value]) => { - identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value) - return identifiers - }, {}) -} - -export const getAcceleratorNumbers = async (fastify: KubeFastifyInstance): Promise => ( - fastify.kube.coreV1Api.listNode() - .then((res) => res.body.items.reduce((info, node) => { - // reduce resources down to just the accelerators and their counts - const allocatable = getIdentifiersFromResources(node.status.allocatable) - const capacity = getIdentifiersFromResources(node.status.capacity) - - // update the max count for each accelerator - Object.entries(allocatable).forEach(([key, value]) => ( - info.available[key] = Math.max((info.available[key] || 0), value) - )) - - // update the total count for each accelerator - Object.entries(capacity).forEach(([key, value]) => ( - info.total[key] = (info.total[key] || 0) + value - )) - - - // update the allocated count for each accelerator - Object.entries(capacity).forEach(([key, value]) => ( - info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0) - )) - - // if any accelerators are available, the cluster is configured - const configured = info.configured || Object.values(info.available).some((value) => value > 0) - - return {total: info.total, available: info.available, allocated: info.allocated, configured} - }, {configured: false, available: {}, total: {}, allocated: {}})) - .catch((e) => { - fastify.log.error(`Exception when listing cluster nodes: ${e}`); - return {configured: false, available: {}, total: {}, allocated: {}} - }) -) +import { AcceleratorInfo, KubeFastifyInstance } from '../../../types'; + +const RESOURCE_TYPES = [ + 'cpu', + 'memory', + 'pods', + 'ephemeral-storage', + 'hugepages-1Gi', + 'hugepages-2Mi', + 'attachable-volumes-aws-ebs', +]; + +const getIdentifiersFromResources = (resources: { [key: string]: string } = {}) => { + return Object.entries(resources) + .filter(([key]) => !RESOURCE_TYPES.includes(key)) + .reduce<{ [key: string]: number }>((identifiers, [key, value]) => { + identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value); + return identifiers; + }, {}); +}; + +export const getAcceleratorNumbers = async ( + fastify: KubeFastifyInstance, +): Promise => + fastify.kube.coreV1Api + .listNode() + .then((res) => + res.body.items.reduce( + (info, node) => { + // reduce resources down to just the accelerators and their counts + const allocatable = getIdentifiersFromResources(node.status.allocatable); + const capacity = getIdentifiersFromResources(node.status.capacity); + + // update the max count for each accelerator + Object.entries(allocatable).forEach( + ([key, value]) => (info.available[key] = Math.max(info.available[key] || 0, value)), + ); + + // update the total count for each accelerator + Object.entries(capacity).forEach( + ([key, value]) => (info.total[key] = (info.total[key] || 0) + value), + ); + + // update the allocated count for each accelerator + Object.entries(capacity).forEach( + ([key, value]) => + (info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0)), + ); + + // if any accelerators are available, the cluster is configured + const configured = + info.configured || Object.values(info.available).some((value) => value > 0); + + return { + total: info.total, + available: info.available, + allocated: info.allocated, + configured, + }; + }, + { configured: false, available: {}, total: {}, allocated: {} }, + ), + ) + .catch((e) => { + fastify.log.error(`Exception when listing cluster nodes: ${e}`); + return { configured: false, available: {}, total: {}, allocated: {} }; + }); diff --git a/backend/src/utils/constants.ts b/backend/src/utils/constants.ts index a129a3cce4..1699c0eac1 100644 --- a/backend/src/utils/constants.ts +++ b/backend/src/utils/constants.ts @@ -135,4 +135,4 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [ export const imageUrlRegex = /^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/; - export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`; +export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`; diff --git a/backend/src/utils/notebookUtils.ts b/backend/src/utils/notebookUtils.ts index 0aad05dd40..af3c9e2703 100644 --- a/backend/src/utils/notebookUtils.ts +++ b/backend/src/utils/notebookUtils.ts @@ -191,7 +191,7 @@ export const assembleNotebook = async ( const resources: NotebookResources = { ...notebookSize.resources }; const tolerations: NotebookToleration[] = []; - let affinity: NotebookAffinity = {}; + const affinity: NotebookAffinity = {}; if (accelerator.count > 0 && accelerator.accelerator) { if (!resources.limits) { resources.limits = {}; diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index e887b6cf56..308597fdf7 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -654,7 +654,6 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => return true; } }); - if (continueProcessing) { // Read existing AcceleratorProfiles @@ -663,23 +662,28 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => 'dashboard.opendatahub.io', 'v1alpha', fastify.kube.namespace, - 'acceleratorprofiles' - ).catch((e) => { + 'acceleratorprofiles', + ) + .catch((e) => { // If 404 shows up — CRD may not be installed, exit early - throw 'Unable to fetch accelerator profiles: ' + e.toString() + throw 'Unable to fetch accelerator profiles: ' + e.toString(); }); const acceleratorProfiles = ( acceleratorProfilesResponse?.body as { - items: AcceleratorKind[] + items: AcceleratorKind[]; } )?.items; // If not error and no profiles detected: - if (acceleratorProfiles && Array.isArray(acceleratorProfiles) && acceleratorProfiles.length === 0) { + if ( + acceleratorProfiles && + Array.isArray(acceleratorProfiles) && + acceleratorProfiles.length === 0 + ) { // if gpu detected on cluster, create our default migrated-gpu // TODO GPU detection - const acceleratorDetected = await getAcceleratorNumbers(fastify) + const acceleratorDetected = await getAcceleratorNumbers(fastify); if (acceleratorDetected.configured) { const payload: AcceleratorKind = { @@ -698,8 +702,8 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => effect: 'NoSchedule', key: 'nvidia.com/gpu', operator: 'Exists', - } - ] + }, + ], }, }; @@ -709,22 +713,23 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => 'v1alpha', fastify.kube.namespace, 'acceleratorprofiles', - payload - ) + payload, + ); } catch (e) { // If bad detection — exit early and dont create config - throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString() + throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString(); } // update already running notebooks to use the new profile - const notebooks = await getNotebooks(fastify, fastify.kube.namespace) + const notebooks = await getNotebooks(fastify, fastify.kube.namespace); notebooks.items.forEach(async (notebook) => { - const gpuCount = notebook.spec.template.spec.containers[0].resources?.limits?.['nvidia.com/gpu'] + const gpuCount = + notebook.spec.template.spec.containers[0].resources?.limits?.['nvidia.com/gpu']; if (gpuCount) { notebook.metadata.annotations = { ...notebook.metadata.annotations, - 'opendatahub.io/recommended-accelerators' : 'migrated-gpu' - } + 'opendatahub.io/recommended-accelerators': 'migrated-gpu', + }; await fastify.kube.customObjectsApi.patchNamespacedCustomObject( 'kubeflow.org', 'v1', @@ -738,11 +743,10 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => { headers: { 'Content-type': PatchUtils.PATCH_FORMAT_JSON_MERGE_PATCH }, }, - ) + ); } - } - ) - }; + }); + } } // Create configmap to flag operation as successful @@ -754,19 +758,19 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => data: { migratedCompleted: 'true', }, - } + }; await fastify.kube.coreV1Api - .createNamespacedConfigMap(fastify.kube.namespace, configMap) - .then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles')) - .catch((e) => { - throw createCustomError( - 'Unable to create gpu migration configmap', - e.response?.body?.message || e.message, - ); - }); + .createNamespacedConfigMap(fastify.kube.namespace, configMap) + .then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles')) + .catch((e) => { + throw createCustomError( + 'Unable to create gpu migration configmap', + e.response?.body?.message || e.message, + ); + }); } -} +}; /** * @deprecated - Look to remove asap (see comments below) * Converts namespaces that have a display-name annotation suffixed with `[DSP]` over to using a label. From 095c09c1621711911d2d99f0fd357dacf1d7a0cc Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Mon, 14 Aug 2023 15:23:35 -0500 Subject: [PATCH 15/22] Added support for existing settings update deployed notebooks and sr on migrate fixed error logging remove container migration Added support for "keep what i have" soft migrate nvidia gpus to profiles fix handle exisiting settings refactored hooks remove useRef simplify functions small changes to hook merge hooks together update cluster role small changes bug fixes small type fix fixed type issues --- .../api/accelerators/acceleratorUtils.ts | 4 +- backend/src/types.ts | 4 +- backend/src/utils/resourceUtils.ts | 33 +--- frontend/src/api/k8s/accelerators.ts | 5 +- frontend/src/api/k8s/notebooks.ts | 5 + frontend/src/api/k8s/servingRuntimes.ts | 72 +++++--- frontend/src/api/k8s/utils.ts | 62 ++++--- .../src/components/SimpleDropdownSelect.tsx | 16 +- frontend/src/k8sTypes.ts | 16 +- .../projects/ServingRuntimeDetails.tsx | 6 +- .../ManageServingRuntimeModal.tsx | 65 ++++--- .../ServingRuntimeSizeSection.tsx | 13 +- .../ServingRuntimeTemplateSection.tsx | 5 +- .../screens/projects/useServingAccelerator.ts | 15 ++ .../projects/useServingRuntimeAccelerator.ts | 42 ----- .../modelServing/screens/projects/utils.ts | 6 - .../src/pages/modelServing/screens/types.ts | 2 - .../screens/server/AcceleratorSelectField.tsx | 164 ++++++++++-------- .../screens/server/SpawnerPage.tsx | 6 +- .../screens/server/useAccelerators.ts | 7 +- .../notebooks/useNotebookAccelerator.ts | 51 ++---- .../projects/screens/spawner/SpawnerPage.tsx | 12 +- frontend/src/pages/projects/types.ts | 6 +- frontend/src/types.ts | 2 +- frontend/src/utilities/tolerations.ts | 34 +++- frontend/src/utilities/useAcceleratorState.ts | 131 ++++++++++++++ manifests/base/cluster-role.yaml | 7 + 27 files changed, 470 insertions(+), 321 deletions(-) create mode 100644 frontend/src/pages/modelServing/screens/projects/useServingAccelerator.ts delete mode 100644 frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts create mode 100644 frontend/src/utilities/useAcceleratorState.ts diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts index 2988450f93..9caaab478c 100644 --- a/backend/src/routes/api/accelerators/acceleratorUtils.ts +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -62,6 +62,8 @@ export const getAcceleratorNumbers = async ( ), ) .catch((e) => { - fastify.log.error(`Exception when listing cluster nodes: ${e}`); + fastify.log.error( + `Exception when listing cluster nodes: ${e.response?.body?.message || e.message || e}`, + ); return { configured: false, available: {}, total: {}, allocated: {} }; }); diff --git a/backend/src/types.ts b/backend/src/types.ts index 087473fd16..62822591f9 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -892,11 +892,11 @@ export type ContainerResources = { requests?: { cpu?: string; memory?: string; - }; + } & Record; limits?: { cpu?: string; memory?: string; - }; + } & Record; }; export type ServingRuntime = K8sResourceCommon & { diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index 308597fdf7..3da0583a9d 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -35,7 +35,6 @@ import { } from './componentUtils'; import { createCustomError } from './requestUtils'; import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils'; -import { getNotebooks } from './notebookUtils'; const dashboardConfigMapName = 'odh-dashboard-config'; const consoleLinksGroup = 'console.openshift.io'; @@ -666,7 +665,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => ) .catch((e) => { // If 404 shows up — CRD may not be installed, exit early - throw 'Unable to fetch accelerator profiles: ' + e.toString(); + throw { message: 'Unable to fetch accelerator profiles: ' + e.toString() }; }); const acceleratorProfiles = ( @@ -682,7 +681,6 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => acceleratorProfiles.length === 0 ) { // if gpu detected on cluster, create our default migrated-gpu - // TODO GPU detection const acceleratorDetected = await getAcceleratorNumbers(fastify); if (acceleratorDetected.configured) { @@ -717,35 +715,8 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => ); } catch (e) { // If bad detection — exit early and dont create config - throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString(); + throw { message: 'Unable to add migrated-gpu accelerator profile: ' + e.toString() }; } - - // update already running notebooks to use the new profile - const notebooks = await getNotebooks(fastify, fastify.kube.namespace); - notebooks.items.forEach(async (notebook) => { - const gpuCount = - notebook.spec.template.spec.containers[0].resources?.limits?.['nvidia.com/gpu']; - if (gpuCount) { - notebook.metadata.annotations = { - ...notebook.metadata.annotations, - 'opendatahub.io/recommended-accelerators': 'migrated-gpu', - }; - await fastify.kube.customObjectsApi.patchNamespacedCustomObject( - 'kubeflow.org', - 'v1', - fastify.kube.namespace, - 'notebooks', - notebook.metadata.name, - notebook, - undefined, - undefined, - undefined, - { - headers: { 'Content-type': PatchUtils.PATCH_FORMAT_JSON_MERGE_PATCH }, - }, - ); - } - }); } } diff --git a/frontend/src/api/k8s/accelerators.ts b/frontend/src/api/k8s/accelerators.ts index fdd978c8f7..de5d47d1e6 100644 --- a/frontend/src/api/k8s/accelerators.ts +++ b/frontend/src/api/k8s/accelerators.ts @@ -2,7 +2,10 @@ import { k8sListResource } from '@openshift/dynamic-plugin-sdk-utils'; import { AcceleratorKind } from '~/k8sTypes'; import { AcceleratorModel } from '~/api/models'; -export const listAccelerators = async (): Promise => +export const listAccelerators = async (namespace: string): Promise => k8sListResource({ model: AcceleratorModel, + queryOptions: { + ns: namespace, + }, }).then((listResource) => listResource.items); diff --git a/frontend/src/api/k8s/notebooks.ts b/frontend/src/api/k8s/notebooks.ts index 658fb9841d..9754c198d7 100644 --- a/frontend/src/api/k8s/notebooks.ts +++ b/frontend/src/api/k8s/notebooks.ts @@ -55,6 +55,8 @@ const assembleNotebook = ( volumes: formVolumes, volumeMounts: formVolumeMounts, tolerationSettings, + existingTolerations, + existingResources, } = data; const notebookId = overrideNotebookId || translateDisplayNameForK8s(notebookName); const imageUrl = `${image.imageStream?.status?.dockerImageRepository}:${image.imageVersion?.name}`; @@ -64,6 +66,9 @@ const assembleNotebook = ( notebookSize.resources, accelerator, tolerationSettings, + existingTolerations, + undefined, + existingResources, ); const translatedUsername = usernameTranslate(username); diff --git a/frontend/src/api/k8s/servingRuntimes.ts b/frontend/src/api/k8s/servingRuntimes.ts index 22c4dd2fbd..58bf092fcd 100644 --- a/frontend/src/api/k8s/servingRuntimes.ts +++ b/frontend/src/api/k8s/servingRuntimes.ts @@ -7,12 +7,13 @@ import { k8sUpdateResource, } from '@openshift/dynamic-plugin-sdk-utils'; import { ServingRuntimeModel } from '~/api/models'; -import { K8sAPIOptions, ServingRuntimeKind } from '~/k8sTypes'; +import { K8sAPIOptions, ServingContainer, ServingRuntimeKind } from '~/k8sTypes'; import { CreatingServingRuntimeObject } from '~/pages/modelServing/screens/types'; import { ContainerResources } from '~/types'; import { getModelServingRuntimeName } from '~/pages/modelServing/utils'; import { getDisplayNameFromK8sResource, translateDisplayNameForK8s } from '~/pages/projects/utils'; import { applyK8sAPIOptions } from '~/api/apiMergeUtils'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; import { getModelServingProjects } from './projects'; import { assemblePodSpecOptions } from './utils'; @@ -22,8 +23,9 @@ const assembleServingRuntime = ( servingRuntime: ServingRuntimeKind, isCustomServingRuntimesEnabled: boolean, isEditing?: boolean, + acceleratorState?: AcceleratorState, ): ServingRuntimeKind => { - const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth, accelerator } = data; + const { name: displayName, numReplicas, modelSize, externalRoute, tokenAuth } = data; const createName = isCustomServingRuntimesEnabled ? translateDisplayNameForK8s(displayName) : getModelServingRuntimeName(namespace); @@ -50,7 +52,7 @@ const assembleServingRuntime = ( }), ...(isCustomServingRuntimesEnabled && { 'opendatahub.io/template-display-name': getDisplayNameFromK8sResource(servingRuntime), - 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', + 'opendatahub.io/accelerator-name': acceleratorState?.accelerator?.metadata.name || '', }), }, }; @@ -61,7 +63,7 @@ const assembleServingRuntime = ( ...updatedServingRuntime.metadata.annotations, 'enable-route': externalRoute ? 'true' : 'false', 'enable-auth': tokenAuth ? 'true' : 'false', - 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', + 'opendatahub.io/accelerator-name': acceleratorState?.accelerator?.metadata.name || '', ...(isCustomServingRuntimesEnabled && { 'openshift.io/display-name': displayName.trim() }), }, }; @@ -81,15 +83,22 @@ const assembleServingRuntime = ( const { affinity, tolerations, resources } = assemblePodSpecOptions( resourceSettings, - accelerator, + acceleratorState, + undefined, + servingRuntime.spec.tolerations, + undefined, + updatedServingRuntime.spec.containers[0]?.resources, ); - updatedServingRuntime.spec.containers = servingRuntime.spec.containers.map((container) => ({ - ...container, - resources, - affinity, - tolerations, - })); + updatedServingRuntime.spec.containers = servingRuntime.spec.containers.map( + (container): ServingContainer => ({ + ...container, + resources, + affinity, + }), + ); + + servingRuntime.spec.tolerations = tolerations; return updatedServingRuntime; }; @@ -133,18 +142,22 @@ export const getServingRuntime = (name: string, namespace: string): Promise => { +export const updateServingRuntime = (options: { + data: CreatingServingRuntimeObject; + existingData: ServingRuntimeKind; + isCustomServingRuntimesEnabled: boolean; + opts?: K8sAPIOptions; + acceleratorState?: AcceleratorState; +}): Promise => { + const { data, existingData, isCustomServingRuntimesEnabled, opts, acceleratorState } = options; + const updatedServingRuntime = assembleServingRuntime( data, existingData.metadata.namespace, existingData, isCustomServingRuntimesEnabled, true, + acceleratorState, ); return k8sUpdateResource( @@ -155,18 +168,29 @@ export const updateServingRuntime = ( ); }; -export const createServingRuntime = ( - data: CreatingServingRuntimeObject, - namespace: string, - servingRuntime: ServingRuntimeKind, - isCustomServingRuntimesEnabled: boolean, - opts?: K8sAPIOptions, -): Promise => { +export const createServingRuntime = (options: { + data: CreatingServingRuntimeObject; + namespace: string; + servingRuntime: ServingRuntimeKind; + isCustomServingRuntimesEnabled: boolean; + opts?: K8sAPIOptions; + acceleratorState?: AcceleratorState; +}): Promise => { + const { + data, + namespace, + servingRuntime, + isCustomServingRuntimesEnabled, + opts, + acceleratorState, + } = options; const assembledServingRuntime = assembleServingRuntime( data, namespace, servingRuntime, isCustomServingRuntimesEnabled, + false, + acceleratorState, ); return k8sCreateResource( diff --git a/frontend/src/api/k8s/utils.ts b/frontend/src/api/k8s/utils.ts index 0b9498749f..ecbdfaa738 100644 --- a/frontend/src/api/k8s/utils.ts +++ b/frontend/src/api/k8s/utils.ts @@ -1,51 +1,49 @@ -import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; -import { - PodAffinity, - ContainerResources, - PodToleration, - TolerationSettings, - ContainerResourceAttributes, -} from '~/types'; +import { PodAffinity, ContainerResources, PodToleration, TolerationSettings } from '~/types'; import { determineTolerations } from '~/utilities/tolerations'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; export const assemblePodSpecOptions = ( resourceSettings: ContainerResources, - accelerator: AcceleratorState, + accelerator?: AcceleratorState, tolerationSettings?: TolerationSettings, + existingTolerations?: PodToleration[], affinitySettings?: PodAffinity, + existingResources?: ContainerResources, ): { affinity: PodAffinity; tolerations: PodToleration[]; resources: ContainerResources; } => { const affinity: PodAffinity = structuredClone(affinitySettings || {}); - const resources = structuredClone(resourceSettings); - if (accelerator.count > 0 && accelerator.accelerator) { - if (!resources.limits) { - resources.limits = {}; + let resources: ContainerResources = { + limits: { ...existingResources?.limits, ...resourceSettings?.limits }, + requests: { ...existingResources?.requests, ...resourceSettings?.requests }, + }; + + if (accelerator?.additionalOptions?.useExisting && !accelerator.useExisting) { + resources = structuredClone(resourceSettings); + } + + // Clear the last accelerator from the resources + if (accelerator?.initialAccelerator) { + if (resources.limits) { + delete resources.limits[accelerator.initialAccelerator.spec.identifier]; } - if (!resources.requests) { - resources.requests = {}; + if (resources.requests) { + delete resources.requests[accelerator.initialAccelerator.spec.identifier]; } - resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; - resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; - } else { - // step type down to string to avoid type errors - const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes); - - Object.keys(resources.limits || {}).forEach((key) => { - if (!containerResourceKeys.includes(key)) { - delete resources.limits?.[key]; - } - }); + } - Object.keys(resources.requests || {}).forEach((key) => { - if (!containerResourceKeys.includes(key)) { - delete resources.requests?.[key]; - } - }); + // Add back the new accelerator to the resources if count > 0 + if (accelerator?.accelerator && accelerator.count > 0) { + if (resources.limits) { + resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; + } + if (resources.requests) { + resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; + } } - const tolerations = determineTolerations(tolerationSettings, accelerator.accelerator); + const tolerations = determineTolerations(tolerationSettings, accelerator, existingTolerations); return { affinity, tolerations, resources }; }; diff --git a/frontend/src/components/SimpleDropdownSelect.tsx b/frontend/src/components/SimpleDropdownSelect.tsx index a987e0d270..9e76df5f4a 100644 --- a/frontend/src/components/SimpleDropdownSelect.tsx +++ b/frontend/src/components/SimpleDropdownSelect.tsx @@ -2,14 +2,16 @@ import * as React from 'react'; import { Dropdown, DropdownItem, DropdownToggle } from '@patternfly/react-core'; import './SimpleDropdownSelect.scss'; +export type SimpleDropdownOption = { + key: string; + label: React.ReactNode; + description?: React.ReactNode; + selectedLabel?: React.ReactNode; + isPlaceholder?: boolean; +}; + type SimpleDropdownProps = { - options: { - key: string; - label: React.ReactNode; - description?: React.ReactNode; - selectedLabel?: React.ReactNode; - isPlaceholder?: boolean; - }[]; + options: SimpleDropdownOption[]; value: string; placeholder?: string; onChange: (key: string, isPlaceholder: boolean) => void; diff --git a/frontend/src/k8sTypes.ts b/frontend/src/k8sTypes.ts index b834c009b1..0eaf184276 100644 --- a/frontend/src/k8sTypes.ts +++ b/frontend/src/k8sTypes.ts @@ -315,6 +315,14 @@ export type ServiceAccountKind = K8sResourceCommon & { }[]; }; +export type ServingContainer = { + args: string[]; + image: string; + name: string; + affinity?: PodAffinity; + resources: ContainerResources; +}; + export type ServingRuntimeKind = K8sResourceCommon & { metadata: { annotations?: DisplayNameAnnotations & ServingRuntimeAnnotations; @@ -328,14 +336,10 @@ export type ServingRuntimeKind = K8sResourceCommon & { memBufferBytes?: number; modelLoadingTimeoutMillis?: number; }; - containers: { - args: string[]; - image: string; - name: string; - resources: ContainerResources; - }[]; + containers: ServingContainer[]; supportedModelFormats: SupportedModelFormats[]; replicas: number; + tolerations?: PodToleration[]; }; }; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx index d80cfd7ad7..7c83de4ddd 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx @@ -8,10 +8,10 @@ import { List, ListItem, } from '@patternfly/react-core'; -import { ServingRuntimeKind } from '~/k8sTypes'; import { AppContext } from '~/app/AppContext'; +import { ServingRuntimeKind } from '~/k8sTypes'; import { getServingRuntimeSizes } from './utils'; -import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; +import useServingAccelerator from './useServingAccelerator'; type ServingRuntimeDetailsProps = { obj: ServingRuntimeKind; @@ -22,7 +22,7 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => const container = obj.spec.containers[0]; // can we assume the first container? const sizes = getServingRuntimeSizes(dashboardConfig); const size = sizes.find((size) => _.isEqual(size.resources, container.resources)); - const [accelerator] = useServingRuntimeAccelerator(obj); + const [accelerator] = useServingAccelerator(obj); return ( diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx index 393bae629d..ec99407ebf 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ManageServingRuntimeModal.tsx @@ -36,6 +36,7 @@ import { import useCustomServingRuntimesEnabled from '~/pages/modelServing/customServingRuntimes/useCustomServingRuntimesEnabled'; import { getServingRuntimeFromName } from '~/pages/modelServing/customServingRuntimes/utils'; import { translateDisplayNameForK8s } from '~/pages/projects/utils'; +import useServingAccelerator from '~/pages/modelServing/screens/projects/useServingAccelerator'; import ServingRuntimeReplicaSection from './ServingRuntimeReplicaSection'; import ServingRuntimeSizeSection from './ServingRuntimeSizeSection'; import ServingRuntimeTokenSection from './ServingRuntimeTokenSection'; @@ -69,6 +70,9 @@ const ManageServingRuntimeModal: React.FC = ({ editInfo, }) => { const [createData, setCreateData, resetData, sizes] = useCreateServingRuntimeObject(editInfo); + const [acceleratorState, setAcceleratorState, resetAcceleratorData] = useServingAccelerator( + editInfo?.servingRuntime, + ); const [actionInProgress, setActionInProgress] = React.useState(false); const [error, setError] = React.useState(); @@ -106,6 +110,7 @@ const ManageServingRuntimeModal: React.FC = ({ setError(undefined); setActionInProgress(false); resetData(); + resetAcceleratorData(); }; const setErrorModal = (error: Error) => { @@ -127,35 +132,39 @@ const ManageServingRuntimeModal: React.FC = ({ } const servingRuntimeData = { ...createData, - accelerator: isGpuDisabled(servingRuntimeSelected) - ? { accelerator: undefined, count: 0 } - : createData.accelerator, + existingTolerations: servingRuntimeSelected.spec.tolerations || [], }; const servingRuntimeName = translateDisplayNameForK8s(servingRuntimeData.name); const createRolebinding = servingRuntimeData.tokenAuth && allowCreate; + const accelerator = isGpuDisabled(servingRuntimeSelected) + ? { count: 0, accelerators: [], useExisting: false } + : acceleratorState; + Promise.all([ ...(editInfo?.servingRuntime ? [ - updateServingRuntime( - servingRuntimeData, - editInfo?.servingRuntime, - customServingRuntimesEnabled, - { + updateServingRuntime({ + data: servingRuntimeData, + existingData: editInfo?.servingRuntime, + isCustomServingRuntimesEnabled: customServingRuntimesEnabled, + opts: { dryRun: true, }, - ), + acceleratorState: accelerator, + }), ] : [ - createServingRuntime( - servingRuntimeData, + createServingRuntime({ + data: servingRuntimeData, namespace, - servingRuntimeSelected, - customServingRuntimesEnabled, - { + servingRuntime: servingRuntimeSelected, + isCustomServingRuntimesEnabled: customServingRuntimesEnabled, + opts: { dryRun: true, }, - ), + acceleratorState: accelerator, + }), ]), setUpTokenAuth( servingRuntimeData, @@ -175,19 +184,22 @@ const ManageServingRuntimeModal: React.FC = ({ : []), ...(editInfo?.servingRuntime ? [ - updateServingRuntime( - servingRuntimeData, - editInfo?.servingRuntime, - customServingRuntimesEnabled, - ), + updateServingRuntime({ + data: servingRuntimeData, + existingData: editInfo?.servingRuntime, + isCustomServingRuntimesEnabled: customServingRuntimesEnabled, + + acceleratorState: accelerator, + }), ] : [ - createServingRuntime( - servingRuntimeData, + createServingRuntime({ + data: servingRuntimeData, namespace, - servingRuntimeSelected, - customServingRuntimesEnabled, - ), + servingRuntime: servingRuntimeSelected, + isCustomServingRuntimesEnabled: customServingRuntimesEnabled, + acceleratorState: accelerator, + }), ]), setUpTokenAuth( servingRuntimeData, @@ -246,6 +258,7 @@ const ManageServingRuntimeModal: React.FC = ({ setData={setCreateData} templates={servingRuntimeTemplates || []} isEditing={!!editInfo} + acceleratorState={acceleratorState} /> @@ -256,6 +269,8 @@ const ManageServingRuntimeModal: React.FC = ({ setData={setCreateData} sizes={sizes} servingRuntimeSelected={servingRuntimeSelected} + acceleratorState={acceleratorState} + setAcceleratorState={setAcceleratorState} /> diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx index 26432ffca8..f05b359335 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx @@ -16,6 +16,7 @@ import { ServingRuntimeKind } from '~/k8sTypes'; import { isGpuDisabled } from '~/pages/modelServing/screens/projects/utils'; import AcceleratorSelectField from '~/pages/notebookController/screens/server/AcceleratorSelectField'; import { getCompatibleAcceleratorIdentifiers } from '~/pages/projects/screens/spawner/spawnerUtils'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; import ServingRuntimeSizeExpandedField from './ServingRuntimeSizeExpandedField'; type ServingRuntimeSizeSectionProps = { @@ -23,6 +24,8 @@ type ServingRuntimeSizeSectionProps = { setData: UpdateObjectAtPropAndValue; sizes: ServingRuntimeSize[]; servingRuntimeSelected?: ServingRuntimeKind; + acceleratorState: AcceleratorState; + setAcceleratorState: UpdateObjectAtPropAndValue; }; const ServingRuntimeSizeSection: React.FC = ({ @@ -30,6 +33,8 @@ const ServingRuntimeSizeSection: React.FC = ({ setData, sizes, servingRuntimeSelected, + acceleratorState, + setAcceleratorState, }) => { const [sizeDropdownOpen, setSizeDropdownOpen] = React.useState(false); const [supportedAccelerators, setSupportedAccelerators] = React.useState(); @@ -99,12 +104,8 @@ const ServingRuntimeSizeSection: React.FC = ({ {!gpuDisabled && ( - setData('accelerator', { ...data.accelerator, accelerator }) - } - setAcceleratorCount={(count) => setData('accelerator', { ...data.accelerator, count })} + acceleratorState={acceleratorState} + setAcceleratorState={setAcceleratorState} supportedAccelerators={supportedAccelerators} supportedText="Compatible with serving runtime" /> diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx index fddc781f66..2cee7c6af1 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeTemplateSection.tsx @@ -9,12 +9,14 @@ import { } from '~/pages/modelServing/customServingRuntimes/utils'; import { isCompatibleWithAccelerator } from '~/pages/projects/screens/spawner/spawnerUtils'; import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; type ServingRuntimeTemplateSectionProps = { data: CreatingServingRuntimeObject; setData: UpdateObjectAtPropAndValue; templates: TemplateKind[]; isEditing?: boolean; + acceleratorState: AcceleratorState; }; const ServingRuntimeTemplateSection: React.FC = ({ @@ -22,6 +24,7 @@ const ServingRuntimeTemplateSection: React.FC { const options = templates.map((template) => ({ key: getServingRuntimeNameFromTemplate(template), @@ -32,7 +35,7 @@ const ServingRuntimeTemplateSection: React.FC {isCompatibleWithAccelerator( - data.accelerator.accelerator?.spec.identifier, + acceleratorState.accelerator?.spec.identifier, template.objects[0], ) && } diff --git a/frontend/src/pages/modelServing/screens/projects/useServingAccelerator.ts b/frontend/src/pages/modelServing/screens/projects/useServingAccelerator.ts new file mode 100644 index 0000000000..fa20a86e55 --- /dev/null +++ b/frontend/src/pages/modelServing/screens/projects/useServingAccelerator.ts @@ -0,0 +1,15 @@ +import { ServingRuntimeKind } from '~/k8sTypes'; +import useAcceleratorState, { AcceleratorState } from '~/utilities/useAcceleratorState'; +import { GenericObjectState } from '~/utilities/useGenericObjectState'; + +const useServingAccelerator = ( + servingRuntime?: ServingRuntimeKind | null, +): GenericObjectState => { + const acceleratorName = servingRuntime?.metadata.annotations?.['opendatahub.io/accelerator-name']; + const resources = servingRuntime?.spec.containers[0].resources; + const tolerations = servingRuntime?.spec.tolerations; + + return useAcceleratorState(resources, tolerations, acceleratorName); +}; + +export default useServingAccelerator; diff --git a/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts b/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts deleted file mode 100644 index fae8b158b9..0000000000 --- a/frontend/src/pages/modelServing/screens/projects/useServingRuntimeAccelerator.ts +++ /dev/null @@ -1,42 +0,0 @@ -import React, { useRef } from 'react'; -import { ServingRuntimeKind } from '~/k8sTypes'; -import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; -import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; -import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; - -const useServingRuntimeAccelerator = ( - servingRuntime?: ServingRuntimeKind, -): GenericObjectState => { - const [acceleratorState, setData, resetData] = useGenericObjectState({ - accelerator: undefined, - count: 0, - }); - - const hasSet = useRef(false); - - const [accelerators, loaded, loadError] = useAccelerators(); - - React.useEffect(() => { - if ( - loaded && - !loadError && - servingRuntime && - servingRuntime?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && - !hasSet.current - ) { - const name = servingRuntime.metadata.annotations['opendatahub.io/accelerator-name']; - const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); - const container = servingRuntime?.spec.containers[0]; - - if (accelerator && container) { - hasSet.current = true; - setData('accelerator', accelerator); - setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier]) ?? 0); - } - } - }, [accelerators, loaded, loadError, servingRuntime, setData]); - - return [acceleratorState, setData, resetData]; -}; - -export default useServingRuntimeAccelerator; diff --git a/frontend/src/pages/modelServing/screens/projects/utils.ts b/frontend/src/pages/modelServing/screens/projects/utils.ts index 54ef4e6d20..a800389c3e 100644 --- a/frontend/src/pages/modelServing/screens/projects/utils.ts +++ b/frontend/src/pages/modelServing/screens/projects/utils.ts @@ -16,7 +16,6 @@ import { EMPTY_AWS_SECRET_DATA } from '~/pages/projects/dataConnections/const'; import { getDisplayNameFromK8sResource } from '~/pages/projects/utils'; import { getDisplayNameFromServingRuntimeTemplate } from '~/pages/modelServing/customServingRuntimes/utils'; import { isCpuLimitEqual, isMemoryLimitEqual } from '~/utilities/valueUnits'; -import useServingRuntimeAccelerator from './useServingRuntimeAccelerator'; export const getServingRuntimeSizes = (config: DashboardConfig): ServingRuntimeSize[] => { let sizes = config.spec.modelServerSizes || []; @@ -55,8 +54,6 @@ export const useCreateServingRuntimeObject = (existingData?: { ] => { const { dashboardConfig } = useAppContext(); - const [existingAccelerator] = useServingRuntimeAccelerator(existingData?.servingRuntime); - const sizes = useDeepCompareMemoize(getServingRuntimeSizes(dashboardConfig)); const createModelState = useGenericObjectState({ @@ -64,7 +61,6 @@ export const useCreateServingRuntimeObject = (existingData?: { servingRuntimeTemplateName: '', numReplicas: 1, modelSize: sizes[0], - accelerator: existingAccelerator, externalRoute: false, tokenAuth: false, tokens: [], @@ -116,7 +112,6 @@ export const useCreateServingRuntimeObject = (existingData?: { resources: existingResources, }, ); - setCreateData('accelerator', existingAccelerator); setCreateData('externalRoute', existingExternalRoute); setCreateData('tokenAuth', existingTokenAuth); setCreateData('tokens', existingTokens); @@ -126,7 +121,6 @@ export const useCreateServingRuntimeObject = (existingData?: { existingServingRuntimeTemplateName, existingNumReplicas, existingResources, - existingAccelerator, existingExternalRoute, existingTokenAuth, existingTokens, diff --git a/frontend/src/pages/modelServing/screens/types.ts b/frontend/src/pages/modelServing/screens/types.ts index 557f7bda26..05b720e68d 100644 --- a/frontend/src/pages/modelServing/screens/types.ts +++ b/frontend/src/pages/modelServing/screens/types.ts @@ -1,4 +1,3 @@ -import { AcceleratorState } from '~/pages/projects/screens/detail/notebooks/useNotebookAccelerator'; import { EnvVariableDataEntry } from '~/pages/projects/types'; import { ContainerResources } from '~/types'; @@ -35,7 +34,6 @@ export type CreatingServingRuntimeObject = { servingRuntimeTemplateName: string; numReplicas: number; modelSize: ServingRuntimeSize; - accelerator: AcceleratorState; externalRoute: boolean; tokenAuth: boolean; tokens: ServingRuntimeToken[]; diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx index 003dc0412c..13580197f8 100644 --- a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -13,85 +13,73 @@ import { } from '@patternfly/react-core'; import { isHTMLInputElement } from '~/utilities/utils'; import { AcceleratorKind } from '~/k8sTypes'; -import SimpleDropdownSelect from '~/components/SimpleDropdownSelect'; -import useAccelerators from './useAccelerators'; +import SimpleDropdownSelect, { SimpleDropdownOption } from '~/components/SimpleDropdownSelect'; +import { UpdateObjectAtPropAndValue } from '~/pages/projects/types'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; import useAcceleratorCounts from './useAcceleratorCounts'; type AcceleratorSelectFieldProps = { - accelerator?: AcceleratorKind; - setAccelerator: (accelerator?: AcceleratorKind) => void; - acceleratorCount?: number; - setAcceleratorCount: (size: number) => void; + acceleratorState: AcceleratorState; + setAcceleratorState: UpdateObjectAtPropAndValue; supportedAccelerators?: string[]; supportedText?: string; }; const AcceleratorSelectField: React.FC = ({ - accelerator, - setAccelerator, - acceleratorCount = 0, - setAcceleratorCount, + acceleratorState, + setAcceleratorState, supportedAccelerators, supportedText, }) => { - const [accelerators, loaded, loadError] = useAccelerators(); const [detectedAcceleratorInfo] = useAcceleratorCounts(); - const validateAcceleratorCount = React.useCallback( - (newSize: number) => { - if (!accelerator) { - return ''; - } + const { + accelerator, + count: acceleratorCount, + accelerators, + useExisting, + additionalOptions, + } = acceleratorState; - const detectedAcceleratorCount = Object.entries(detectedAcceleratorInfo.available).find( - ([identifier]) => accelerator?.spec.identifier === identifier, - )?.[1]; + const generateAcceleratorCountWarning = (newSize: number) => { + if (!accelerator) { + return ''; + } - if (detectedAcceleratorCount === undefined) { - return `No accelerator detected with the identifier ${accelerator?.spec.identifier}.`; - } else if (newSize > detectedAcceleratorCount) { - return `Only ${detectedAcceleratorCount} accelerator${ - detectedAcceleratorCount > 1 ? 's' : '' - } detected.`; - } + const identifier = accelerator?.spec.identifier; - return ''; - }, - [accelerator, detectedAcceleratorInfo.available], - ); + const detectedAcceleratorCount = Object.entries(detectedAcceleratorInfo.available).find( + ([id]) => identifier === id, + )?.[1]; - React.useEffect(() => { - setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); - }, [acceleratorCount, validateAcceleratorCount]); + if (detectedAcceleratorCount === undefined) { + return `No accelerator detected with the identifier ${identifier}.`; + } else if (newSize > detectedAcceleratorCount) { + return `Only ${detectedAcceleratorCount} accelerator${ + detectedAcceleratorCount > 1 ? 's' : '' + } detected.`; + } - const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState( - validateAcceleratorCount(acceleratorCount), - ); + return ''; + }; + + const acceleratorCountWarning = generateAcceleratorCountWarning(acceleratorCount); const isAcceleratorSupported = (accelerator: AcceleratorKind) => supportedAccelerators?.includes(accelerator.spec.identifier); const enabledAccelerators = accelerators.filter((ac) => ac.spec.enabled); - const options = enabledAccelerators - .sort((a, b) => { - const aSupported = isAcceleratorSupported(a); - const bSupported = isAcceleratorSupported(b); - if (aSupported && !bSupported) { - return -1; - } - if (!aSupported && bSupported) { - return 1; - } - return 0; - }) - .map((ac) => ({ + const formatOption = (ac: AcceleratorKind): SimpleDropdownOption => { + const displayName = `${ac.spec.displayName}${!ac.spec.enabled ? ' (disabled)' : ''}`; + + return { key: ac.metadata.name, - selectedLabel: ac.spec.displayName, + selectedLabel: displayName, description: ac.spec.description, label: ( - {ac.spec.displayName} + {displayName} {isAcceleratorSupported(ac) && ( @@ -100,7 +88,22 @@ const AcceleratorSelectField: React.FC = ({ ), - })); + }; + }; + + const options: SimpleDropdownOption[] = enabledAccelerators + .sort((a, b) => { + const aSupported = isAcceleratorSupported(a); + const bSupported = isAcceleratorSupported(b); + if (aSupported && !bSupported) { + return -1; + } + if (!aSupported && bSupported) { + return 1; + } + return 0; + }) + .map((ac) => formatOption(ac)); let acceleratorAlertMessage: { title: string; variant: AlertVariant } | null = null; if (accelerator && supportedAccelerators !== undefined) { @@ -118,12 +121,30 @@ const AcceleratorSelectField: React.FC = ({ } } + // add none option + options.push({ + key: '', + label: 'None', + isPlaceholder: true, + }); + + if (additionalOptions?.useExisting) { + options.push({ + key: 'use-existing', + label: 'Existing settings', + description: 'Use the existing accelerator settings from the notebook server', + }); + } else if (additionalOptions?.useDisabled) { + options.push(formatOption(additionalOptions?.useDisabled)); + } + const onStep = (step: number) => { - setAcceleratorCount(Math.max(acceleratorCount + step, 0)); + setAcceleratorState('count', Math.max(acceleratorCount + step, 0)); }; - if (!loaded || loadError || enabledAccelerators.length === 0) { - return <>; + // if there is more than a none option, show the dropdown + if (options.length === 1) { + return null; } return ( @@ -132,24 +153,29 @@ const AcceleratorSelectField: React.FC = ({ { if (isPlaceholder) { - setAccelerator(undefined); - setAcceleratorCount(0); + // none + setAcceleratorState('useExisting', false); + setAcceleratorState('accelerator', undefined); + setAcceleratorState('count', 0); + } else if (key === 'use-existing') { + // use existing settings + setAcceleratorState('useExisting', true); + setAcceleratorState('accelerator', undefined); + setAcceleratorState('count', 0); } else { - setAccelerator(accelerators.find((ac) => ac.metadata.name === key)); + // normal flow + setAcceleratorState('useExisting', false); + setAcceleratorState( + 'accelerator', + accelerators.find((ac) => ac.metadata.name === key), + ); } }} - > + /> {acceleratorAlertMessage && ( @@ -178,7 +204,7 @@ const AcceleratorSelectField: React.FC = ({ onChange={(event) => { if (isHTMLInputElement(event.target)) { const newSize = Number(event.target.value); - setAcceleratorCount(newSize); + setAcceleratorState('count', newSize); } }} /> diff --git a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx index 985773e1e3..48be7f4342 100644 --- a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx +++ b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx @@ -310,10 +310,8 @@ const SpawnerPage: React.FC = () => { sizes={sizes} /> setAccelerator('accelerator', accelerator)} - acceleratorCount={accelerator.count} - setAcceleratorCount={(acceleratorCount) => setAccelerator('count', acceleratorCount)} + acceleratorState={accelerator} + setAcceleratorState={setAccelerator} /> diff --git a/frontend/src/pages/notebookController/screens/server/useAccelerators.ts b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts index 059349a650..d4f9545077 100644 --- a/frontend/src/pages/notebookController/screens/server/useAccelerators.ts +++ b/frontend/src/pages/notebookController/screens/server/useAccelerators.ts @@ -1,8 +1,11 @@ +import React from 'react'; import useFetchState, { FetchState } from '~/utilities/useFetchState'; import { AcceleratorKind } from '~/k8sTypes'; import { listAccelerators } from '~/api'; -const useAccelerators = (): FetchState => - useFetchState(listAccelerators, []); +const useAccelerators = (namespace: string): FetchState => { + const getAccelerators = React.useCallback(() => listAccelerators(namespace), [namespace]); + return useFetchState(getAccelerators, []); +}; export default useAccelerators; diff --git a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts index 14abb51958..cd01955ad2 100644 --- a/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts +++ b/frontend/src/pages/projects/screens/detail/notebooks/useNotebookAccelerator.ts @@ -1,49 +1,18 @@ -import React, { useRef } from 'react'; -import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; -import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; -import { Notebook, NotebookContainer } from '~/types'; -import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; - -export type AcceleratorState = { - accelerator?: AcceleratorKind; - count: number; -}; +import { NotebookKind } from '~/k8sTypes'; +import { Notebook } from '~/types'; +import useAcceleratorState, { AcceleratorState } from '~/utilities/useAcceleratorState'; +import { GenericObjectState } from '~/utilities/useGenericObjectState'; const useNotebookAccelerator = ( notebook?: NotebookKind | Notebook | null, ): GenericObjectState => { - const [acceleratorState, setData, resetData] = useGenericObjectState({ - accelerator: undefined, - count: 0, - }); - - const hasSet = useRef(false); - - const [accelerators, loaded, loadError] = useAccelerators(); - - React.useEffect(() => { - if ( - loaded && - !loadError && - notebook && - notebook?.metadata?.annotations?.['opendatahub.io/accelerator-name'] && - !hasSet.current - ) { - notebook.spec.template; - const name = notebook.metadata.annotations['opendatahub.io/accelerator-name']; - const accelerator = accelerators.find((accelerator) => accelerator.metadata.name === name); - const container: NotebookContainer | undefined = notebook?.spec.template.spec.containers.find( - (container) => container.name === notebook.metadata.name, - ); - if (accelerator && container) { - hasSet.current = true; - setData('accelerator', accelerator); - setData('count', Number(container.resources?.limits?.[accelerator.spec.identifier] ?? 0)); - } - } - }, [accelerators, loaded, loadError, notebook, setData]); + const acceleratorName = notebook?.metadata.annotations?.['opendatahub.io/accelerator-name']; + const resources = notebook?.spec.template.spec.containers.find( + (container) => container.name === notebook.metadata.name, + )?.resources; + const tolerations = notebook?.spec.template.spec.tolerations; - return [acceleratorState, setData, resetData]; + return useAcceleratorState(resources, tolerations, acceleratorName); }; export default useNotebookAccelerator; diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx index 860104d8fa..83730ff768 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerPage.tsx @@ -181,14 +181,8 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { value={selectedSize} /> - setNotebookAcceleratorState('accelerator', accelerator) - } - acceleratorCount={notebookAcceleratorState.count} - setAcceleratorCount={(acceleratorCount) => - setNotebookAcceleratorState('count', acceleratorCount) - } + acceleratorState={notebookAcceleratorState} + setAcceleratorState={setNotebookAcceleratorState} supportedAccelerators={supportedAccelerators} /> @@ -250,6 +244,8 @@ const SpawnerPage: React.FC = ({ existingNotebook }) => { accelerator: notebookAcceleratorState, volumes: [], volumeMounts: [], + existingTolerations: existingNotebook?.spec.template.spec.tolerations || [], + existingResources: existingNotebook?.spec.template.spec.containers[0].resources, }} storageData={storageData} envVariables={envVariables} diff --git a/frontend/src/pages/projects/types.ts b/frontend/src/pages/projects/types.ts index abcf01cd16..870eee7658 100644 --- a/frontend/src/pages/projects/types.ts +++ b/frontend/src/pages/projects/types.ts @@ -1,14 +1,16 @@ import { + ContainerResources, ImageStreamAndVersion, NotebookSize, + PodToleration, TolerationSettings, Volume, VolumeMount, } from '~/types'; import { ValueOf } from '~/typeHelpers'; import { AWSSecretKind } from '~/k8sTypes'; +import { AcceleratorState } from '~/utilities/useAcceleratorState'; import { AWS_KEYS } from './dataConnections/const'; -import { AcceleratorState } from './screens/detail/notebooks/useNotebookAccelerator'; export type UpdateObjectAtPropAndValue = (propKey: keyof T, propValue: ValueOf) => void; @@ -66,6 +68,8 @@ export type StartNotebookData = { volumes?: Volume[]; volumeMounts?: VolumeMount[]; tolerationSettings?: TolerationSettings; + existingTolerations?: PodToleration[]; + existingResources?: ContainerResources; envFrom?: EnvironmentFromVariable[]; description?: string; /** An override for the assembleNotebook so it doesn't regen an id */ diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 5cceccba3b..4bb4fa18af 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -6,7 +6,7 @@ import { ServingRuntimeSize } from '~/pages/modelServing/screens/types'; import { EnvironmentFromVariable } from '~/pages/projects/types'; import { ImageStreamKind, ImageStreamSpecTagType } from './k8sTypes'; import { EitherNotBoth } from './typeHelpers'; -import { AcceleratorState } from './pages/projects/screens/detail/notebooks/useNotebookAccelerator'; +import { AcceleratorState } from './utilities/useAcceleratorState'; export type PrometheusQueryResponse = { data: { diff --git a/frontend/src/utilities/tolerations.ts b/frontend/src/utilities/tolerations.ts index f381e9fd5a..5878656918 100644 --- a/frontend/src/utilities/tolerations.ts +++ b/frontend/src/utilities/tolerations.ts @@ -1,6 +1,8 @@ import { Patch } from '@openshift/dynamic-plugin-sdk-utils'; +import _ from 'lodash'; import { DashboardConfig, PodToleration, TolerationSettings } from '~/types'; -import { AcceleratorKind, NotebookKind } from '~/k8sTypes'; +import { NotebookKind } from '~/k8sTypes'; +import { AcceleratorState } from './useAcceleratorState'; export type TolerationChanges = { type: 'add' | 'remove' | 'replace' | 'nothing'; @@ -9,14 +11,34 @@ export type TolerationChanges = { export const determineTolerations = ( tolerationSettings?: TolerationSettings, - accelerator?: AcceleratorKind, + acceleratorState?: AcceleratorState, + existingTolerations?: PodToleration[], ): PodToleration[] => { - const tolerations: PodToleration[] = []; + let tolerations = existingTolerations || []; - if (accelerator?.spec.tolerations) { - tolerations.push(...accelerator.spec.tolerations); + // remove old accelerator tolerations if they exist + if (acceleratorState?.initialAccelerator) { + tolerations = tolerations.filter( + (t) => !acceleratorState.initialAccelerator?.spec.tolerations?.some((t2) => _.isEqual(t2, t)), + ); } - if (tolerationSettings?.enabled) { + + // add new accelerator tolerations if they exist + if (acceleratorState?.accelerator?.spec.tolerations) { + tolerations.push(...acceleratorState.accelerator.spec.tolerations); + } + + // remove duplicated tolerations + tolerations = _.uniqWith(tolerations, _.isEqual); + + // add toleration from settings if they exist + if ( + tolerationSettings?.enabled && + !tolerations.some( + (t) => + t.key === tolerationSettings.key && t.operator === 'Exists' && t.effect === 'NoSchedule', + ) + ) { tolerations.push({ effect: 'NoSchedule', key: tolerationSettings.key, diff --git a/frontend/src/utilities/useAcceleratorState.ts b/frontend/src/utilities/useAcceleratorState.ts new file mode 100644 index 0000000000..759a61691a --- /dev/null +++ b/frontend/src/utilities/useAcceleratorState.ts @@ -0,0 +1,131 @@ +import React from 'react'; +import { AcceleratorKind } from '~/k8sTypes'; +import useAccelerators from '~/pages/notebookController/screens/server/useAccelerators'; +import { useDashboardNamespace } from '~/redux/selectors'; +import { ContainerResourceAttributes, ContainerResources, PodToleration } from '~/types'; +import useGenericObjectState, { GenericObjectState } from '~/utilities/useGenericObjectState'; + +export type AcceleratorState = { + accelerator?: AcceleratorKind; + accelerators: AcceleratorKind[]; + initialAccelerator?: AcceleratorKind; + useExisting: boolean; + count: number; + additionalOptions?: { + useExisting?: boolean; + useDisabled?: AcceleratorKind; + }; +}; + +const useAcceleratorState = ( + resources?: ContainerResources, + tolerations?: PodToleration[], + existingAcceleratorName?: string, +): GenericObjectState => { + const [acceleratorState, setData, resetData] = useGenericObjectState({ + accelerator: undefined, + accelerators: [], + initialAccelerator: undefined, + count: 0, + useExisting: false, + }); + + const { dashboardNamespace } = useDashboardNamespace(); + const [accelerators, loaded, loadError] = useAccelerators(dashboardNamespace); + + React.useEffect(() => { + if (loaded && !loadError) { + setData('accelerators', accelerators); + + // Exit early if no resources = not in edit mode + if (!resources) { + return; + } + + const accelerator = accelerators.find( + (accelerator) => accelerator.metadata.name === existingAcceleratorName, + ); + + if (accelerator) { + setData('accelerator', accelerator); + setData('initialAccelerator', accelerator); + setData('count', Number(resources.requests?.[accelerator.spec.identifier] ?? 0)); + if (!accelerator.spec.enabled) { + setData('additionalOptions', { useDisabled: accelerator }); + } + } else { + // check if there is accelerator usage in the container + // this is to handle the case where the accelerator is disabled, deleted, or empty + const containerResourceAttributes = Object.values(ContainerResourceAttributes) as string[]; + const possibleAcceleratorsIdentifiers = Object.entries(resources.requests ?? {}).filter( + ([key]) => !containerResourceAttributes.includes(key), + ); + if (possibleAcceleratorsIdentifiers.length > 0) { + // check if they are just using the nvidia.com/gpu + // if so, lets migrate them over to using the migrated-gpu accelerator profile if it exists + const acceleratorRequest = possibleAcceleratorsIdentifiers.find( + (possibleAcceleratorIdentifiers) => + possibleAcceleratorIdentifiers[0] === 'nvidia.com/gpu', + ); + + if ( + acceleratorRequest && + tolerations?.some( + (toleration) => + toleration.key === 'nvidia.com/gpu' && + toleration.operator === 'Exists' && + toleration.effect === 'NoSchedule', + ) + ) { + const migratedAccelerator = accelerators.find( + (accelerator) => accelerator.metadata.name === 'migrated-gpu', + ); + + if (migratedAccelerator) { + setData('accelerator', migratedAccelerator); + setData('initialAccelerator', migratedAccelerator); + setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0)); + if (!migratedAccelerator.spec.enabled) { + setData('additionalOptions', { useDisabled: accelerator }); + } + } else { + // create a fake accelerator to use + const fakeAccelerator: AcceleratorKind = { + apiVersion: 'dashboard.opendatahub.io/v1alpha', + kind: 'AcceleratorProfile', + metadata: { + name: 'migrated-gpu', + }, + spec: { + identifier: 'nvidia.com/gpu', + displayName: 'Nvidia GPU', + enabled: true, + tolerations: [ + { + key: 'nvidia.com/gpu', + operator: 'Exists', + effect: 'NoSchedule', + }, + ], + }, + }; + + setData('accelerator', fakeAccelerator); + setData('accelerators', [fakeAccelerator, ...accelerators]); + setData('initialAccelerator', fakeAccelerator); + setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0)); + } + } else { + // fallback to using the existing accelerator + setData('useExisting', true); + setData('additionalOptions', { useExisting: true }); + } + } + } + } + }, [accelerators, loaded, loadError, resources, tolerations, existingAcceleratorName, setData]); + + return [acceleratorState, setData, resetData]; +}; + +export default useAcceleratorState; diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml index 51b5798502..d21d47cd04 100644 --- a/manifests/base/cluster-role.yaml +++ b/manifests/base/cluster-role.yaml @@ -3,6 +3,13 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: odh-dashboard rules: + - verbs: + - get + - list + apiGroups: + - '' + resources: + - nodes - verbs: - get - list From a4bb17268272bff63a1cffc17e857d42ec6c2f2d Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 5 Sep 2023 15:29:02 -0500 Subject: [PATCH 16/22] add rbac accelerator role --- manifests/base/fetch-accelerators.rbac.yaml | 26 +++++++++++++++++++++ manifests/base/kustomization.yaml | 1 + 2 files changed, 27 insertions(+) create mode 100644 manifests/base/fetch-accelerators.rbac.yaml diff --git a/manifests/base/fetch-accelerators.rbac.yaml b/manifests/base/fetch-accelerators.rbac.yaml new file mode 100644 index 0000000000..d11b7dc3ad --- /dev/null +++ b/manifests/base/fetch-accelerators.rbac.yaml @@ -0,0 +1,26 @@ +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: fetch-accelerators-role +rules: + - apiGroups: + - dashboard.opendatahub.io + verbs: + - get + - list + - watch + resources: + - acceleratorprofiles +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: accelerators +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: fetch-accelerators-role +subjects: + - apiGroup: rbac.authorization.k8s.io + kind: Group + name: system:authenticated \ No newline at end of file diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 4fe93e8917..3595eb216d 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -21,6 +21,7 @@ resources: - image-puller.clusterrolebinding.yaml - model-serving-role.yaml - model-serving-role-binding.yaml + - fetch-accelerators.rbac.yaml images: - name: odh-dashboard newName: quay.io/opendatahub/odh-dashboard From 2a300bc6518519a34211ab59edce89fd34543bd2 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 5 Sep 2023 15:37:21 -0500 Subject: [PATCH 17/22] revert add rbac accelerator role --- manifests/base/fetch-accelerators.rbac.yaml | 26 --------------------- manifests/base/kustomization.yaml | 1 - 2 files changed, 27 deletions(-) delete mode 100644 manifests/base/fetch-accelerators.rbac.yaml diff --git a/manifests/base/fetch-accelerators.rbac.yaml b/manifests/base/fetch-accelerators.rbac.yaml deleted file mode 100644 index d11b7dc3ad..0000000000 --- a/manifests/base/fetch-accelerators.rbac.yaml +++ /dev/null @@ -1,26 +0,0 @@ -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: fetch-accelerators-role -rules: - - apiGroups: - - dashboard.opendatahub.io - verbs: - - get - - list - - watch - resources: - - acceleratorprofiles ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: accelerators -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: fetch-accelerators-role -subjects: - - apiGroup: rbac.authorization.k8s.io - kind: Group - name: system:authenticated \ No newline at end of file diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 3595eb216d..4fe93e8917 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -21,7 +21,6 @@ resources: - image-puller.clusterrolebinding.yaml - model-serving-role.yaml - model-serving-role-binding.yaml - - fetch-accelerators.rbac.yaml images: - name: odh-dashboard newName: quay.io/opendatahub/odh-dashboard From 84ca02ed57afc15691997f9cebfc9275504b72d6 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 5 Sep 2023 15:43:55 -0500 Subject: [PATCH 18/22] add rbac accelerator role --- manifests/base/fetch-accelerators.rbac.yaml | 26 +++++++++++++++++++++ manifests/base/kustomization.yaml | 1 + 2 files changed, 27 insertions(+) create mode 100644 manifests/base/fetch-accelerators.rbac.yaml diff --git a/manifests/base/fetch-accelerators.rbac.yaml b/manifests/base/fetch-accelerators.rbac.yaml new file mode 100644 index 0000000000..d11b7dc3ad --- /dev/null +++ b/manifests/base/fetch-accelerators.rbac.yaml @@ -0,0 +1,26 @@ +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: fetch-accelerators-role +rules: + - apiGroups: + - dashboard.opendatahub.io + verbs: + - get + - list + - watch + resources: + - acceleratorprofiles +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: accelerators +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: fetch-accelerators-role +subjects: + - apiGroup: rbac.authorization.k8s.io + kind: Group + name: system:authenticated \ No newline at end of file diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 4fe93e8917..3595eb216d 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -21,6 +21,7 @@ resources: - image-puller.clusterrolebinding.yaml - model-serving-role.yaml - model-serving-role-binding.yaml + - fetch-accelerators.rbac.yaml images: - name: odh-dashboard newName: quay.io/opendatahub/odh-dashboard From a92ab5b01c008bc5f136c00d1e0afd0c11d8f146 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Wed, 6 Sep 2023 09:58:50 -0500 Subject: [PATCH 19/22] accelerator minor bug fixes making image/servingruntime naming dynamic fix count going back to 0 prevent 0 count and ux style fix removed usage of unknown when not needed remove double array usage improved backend logging fix logging undefined error make ?? consistent fixed "||" and fixed unknown / none details --- backend/src/plugins/kube.ts | 2 +- .../api/accelerators/acceleratorUtils.ts | 12 +++++---- backend/src/utils/resourceUtils.ts | 25 +++++++++++++------ .../src/components/SimpleDropdownSelect.tsx | 2 +- .../projects/ServingRuntimeDetails.tsx | 16 ++++++++---- .../ServingRuntimeSizeSection.tsx | 2 +- .../screens/server/AcceleratorSelectField.tsx | 18 ++++++------- .../screens/server/NotebookServerDetails.tsx | 16 ++++++++---- .../screens/server/SpawnerPage.tsx | 8 ++++-- .../notebook/NotebookStatusToggle.tsx | 8 +++--- .../screens/spawner/SpawnerFooter.tsx | 8 ++++-- frontend/src/utilities/useAcceleratorState.ts | 19 +++++++------- 12 files changed, 85 insertions(+), 51 deletions(-) diff --git a/backend/src/plugins/kube.ts b/backend/src/plugins/kube.ts index 1a16523676..9cdba8d717 100644 --- a/backend/src/plugins/kube.ts +++ b/backend/src/plugins/kube.ts @@ -85,7 +85,7 @@ export default fp(async (fastify: FastifyInstance) => { cleanupGPU(fastify).catch((e) => fastify.log.error( `Unable to fully convert GPU to use accelerator profiles. ${ - e.response?.body?.message || e.message + e.response?.body?.message || e.message || e }`, ), ); diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts index 9caaab478c..b99850a0ab 100644 --- a/backend/src/routes/api/accelerators/acceleratorUtils.ts +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -33,23 +33,23 @@ export const getAcceleratorNumbers = async ( // update the max count for each accelerator Object.entries(allocatable).forEach( - ([key, value]) => (info.available[key] = Math.max(info.available[key] || 0, value)), + ([key, value]) => (info.available[key] = Math.max(info.available[key] ?? 0, value)), ); // update the total count for each accelerator Object.entries(capacity).forEach( - ([key, value]) => (info.total[key] = (info.total[key] || 0) + value), + ([key, value]) => (info.total[key] = (info.total[key] ?? 0) + value), ); // update the allocated count for each accelerator Object.entries(capacity).forEach( ([key, value]) => - (info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0)), + (info.allocated[key] = (info.allocated[key] ?? 0) + value - (allocatable[key] ?? 0)), ); // if any accelerators are available, the cluster is configured const configured = - info.configured || Object.values(info.available).some((value) => value > 0); + info.configured ?? Object.values(info.available).some((value) => value > 0); return { total: info.total, @@ -63,7 +63,9 @@ export const getAcceleratorNumbers = async ( ) .catch((e) => { fastify.log.error( - `Exception when listing cluster nodes: ${e.response?.body?.message || e.message || e}`, + `A ${e.statusCode} error occurred when listing cluster nodes: ${ + e.response?.body?.message || e.statusMessage + }`, ); return { configured: false, available: {}, total: {}, allocated: {} }; }); diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index 3da0583a9d..567b7c9688 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -645,12 +645,17 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => .readNamespacedConfigMap(CONFIG_MAP_NAME, fastify.kube.namespace) .then(() => { // Found configmap, not continuing + fastify.log.info(`GPU migration already completed, skipping`); return false; }) .catch((e) => { if (e.statusCode === 404) { // No config saying we have already migrated gpus, continue return true; + } else { + throw `fetching gpu migration configmap had a ${e.statusCode} error: ${ + e.response?.body?.message || e?.response?.statusMessage + }`; } }); @@ -664,8 +669,11 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => 'acceleratorprofiles', ) .catch((e) => { - // If 404 shows up — CRD may not be installed, exit early - throw { message: 'Unable to fetch accelerator profiles: ' + e.toString() }; + console.log(e); + // If error shows up — CRD may not be installed, exit early + throw `A ${e.statusCode} error occurred when trying to fetch accelerator profiles: ${ + e.response?.body?.message || e?.response?.statusMessage + }`; }); const acceleratorProfiles = ( @@ -715,7 +723,11 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => ); } catch (e) { // If bad detection — exit early and dont create config - throw { message: 'Unable to add migrated-gpu accelerator profile: ' + e.toString() }; + throw `A ${ + e.statusCode + } error occurred when trying to add migrated-gpu accelerator profile: ${ + e.response?.body?.message || e?.response?.statusMessage + }`; } } } @@ -735,10 +747,9 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => .createNamespacedConfigMap(fastify.kube.namespace, configMap) .then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles')) .catch((e) => { - throw createCustomError( - 'Unable to create gpu migration configmap', - e.response?.body?.message || e.message, - ); + throw `A ${e.statusCode} error occurred when trying to create gpu migration configmap: ${ + e.response?.body?.message || e?.response?.statusMessage + }`; }); } }; diff --git a/frontend/src/components/SimpleDropdownSelect.tsx b/frontend/src/components/SimpleDropdownSelect.tsx index 9e76df5f4a..fad00f220d 100644 --- a/frontend/src/components/SimpleDropdownSelect.tsx +++ b/frontend/src/components/SimpleDropdownSelect.tsx @@ -58,7 +58,7 @@ const SimpleDropdownSelect: React.FC = ({ setOpen(false); }} > - {isPlaceholder ? {label} : label} + {label} ))} /> diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx index 7c83de4ddd..53d3099e33 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeDetails.tsx @@ -47,13 +47,19 @@ const ServingRuntimeDetails: React.FC = ({ obj }) => Accelerator - {accelerator.accelerator?.spec.displayName || 'unknown'} + {accelerator.accelerator + ? accelerator.accelerator.spec.displayName + : accelerator.useExisting + ? 'Unknown' + : 'None'} - - Number of accelerators - {accelerator.count} - + {!accelerator.useExisting && ( + + Number of accelerators + {accelerator.count} + + )} ); }; diff --git a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx index f05b359335..54079330e2 100644 --- a/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx +++ b/frontend/src/pages/modelServing/screens/projects/ServingRuntimeModal/ServingRuntimeSizeSection.tsx @@ -107,7 +107,7 @@ const ServingRuntimeSizeSection: React.FC = ({ acceleratorState={acceleratorState} setAcceleratorState={setAcceleratorState} supportedAccelerators={supportedAccelerators} - supportedText="Compatible with serving runtime" + resourceDisplayName="serving runtime" /> )} diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx index 13580197f8..36096a998b 100644 --- a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -22,14 +22,14 @@ type AcceleratorSelectFieldProps = { acceleratorState: AcceleratorState; setAcceleratorState: UpdateObjectAtPropAndValue; supportedAccelerators?: string[]; - supportedText?: string; + resourceDisplayName?: string; }; const AcceleratorSelectField: React.FC = ({ acceleratorState, setAcceleratorState, supportedAccelerators, - supportedText, + resourceDisplayName = 'image', }) => { const [detectedAcceleratorInfo] = useAcceleratorCounts(); @@ -83,7 +83,7 @@ const AcceleratorSelectField: React.FC = ({ {isAcceleratorSupported(ac) && ( - + )} @@ -109,13 +109,12 @@ const AcceleratorSelectField: React.FC = ({ if (accelerator && supportedAccelerators !== undefined) { if (supportedAccelerators?.length === 0) { acceleratorAlertMessage = { - title: - "The image you have selected doesn't support the selected accelerator. It is recommended to use a compatible image for optimal performance.", + title: `The ${resourceDisplayName} you have selected doesn't support the selected accelerator. It is recommended to use a compatible ${resourceDisplayName} for optimal performance.`, variant: AlertVariant.info, }; } else if (!isAcceleratorSupported(accelerator)) { acceleratorAlertMessage = { - title: 'The image you have selected is not compatible with the selected accelerator', + title: `The ${resourceDisplayName} you have selected is not compatible with the selected accelerator`, variant: AlertVariant.warning, }; } @@ -139,7 +138,7 @@ const AcceleratorSelectField: React.FC = ({ } const onStep = (step: number) => { - setAcceleratorState('count', Math.max(acceleratorCount + step, 0)); + setAcceleratorState('count', Math.max(acceleratorCount + step, 1)); }; // if there is more than a none option, show the dropdown @@ -168,6 +167,7 @@ const AcceleratorSelectField: React.FC = ({ setAcceleratorState('count', 0); } else { // normal flow + setAcceleratorState('count', 1); setAcceleratorState('useExisting', false); setAcceleratorState( 'accelerator', @@ -198,13 +198,13 @@ const AcceleratorSelectField: React.FC = ({ name="number-of-accelerators" value={acceleratorCount} validated={acceleratorCountWarning ? 'warning' : 'default'} - min={0} + min={1} onPlus={() => onStep(1)} onMinus={() => onStep(-1)} onChange={(event) => { if (isHTMLInputElement(event.target)) { const newSize = Number(event.target.value); - setAcceleratorState('count', newSize); + setAcceleratorState('count', Math.max(newSize, 1)); } }} /> diff --git a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx index 1dfb12a76a..edd6ebc690 100644 --- a/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx +++ b/frontend/src/pages/notebookController/screens/server/NotebookServerDetails.tsx @@ -108,13 +108,19 @@ const NotebookServerDetails: React.FC = () => { Accelerator - {accelerator.accelerator?.spec.displayName || 'unknown'} + {accelerator.accelerator + ? accelerator.accelerator.spec.displayName + : accelerator.useExisting + ? 'Unknown' + : 'None'} - - Number of accelerators - {accelerator.count} - + {!accelerator.useExisting && ( + + Number of accelerators + {accelerator.count} + + )} ); diff --git a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx index 48be7f4342..a439c42d49 100644 --- a/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx +++ b/frontend/src/pages/notebookController/screens/server/SpawnerPage.tsx @@ -232,8 +232,12 @@ const SpawnerPage: React.FC = () => { const fireStartServerEvent = () => { fireTrackingEvent('Notebook Server Started', { - accelerator: accelerator.accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', - acceleratorCount: accelerator.count, + accelerator: accelerator.accelerator + ? `${accelerator.accelerator.spec.displayName} (${accelerator.accelerator.metadata.name}): ${accelerator.accelerator.spec.identifier}` + : accelerator.useExisting + ? 'Unknown' + : 'None', + acceleratorCount: accelerator.useExisting ? undefined : accelerator.count, lastSelectedSize: selectedSize.name, lastSelectedImage: `${selectedImageTag.image?.name}:${selectedImageTag.tag?.name}`, }); diff --git a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx index c950fdbbd9..912777edb6 100644 --- a/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx +++ b/frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx @@ -51,10 +51,12 @@ const NotebookStatusToggle: React.FC = ({ const fireNotebookTrackingEvent = React.useCallback( (action: 'started' | 'stopped') => { fireTrackingEvent(`Workbench ${action}`, { - acceleratorCount: acceleratorData.count, + acceleratorCount: acceleratorData.useExisting ? undefined : acceleratorData.count, accelerator: acceleratorData.accelerator - ? JSON.stringify(acceleratorData.accelerator) - : 'unknown', + ? `${acceleratorData.accelerator.spec.displayName} (${acceleratorData.accelerator.metadata.name}): ${acceleratorData.accelerator.spec.identifier}` + : acceleratorData.useExisting + ? 'Unknown' + : 'None', lastSelectedSize: size?.name || notebook.metadata.annotations?.['notebooks.opendatahub.io/last-size-selection'], diff --git a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx index af0b093e99..ab93e4d235 100644 --- a/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx +++ b/frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx @@ -80,8 +80,12 @@ const SpawnerFooter: React.FC = ({ const afterStart = (name: string, type: 'created' | 'updated') => { const { accelerator, notebookSize, image } = startNotebookData; fireTrackingEvent(`Workbench ${type}`, { - acceleratorCount: accelerator.count, - accelerator: accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown', + acceleratorCount: accelerator.useExisting ? undefined : accelerator.count, + accelerator: accelerator.accelerator + ? `${accelerator.accelerator.spec.displayName} (${accelerator.accelerator.metadata.name}): ${accelerator.accelerator.spec.identifier}` + : accelerator.useExisting + ? 'Unknown' + : 'None', lastSelectedSize: notebookSize.name, lastSelectedImage: image.imageVersion?.from ? `${image.imageVersion.from.name}` diff --git a/frontend/src/utilities/useAcceleratorState.ts b/frontend/src/utilities/useAcceleratorState.ts index 759a61691a..83a55cce42 100644 --- a/frontend/src/utilities/useAcceleratorState.ts +++ b/frontend/src/utilities/useAcceleratorState.ts @@ -57,19 +57,18 @@ const useAcceleratorState = ( // check if there is accelerator usage in the container // this is to handle the case where the accelerator is disabled, deleted, or empty const containerResourceAttributes = Object.values(ContainerResourceAttributes) as string[]; - const possibleAcceleratorsIdentifiers = Object.entries(resources.requests ?? {}).filter( - ([key]) => !containerResourceAttributes.includes(key), - ); - if (possibleAcceleratorsIdentifiers.length > 0) { + const possibleAcceleratorRequests = Object.entries(resources.requests ?? {}) + .filter(([key]) => !containerResourceAttributes.includes(key)) + .map(([key, value]) => ({ identifier: key, count: value })); + if (possibleAcceleratorRequests.length > 0) { // check if they are just using the nvidia.com/gpu // if so, lets migrate them over to using the migrated-gpu accelerator profile if it exists - const acceleratorRequest = possibleAcceleratorsIdentifiers.find( - (possibleAcceleratorIdentifiers) => - possibleAcceleratorIdentifiers[0] === 'nvidia.com/gpu', + const nvidiaAcceleratorRequests = possibleAcceleratorRequests.find( + (request) => request.identifier === 'nvidia.com/gpu', ); if ( - acceleratorRequest && + nvidiaAcceleratorRequests && tolerations?.some( (toleration) => toleration.key === 'nvidia.com/gpu' && @@ -84,7 +83,7 @@ const useAcceleratorState = ( if (migratedAccelerator) { setData('accelerator', migratedAccelerator); setData('initialAccelerator', migratedAccelerator); - setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0)); + setData('count', Number(nvidiaAcceleratorRequests.count ?? 0)); if (!migratedAccelerator.spec.enabled) { setData('additionalOptions', { useDisabled: accelerator }); } @@ -113,7 +112,7 @@ const useAcceleratorState = ( setData('accelerator', fakeAccelerator); setData('accelerators', [fakeAccelerator, ...accelerators]); setData('initialAccelerator', fakeAccelerator); - setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0)); + setData('count', Number(nvidiaAcceleratorRequests.count ?? 0)); } } else { // fallback to using the existing accelerator From e529890c99c860d7e9b46def8e54ee8a721d7f0b Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Tue, 26 Sep 2023 09:30:36 -0500 Subject: [PATCH 20/22] fix detection logic --- backend/src/routes/api/accelerators/acceleratorUtils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/routes/api/accelerators/acceleratorUtils.ts b/backend/src/routes/api/accelerators/acceleratorUtils.ts index b99850a0ab..d80b12b42a 100644 --- a/backend/src/routes/api/accelerators/acceleratorUtils.ts +++ b/backend/src/routes/api/accelerators/acceleratorUtils.ts @@ -49,7 +49,7 @@ export const getAcceleratorNumbers = async ( // if any accelerators are available, the cluster is configured const configured = - info.configured ?? Object.values(info.available).some((value) => value > 0); + info.configured || Object.values(info.available).some((value) => value > 0); return { total: info.total, From 548d78192abf3680da75318f880db026de038c87 Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 28 Sep 2023 08:21:29 -0500 Subject: [PATCH 21/22] update cluster role --- manifests/base/cluster-role.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml index d21d47cd04..c418434a59 100644 --- a/manifests/base/cluster-role.yaml +++ b/manifests/base/cluster-role.yaml @@ -3,6 +3,14 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: odh-dashboard rules: + - verbs: + - create + - get + - list + apiGroups: + - dashboard.opendatahub.io + resources: + - acceleratorprofiles - verbs: - get - list From ab90061339893d0fb85239306d78cd7a793945fe Mon Sep 17 00:00:00 2001 From: Gage Krumbach Date: Thu, 28 Sep 2023 10:57:48 -0500 Subject: [PATCH 22/22] move from cluster role to role --- manifests/base/cluster-role.yaml | 8 -------- manifests/base/role.yaml | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml index c418434a59..d21d47cd04 100644 --- a/manifests/base/cluster-role.yaml +++ b/manifests/base/cluster-role.yaml @@ -3,14 +3,6 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: odh-dashboard rules: - - verbs: - - create - - get - - list - apiGroups: - - dashboard.opendatahub.io - resources: - - acceleratorprofiles - verbs: - get - list diff --git a/manifests/base/role.yaml b/manifests/base/role.yaml index bac744a29e..5a885ab041 100644 --- a/manifests/base/role.yaml +++ b/manifests/base/role.yaml @@ -3,6 +3,14 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: odh-dashboard rules: + - verbs: + - create + - get + - list + apiGroups: + - dashboard.opendatahub.io + resources: + - acceleratorprofiles - apiGroups: - route.openshift.io resources: