
Habana to main #1884

Merged
merged 38 commits from f/accelerator-support into main
Sep 29, 2023
38 commits
ff64ff2
Added accelerator CRD (#1451)
Gkrumbach07 Jul 3, 2023
a6c7798
add copy to clipboard to k8 name popover
Gkrumbach07 Jul 20, 2023
4b2f50b
Merge pull request #1561 from Gkrumbach07/copy-tooltip
openshift-merge-robot Jul 26, 2023
50839ac
added gpu migration
Gkrumbach07 Jul 27, 2023
84c2231
added accelerator detection
Gkrumbach07 Jul 28, 2023
391cbca
added accelerator detection line
Gkrumbach07 Aug 1, 2023
26da289
fix error state in migration
Gkrumbach07 Aug 1, 2023
ab07f22
added more resource types
Gkrumbach07 Aug 1, 2023
9387956
added accelerator UI user flow
Gkrumbach07 Jun 30, 2023
871ca4e
Squashed commit of the following:
Gkrumbach07 Aug 7, 2023
34a2f1c
sqush
Gkrumbach07 Aug 7, 2023
3694f9e
Squashed commit of the following:
Gkrumbach07 Aug 7, 2023
7f0b159
Merge branch 'accelerator-support' into accelerator-cr
Gkrumbach07 Aug 8, 2023
5509fa9
Merge pull request #1555 from Gkrumbach07/accelerator-cr
openshift-merge-robot Aug 8, 2023
1a1da24
Merge pull request #1618 from Gkrumbach07/migration
openshift-merge-robot Aug 8, 2023
dba676e
Merge pull request #1628 from Gkrumbach07/accelerator-detection
openshift-merge-robot Aug 8, 2023
e5717c3
bug fixes
Gkrumbach07 Aug 8, 2023
607fe26
update wording
Gkrumbach07 Aug 10, 2023
55d3089
Merge pull request #1645 from Gkrumbach07/accelerator-support
openshift-merge-robot Aug 10, 2023
fc89a4e
Merge branch 'main' into accelerator-support
lucferbux Aug 10, 2023
c8f2767
fix lint errors
Gkrumbach07 Aug 10, 2023
e555aaf
Merge pull request #1668 from Gkrumbach07/accelerator-support
openshift-merge-robot Aug 10, 2023
095c09c
Added support for existing settings
Gkrumbach07 Aug 14, 2023
2e152ee
Merge pull request #1677 from Gkrumbach07/accelerator-support
openshift-merge-robot Sep 5, 2023
a4bb172
add rbac accelerator role
Gkrumbach07 Sep 5, 2023
2a300bc
revert add rbac accelerator role
Gkrumbach07 Sep 5, 2023
84ca02e
add rbac accelerator role
Gkrumbach07 Sep 5, 2023
1afaee2
Merge pull request #1753 from Gkrumbach07/revert-commit
openshift-merge-robot Sep 5, 2023
1fe5489
Merge pull request #1754 from Gkrumbach07/add-roles
openshift-merge-robot Sep 6, 2023
a92ab5b
accelerator minor bug fixes
Gkrumbach07 Sep 6, 2023
dcd23ec
Merge pull request #1764 from Gkrumbach07/minor-fixes
openshift-merge-robot Sep 8, 2023
e529890
fix detection logic
Gkrumbach07 Sep 26, 2023
49bfc75
Merge pull request #1865 from Gkrumbach07/fix-detection
openshift-merge-robot Sep 26, 2023
548d781
update cluster role
Gkrumbach07 Sep 28, 2023
f00119e
Merge pull request #1877 from Gkrumbach07/update-service-role
openshift-merge-robot Sep 28, 2023
ab90061
move from cluster role to role
Gkrumbach07 Sep 28, 2023
1216427
Merge pull request #1879 from Gkrumbach07/update-service-role
openshift-merge-robot Sep 28, 2023
de01574
Merge branch 'main' into f/accelerator-support
andrewballantyne Sep 29, 2023
10 changes: 4 additions & 6 deletions backend/src/plugins/kube.ts
@@ -4,7 +4,7 @@ import { FastifyInstance } from 'fastify';
import * as jsYaml from 'js-yaml';
import * as k8s from '@kubernetes/client-node';
import { DEV_MODE } from '../utils/constants';
import { cleanupDSPSuffix, initializeWatchedResources } from '../utils/resourceUtils';
import { cleanupGPU, initializeWatchedResources } from '../utils/resourceUtils';
import { User } from '@kubernetes/client-node/dist/config_types';

const CONSOLE_CONFIG_YAML_FIELD = 'console-config.yaml';
@@ -85,12 +85,10 @@ export default fp(async (fastify: FastifyInstance) => {
// Initialize the watching of resources
initializeWatchedResources(fastify);

// TODO: Delete this code in the future once we have no customers using RHODS 1.19 / ODH 2.4.0
// Cleanup for display name suffix of [DSP]
cleanupDSPSuffix(fastify).catch((e) =>
cleanupGPU(fastify).catch((e) =>
fastify.log.error(
`Unable to fully cleanup project display name suffixes - Some projects may not appear in the dashboard UI. ${
e.response?.body?.message || e.message
`Unable to fully convert GPU to use accelerator profiles. ${
e.response?.body?.message || e.message || e
}`,
),
);
71 changes: 71 additions & 0 deletions backend/src/routes/api/accelerators/acceleratorUtils.ts
@@ -0,0 +1,71 @@
import { AcceleratorInfo, KubeFastifyInstance } from '../../../types';

const RESOURCE_TYPES = [
  'cpu',
  'memory',
  'pods',
  'ephemeral-storage',
  'hugepages-1Gi',
  'hugepages-2Mi',
  'attachable-volumes-aws-ebs',
];

const getIdentifiersFromResources = (resources: { [key: string]: string } = {}) => {
  return Object.entries(resources)
    .filter(([key]) => !RESOURCE_TYPES.includes(key))
    .reduce<{ [key: string]: number }>((identifiers, [key, value]) => {
      identifiers[key] = isNaN(parseInt(value)) ? 0 : parseInt(value);
      return identifiers;
    }, {});
};

export const getAcceleratorNumbers = async (
  fastify: KubeFastifyInstance,
): Promise<AcceleratorInfo> =>
  fastify.kube.coreV1Api
    .listNode()
    .then((res) =>
      res.body.items.reduce<AcceleratorInfo>(
        (info, node) => {
          // reduce resources down to just the accelerators and their counts
          const allocatable = getIdentifiersFromResources(node.status.allocatable);
          const capacity = getIdentifiersFromResources(node.status.capacity);

          // update the max count for each accelerator
          Object.entries(allocatable).forEach(
            ([key, value]) => (info.available[key] = Math.max(info.available[key] ?? 0, value)),
          );

          // update the total count for each accelerator
          Object.entries(capacity).forEach(
            ([key, value]) => (info.total[key] = (info.total[key] ?? 0) + value),
          );

          // update the allocated count for each accelerator
          Object.entries(capacity).forEach(
            ([key, value]) =>
              (info.allocated[key] = (info.allocated[key] ?? 0) + value - (allocatable[key] ?? 0)),
          );

          // if any accelerators are available, the cluster is configured
          const configured =
            info.configured || Object.values(info.available).some((value) => value > 0);

          return {
            total: info.total,
            available: info.available,
            allocated: info.allocated,
            configured,
          };
        },
        { configured: false, available: {}, total: {}, allocated: {} },
      ),
    )
    .catch((e) => {
      fastify.log.error(
        `A ${e.statusCode} error occurred when listing cluster nodes: ${
          e.response?.body?.message || e.statusMessage
        }`,
      );
      return { configured: false, available: {}, total: {}, allocated: {} };
    });
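For reference, a small standalone sketch of what `getIdentifiersFromResources` yields for a hypothetical node: well-known resource names are filtered out, and whatever remains is treated as an accelerator identifier with a parsed count. The node values below are invented for illustration.

```ts
// Invented node.status.allocatable values for illustration only.
const allocatable: { [key: string]: string } = {
  cpu: '16',
  memory: '64Gi',
  pods: '250',
  'nvidia.com/gpu': '2',
  'habana.ai/gaudi': '8',
};

// Same filter/reduce shape as getIdentifiersFromResources above (list shortened).
const RESOURCE_TYPES = ['cpu', 'memory', 'pods', 'ephemeral-storage'];

const identifiers = Object.entries(allocatable)
  .filter(([key]) => !RESOURCE_TYPES.includes(key))
  .reduce<{ [key: string]: number }>((acc, [key, value]) => {
    acc[key] = isNaN(parseInt(value)) ? 0 : parseInt(value);
    return acc;
  }, {});

// identifiers -> { 'nvidia.com/gpu': 2, 'habana.ai/gaudi': 8 }
console.log(identifiers);
```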
11 changes: 11 additions & 0 deletions backend/src/routes/api/accelerators/index.ts
@@ -0,0 +1,11 @@
import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types';
import { getAcceleratorNumbers } from './acceleratorUtils';
import { logRequestDetails } from '../../../utils/fileUtils';

export default async (fastify: KubeFastifyInstance): Promise<void> => {
  fastify.get('/', async (request: OauthFastifyRequest) => {
    logRequestDetails(fastify, request);

    return getAcceleratorNumbers(fastify);
  });
};
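A hedged example of consuming this route from client code, assuming it is mounted at `/api/accelerators` (the route registration path is not shown in this diff); the response shape follows the `AcceleratorInfo` type added to backend/src/types.ts later in this PR.

```ts
type AcceleratorInfo = {
  configured: boolean;
  available: { [key: string]: number };
  total: { [key: string]: number };
  allocated: { [key: string]: number };
};

// Hypothetical mount point; the actual registration is not part of this diff.
const fetchAcceleratorCounts = async (): Promise<AcceleratorInfo> => {
  const response = await fetch('/api/accelerators');
  return response.json();
};

fetchAcceleratorCounts().then((info) => {
  if (info.configured) {
    // e.g. { 'habana.ai/gaudi': 8 } -> at least one node exposes this accelerator
    console.log('accelerators detected:', Object.keys(info.available));
  }
});
```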
7 changes: 7 additions & 0 deletions backend/src/routes/api/gpu/gpuUtils.ts
@@ -16,6 +16,9 @@ const storage: { lastFetch: number; lastValue: GPUInfo } = {
lastFetch: 0,
};

/**
* @deprecated - use getAcceleratorNumbers instead
*/
export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
if (storage.lastFetch >= Date.now() - 30_000) {
fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`);
@@ -67,11 +70,15 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
available: maxGpuNumber,
autoscalers: scalingLimit,
};

storage.lastFetch = Date.now();
storage.lastValue = data;
return data;
};

/**
* @deprecated
*/
export const getGPUData = async (
fastify: KubeFastifyInstance,
podIP: string,
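The deprecated GPU path above reuses its last result for 30 seconds (`storage.lastFetch` / `storage.lastValue`) before querying the cluster again. A minimal generic sketch of that caching pattern, not the actual implementation:

```ts
type Cached<T> = { lastFetch: number; lastValue?: T };

// Wrap an async fetcher so repeat calls within `ttlMs` reuse the previous result.
const withTtlCache = <T>(ttlMs: number, fetcher: () => Promise<T>) => {
  const store: Cached<T> = { lastFetch: 0 };
  return async (): Promise<T> => {
    if (store.lastValue !== undefined && store.lastFetch >= Date.now() - ttlMs) {
      return store.lastValue;
    }
    const value = await fetcher();
    store.lastFetch = Date.now();
    store.lastValue = value;
    return value;
  };
};

// Usage with an invented fetcher name:
// const getCounts = withTtlCache(30_000, () => countGpusAcrossNodes());
```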
3 changes: 3 additions & 0 deletions backend/src/routes/api/gpu/index.ts
@@ -2,6 +2,9 @@ import { KubeFastifyInstance, OauthFastifyRequest } from '../../../types';
import { getGPUNumber } from './gpuUtils';
import { logRequestDetails } from '../../../utils/fileUtils';

/**
* @deprecated - use accelerators instead
*/
export default async (fastify: KubeFastifyInstance): Promise<void> => {
fastify.get('/', async (request: OauthFastifyRequest) => {
logRequestDetails(fastify, request);
51 changes: 45 additions & 6 deletions backend/src/types.ts
@@ -254,6 +254,7 @@ export type KubeDecorator = KubeStatus & {
customObjectsApi: k8s.CustomObjectsApi;
rbac: k8s.RbacAuthorizationV1Api;
currentToken: string;

};

export type KubeFastifyInstance = FastifyInstance & {
@@ -755,6 +756,14 @@ export type GPUInfo = {
  available: number;
  autoscalers: gpuScale[];
};

export type AcceleratorInfo = {
  configured: boolean;
  available: { [key: string]: number };
  total: { [key: string]: number };
  allocated: { [key: string]: number };
};

export type EnvironmentVariable = EitherNotBoth<
{ value: string | number },
{ valueFrom: Record<string, unknown> }
@@ -805,12 +814,17 @@ export type NotebookData = {
notebookSizeName: string;
imageName: string;
imageTagName: string;
gpus: number;
accelerator: AcceleratorState;
envVars: EnvVarReducedTypeKeyValues;
state: NotebookState;
username?: string;
};

export type AcceleratorState = {
  accelerator?: AcceleratorKind;
  count: number;
};

export const LIMIT_NOTEBOOK_IMAGE_GPU = 'nvidia.com/gpu';

type DisplayNameAnnotations = Partial<{
@@ -868,19 +882,21 @@ export type SupportedModelFormats = {
autoSelect?: boolean;
};

export type GPUCount = string | number;

export enum ContainerResourceAttributes {
  CPU = 'cpu',
  MEMORY = 'memory',
}

export type ContainerResources = {
requests?: {
cpu?: string | number;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
} & Record<string, unknown>;
limits?: {
cpu?: string | number;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
} & Record<string, unknown>;
};

export type ServingRuntime = K8sResourceCommon & {
@@ -908,3 +924,26 @@ export type ServingRuntime = K8sResourceCommon & {
volumes?: Volume[];
};
};

export type AcceleratorKind = K8sResourceCommon & {
  metadata: {
    name: string;
    annotations?: Partial<{
      'opendatahub.io/modified-date': string;
    }>;
  };
  spec: {
    displayName: string;
    enabled: boolean;
    identifier: string;
    description?: string;
    tolerations?: NotebookToleration[];
  };
};

export enum KnownLabels {
  DASHBOARD_RESOURCE = 'opendatahub.io/dashboard',
  PROJECT_SHARING = 'opendatahub.io/project-sharing',
  MODEL_SERVING_PROJECT = 'modelmesh-enabled',
  DATA_CONNECTION_AWS = 'opendatahub.io/managed',
}
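To make the new types concrete, here is a hypothetical `AcceleratorKind` resource for a Habana Gaudi device and the `AcceleratorState` a notebook request might carry. The apiVersion/kind values and every field value are invented for illustration; only the shape follows the types above.

```ts
import { AcceleratorKind, AcceleratorState } from './types'; // import path assumed

const habanaProfile: AcceleratorKind = {
  apiVersion: 'dashboard.opendatahub.io/v1', // assumed group/version, not defined in this diff
  kind: 'AcceleratorProfile', // assumed kind name, not defined in this diff
  metadata: {
    name: 'habana-gaudi',
    annotations: {
      'opendatahub.io/modified-date': new Date().toISOString(),
    },
  },
  spec: {
    displayName: 'Habana Gaudi',
    enabled: true,
    identifier: 'habana.ai/gaudi',
    description: 'Habana Gaudi HPU nodes',
    tolerations: [{ key: 'habana.ai/gaudi', operator: 'Exists', effect: 'NoSchedule' }],
  },
};

// What a notebook creation request carries instead of the old `gpus: number` field.
const acceleratorState: AcceleratorState = {
  accelerator: habanaProfile,
  count: 1,
};
```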
4 changes: 3 additions & 1 deletion backend/src/utils/constants.ts
@@ -1,6 +1,6 @@
import * as path from 'path';
import './dotenv';
import { DashboardConfig, NotebookSize } from '../types';
import { DashboardConfig, KnownLabels, NotebookSize } from '../types';

export const PORT = Number(process.env.PORT) || Number(process.env.BACKEND_PORT) || 8080;
export const IP = process.env.IP || '0.0.0.0';
@@ -134,3 +134,5 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [

export const imageUrlRegex =
/^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/;

export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`;
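`LABEL_SELECTOR_DASHBOARD_RESOURCE` expands to `opendatahub.io/dashboard=true`. A sketch of passing such a selector when listing custom resources with @kubernetes/client-node; the group, version, and plural are assumptions, and the positional parameter order follows the 0.x client, so check the client version actually in use.

```ts
import { KubeFastifyInstance } from '../types';
import { LABEL_SELECTOR_DASHBOARD_RESOURCE } from './constants';

// List only resources that carry the dashboard label (group/version/plural are invented here).
const listDashboardAccelerators = (fastify: KubeFastifyInstance, namespace: string) =>
  fastify.kube.customObjectsApi.listNamespacedCustomObject(
    'dashboard.opendatahub.io', // assumed group
    'v1', // assumed version
    namespace,
    'acceleratorprofiles', // assumed plural
    undefined, // pretty
    undefined, // allowWatchBookmarks
    undefined, // continue
    undefined, // fieldSelector
    LABEL_SELECTOR_DASHBOARD_RESOURCE,
  );
```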
54 changes: 25 additions & 29 deletions backend/src/utils/notebookUtils.ts
@@ -1,10 +1,10 @@
import { getDashboardConfig } from './resourceUtils';
import {
ContainerResourceAttributes,
EnvironmentVariable,
ImageInfo,
ImageTag,
KubeFastifyInstance,
LIMIT_NOTEBOOK_IMAGE_GPU,
Notebook,
NotebookAffinity,
NotebookData,
@@ -156,7 +156,7 @@ export const assembleNotebook = async (
envName: string,
tolerationSettings: NotebookTolerationSettings,
): Promise<Notebook> => {
const { notebookSizeName, imageName, imageTagName, gpus, envVars } = data;
const { notebookSizeName, imageName, imageTagName, accelerator, envVars } = data;

const notebookSize = getNotebookSize(notebookSizeName);

@@ -191,40 +191,35 @@ export const assembleNotebook = async (
const resources: NotebookResources = { ...notebookSize.resources };
const tolerations: NotebookToleration[] = [];

let affinity: NotebookAffinity = {};
if (gpus > 0) {
const affinity: NotebookAffinity = {};
if (accelerator.count > 0 && accelerator.accelerator) {
if (!resources.limits) {
resources.limits = {};
}
if (!resources.requests) {
resources.requests = {};
}
resources.limits[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
resources.requests[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
tolerations.push({
effect: 'NoSchedule',
key: LIMIT_NOTEBOOK_IMAGE_GPU,
operator: 'Exists',
});
resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count;
resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count;
} else {
affinity = {
nodeAffinity: {
preferredDuringSchedulingIgnoredDuringExecution: [
{
preference: {
matchExpressions: [
{
key: 'nvidia.com/gpu.present',
operator: 'NotIn',
values: ['true'],
},
],
},
weight: 1,
},
],
},
};
// step type down to string to avoid type errors
const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes);

Object.keys(resources.limits || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.limits?.[key];
}
});

Object.keys(resources.requests || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.requests?.[key];
}
});
}

if (accelerator.accelerator?.spec.tolerations) {
tolerations.push(...accelerator.accelerator.spec.tolerations);
}

if (tolerationSettings?.enabled) {
@@ -272,6 +267,7 @@ export const assembleNotebook = async (
'notebooks.opendatahub.io/last-image-selection': imageSelection,
'opendatahub.io/username': username,
'kubeflow-resource-stopped': null,
'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '',
},
name: name,
namespace: namespace,
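In summary, when an accelerator profile is selected the notebook container now requests the profile's resource identifier (for example `nvidia.com/gpu` or `habana.ai/gaudi`) instead of the hard-coded GPU key, inherits the profile's tolerations, and records the profile name in the `opendatahub.io/accelerator-name` annotation; when no profile is selected, any non-CPU/memory resource keys are stripped. A condensed sketch of that branch with simplified types, not the actual helper:

```ts
type ResourceMap = { [key: string]: string | number };
type Resources = { requests?: ResourceMap; limits?: ResourceMap };
type Toleration = { key: string; operator: string; effect: string };
type Profile = { identifier: string; tolerations?: Toleration[] };

// Simplified mirror of the accelerator branch in assembleNotebook above.
const applyAccelerator = (
  resources: Resources,
  tolerations: Toleration[],
  profile: Profile | undefined,
  count: number,
): void => {
  if (profile && count > 0) {
    resources.limits = { ...resources.limits, [profile.identifier]: count };
    resources.requests = { ...resources.requests, [profile.identifier]: count };
  } else {
    // No profile selected: drop anything that is not cpu/memory so stale
    // accelerator requests are not carried over from a previous notebook spec.
    const keep = ['cpu', 'memory'];
    [resources.limits, resources.requests].forEach((section) => {
      Object.keys(section ?? {}).forEach((key) => {
        if (section && !keep.includes(key)) {
          delete section[key];
        }
      });
    });
  }
  if (profile?.tolerations) {
    tolerations.push(...profile.tolerations);
  }
};
```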