Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor accelerator fixes #1764

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/src/plugins/kube.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export default fp(async (fastify: FastifyInstance) => {
cleanupGPU(fastify).catch((e) =>
fastify.log.error(
`Unable to fully convert GPU to use accelerator profiles. ${
e.response?.body?.message || e.message
e.response?.body?.message || e.message || e
}`,
),
);
Expand Down
12 changes: 7 additions & 5 deletions backend/src/routes/api/accelerators/acceleratorUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,23 @@ export const getAcceleratorNumbers = async (

// update the max count for each accelerator
Object.entries(allocatable).forEach(
([key, value]) => (info.available[key] = Math.max(info.available[key] || 0, value)),
([key, value]) => (info.available[key] = Math.max(info.available[key] ?? 0, value)),
);

// update the total count for each accelerator
Object.entries(capacity).forEach(
([key, value]) => (info.total[key] = (info.total[key] || 0) + value),
([key, value]) => (info.total[key] = (info.total[key] ?? 0) + value),
);

// update the allocated count for each accelerator
Object.entries(capacity).forEach(
([key, value]) =>
(info.allocated[key] = (info.allocated[key] || 0) + value - (allocatable[key] || 0)),
(info.allocated[key] = (info.allocated[key] ?? 0) + value - (allocatable[key] ?? 0)),
);

// if any accelerators are available, the cluster is configured
const configured =
info.configured || Object.values(info.available).some((value) => value > 0);
info.configured ?? Object.values(info.available).some((value) => value > 0);

return {
total: info.total,
Expand All @@ -63,7 +63,9 @@ export const getAcceleratorNumbers = async (
)
.catch((e) => {
fastify.log.error(
`Exception when listing cluster nodes: ${e.response?.body?.message || e.message || e}`,
`A ${e.statusCode} error occurred when listing cluster nodes: ${
e.response?.body?.message || e.statusMessage
}`,
);
return { configured: false, available: {}, total: {}, allocated: {} };
});
25 changes: 18 additions & 7 deletions backend/src/utils/resourceUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -645,12 +645,17 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
.readNamespacedConfigMap(CONFIG_MAP_NAME, fastify.kube.namespace)
.then(() => {
// Found configmap, not continuing
fastify.log.info(`GPU migration already completed, skipping`);
return false;
})
.catch((e) => {
if (e.statusCode === 404) {
// No config saying we have already migrated gpus, continue
return true;
} else {
throw `fetching gpu migration configmap had a ${e.statusCode} error: ${
e.response?.body?.message || e?.response?.statusMessage
}`;
}
});

Expand All @@ -664,8 +669,11 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
'acceleratorprofiles',
)
.catch((e) => {
// If 404 shows up — CRD may not be installed, exit early
throw { message: 'Unable to fetch accelerator profiles: ' + e.toString() };
console.log(e);
// If error shows up — CRD may not be installed, exit early
throw `A ${e.statusCode} error occurred when trying to fetch accelerator profiles: ${
e.response?.body?.message || e?.response?.statusMessage
}`;
});

const acceleratorProfiles = (
Expand Down Expand Up @@ -715,7 +723,11 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
);
} catch (e) {
// If bad detection — exit early and dont create config
throw { message: 'Unable to add migrated-gpu accelerator profile: ' + e.toString() };
throw `A ${
e.statusCode
} error occurred when trying to add migrated-gpu accelerator profile: ${
e.response?.body?.message || e?.response?.statusMessage
}`;
}
}
}
Expand All @@ -735,10 +747,9 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
.createNamespacedConfigMap(fastify.kube.namespace, configMap)
.then(() => fastify.log.info('Successfully migrated GPUs to accelerator profiles'))
.catch((e) => {
throw createCustomError(
'Unable to create gpu migration configmap',
e.response?.body?.message || e.message,
);
throw `A ${e.statusCode} error occurred when trying to create gpu migration configmap: ${
e.response?.body?.message || e?.response?.statusMessage
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example you are using or here

}`;
});
}
};
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/components/SimpleDropdownSelect.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ const SimpleDropdownSelect: React.FC<SimpleDropdownProps> = ({
setOpen(false);
}}
>
{isPlaceholder ? <i>{label}</i> : label}
{label}
</DropdownItem>
))}
/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,19 @@ const ServingRuntimeDetails: React.FC<ServingRuntimeDetailsProps> = ({ obj }) =>
<DescriptionListGroup>
<DescriptionListTerm>Accelerator</DescriptionListTerm>
<DescriptionListDescription>
{accelerator.accelerator?.spec.displayName || 'unknown'}
{accelerator.accelerator
? accelerator.accelerator.spec.displayName
: accelerator.useExisting
? 'Unknown'
: 'None'}
</DescriptionListDescription>
</DescriptionListGroup>
<DescriptionListGroup>
<DescriptionListTerm>Number of accelerators</DescriptionListTerm>
<DescriptionListDescription>{accelerator.count}</DescriptionListDescription>
</DescriptionListGroup>
{!accelerator.useExisting && (
<DescriptionListGroup>
<DescriptionListTerm>Number of accelerators</DescriptionListTerm>
<DescriptionListDescription>{accelerator.count}</DescriptionListDescription>
</DescriptionListGroup>
)}
</DescriptionList>
);
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ const ServingRuntimeSizeSection: React.FC<ServingRuntimeSizeSectionProps> = ({
acceleratorState={acceleratorState}
setAcceleratorState={setAcceleratorState}
supportedAccelerators={supportedAccelerators}
supportedText="Compatible with serving runtime"
resourceDisplayName="serving runtime"
/>
</FormGroup>
)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ type AcceleratorSelectFieldProps = {
acceleratorState: AcceleratorState;
setAcceleratorState: UpdateObjectAtPropAndValue<AcceleratorState>;
supportedAccelerators?: string[];
supportedText?: string;
resourceDisplayName?: string;
};

const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
acceleratorState,
setAcceleratorState,
supportedAccelerators,
supportedText,
resourceDisplayName = 'image',
}) => {
const [detectedAcceleratorInfo] = useAcceleratorCounts();

Expand Down Expand Up @@ -83,7 +83,7 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
<SplitItem isFilled />
<SplitItem>
{isAcceleratorSupported(ac) && (
<Label color="blue">{supportedText ?? 'Compatible with image'}</Label>
<Label color="blue">{`Compatible with ${resourceDisplayName}`}</Label>
)}
</SplitItem>
</Split>
Expand All @@ -109,13 +109,12 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
if (accelerator && supportedAccelerators !== undefined) {
if (supportedAccelerators?.length === 0) {
acceleratorAlertMessage = {
title:
"The image you have selected doesn't support the selected accelerator. It is recommended to use a compatible image for optimal performance.",
title: `The ${resourceDisplayName} you have selected doesn't support the selected accelerator. It is recommended to use a compatible ${resourceDisplayName} for optimal performance.`,
variant: AlertVariant.info,
};
} else if (!isAcceleratorSupported(accelerator)) {
acceleratorAlertMessage = {
title: 'The image you have selected is not compatible with the selected accelerator',
title: `The ${resourceDisplayName} you have selected is not compatible with the selected accelerator`,
variant: AlertVariant.warning,
};
}
Expand All @@ -139,7 +138,7 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
}

const onStep = (step: number) => {
setAcceleratorState('count', Math.max(acceleratorCount + step, 0));
setAcceleratorState('count', Math.max(acceleratorCount + step, 1));
};

// if there is more than a none option, show the dropdown
Expand Down Expand Up @@ -168,6 +167,7 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
setAcceleratorState('count', 0);
} else {
// normal flow
setAcceleratorState('count', 1);
setAcceleratorState('useExisting', false);
setAcceleratorState(
'accelerator',
Expand Down Expand Up @@ -198,13 +198,13 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
name="number-of-accelerators"
value={acceleratorCount}
validated={acceleratorCountWarning ? 'warning' : 'default'}
min={0}
min={1}
onPlus={() => onStep(1)}
onMinus={() => onStep(-1)}
onChange={(event) => {
if (isHTMLInputElement(event.target)) {
const newSize = Number(event.target.value);
setAcceleratorState('count', newSize);
setAcceleratorState('count', Math.max(newSize, 1));
}
}}
/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,19 @@ const NotebookServerDetails: React.FC = () => {
<DescriptionListGroup>
<DescriptionListTerm>Accelerator</DescriptionListTerm>
<DescriptionListDescription>
{accelerator.accelerator?.spec.displayName || 'unknown'}
{accelerator.accelerator
? accelerator.accelerator.spec.displayName
: accelerator.useExisting
? 'Unknown'
: 'None'}
</DescriptionListDescription>
</DescriptionListGroup>
<DescriptionListGroup>
<DescriptionListTerm>Number of accelerators</DescriptionListTerm>
<DescriptionListDescription>{accelerator.count}</DescriptionListDescription>
</DescriptionListGroup>
{!accelerator.useExisting && (
<DescriptionListGroup>
<DescriptionListTerm>Number of accelerators</DescriptionListTerm>
<DescriptionListDescription>{accelerator.count}</DescriptionListDescription>
</DescriptionListGroup>
)}
</DescriptionList>
</ExpandableSection>
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,12 @@ const SpawnerPage: React.FC = () => {

const fireStartServerEvent = () => {
fireTrackingEvent('Notebook Server Started', {
accelerator: accelerator.accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown',
acceleratorCount: accelerator.count,
accelerator: accelerator.accelerator
? `${accelerator.accelerator.spec.displayName} (${accelerator.accelerator.metadata.name}): ${accelerator.accelerator.spec.identifier}`
: accelerator.useExisting
? 'Unknown'
: 'None',
acceleratorCount: accelerator.useExisting ? undefined : accelerator.count,
lastSelectedSize: selectedSize.name,
lastSelectedImage: `${selectedImageTag.image?.name}:${selectedImageTag.tag?.name}`,
});
Expand Down
8 changes: 5 additions & 3 deletions frontend/src/pages/projects/notebook/NotebookStatusToggle.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ const NotebookStatusToggle: React.FC<NotebookStatusToggleProps> = ({
const fireNotebookTrackingEvent = React.useCallback(
(action: 'started' | 'stopped') => {
fireTrackingEvent(`Workbench ${action}`, {
acceleratorCount: acceleratorData.count,
acceleratorCount: acceleratorData.useExisting ? undefined : acceleratorData.count,
accelerator: acceleratorData.accelerator
? JSON.stringify(acceleratorData.accelerator)
: 'unknown',
? `${acceleratorData.accelerator.spec.displayName} (${acceleratorData.accelerator.metadata.name}): ${acceleratorData.accelerator.spec.identifier}`
: acceleratorData.useExisting
? 'Unknown'
: 'None',
lastSelectedSize:
size?.name ||
notebook.metadata.annotations?.['notebooks.opendatahub.io/last-size-selection'],
Expand Down
8 changes: 6 additions & 2 deletions frontend/src/pages/projects/screens/spawner/SpawnerFooter.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,12 @@ const SpawnerFooter: React.FC<SpawnerFooterProps> = ({
const afterStart = (name: string, type: 'created' | 'updated') => {
const { accelerator, notebookSize, image } = startNotebookData;
fireTrackingEvent(`Workbench ${type}`, {
acceleratorCount: accelerator.count,
accelerator: accelerator ? JSON.stringify(accelerator.accelerator) : 'unknown',
acceleratorCount: accelerator.useExisting ? undefined : accelerator.count,
accelerator: accelerator.accelerator
? `${accelerator.accelerator.spec.displayName} (${accelerator.accelerator.metadata.name}): ${accelerator.accelerator.spec.identifier}`
: accelerator.useExisting
? 'Unknown'
: 'None',
lastSelectedSize: notebookSize.name,
lastSelectedImage: image.imageVersion?.from
? `${image.imageVersion.from.name}`
Expand Down
19 changes: 9 additions & 10 deletions frontend/src/utilities/useAcceleratorState.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,18 @@ const useAcceleratorState = (
// check if there is accelerator usage in the container
// this is to handle the case where the accelerator is disabled, deleted, or empty
const containerResourceAttributes = Object.values(ContainerResourceAttributes) as string[];
const possibleAcceleratorsIdentifiers = Object.entries(resources.requests ?? {}).filter(
([key]) => !containerResourceAttributes.includes(key),
);
if (possibleAcceleratorsIdentifiers.length > 0) {
const possibleAcceleratorRequests = Object.entries(resources.requests ?? {})
.filter(([key]) => !containerResourceAttributes.includes(key))
.map(([key, value]) => ({ identifier: key, count: value }));
if (possibleAcceleratorRequests.length > 0) {
// check if they are just using the nvidia.com/gpu
// if so, lets migrate them over to using the migrated-gpu accelerator profile if it exists
const acceleratorRequest = possibleAcceleratorsIdentifiers.find(
(possibleAcceleratorIdentifiers) =>
possibleAcceleratorIdentifiers[0] === 'nvidia.com/gpu',
const nvidiaAcceleratorRequests = possibleAcceleratorRequests.find(
(request) => request.identifier === 'nvidia.com/gpu',
);

if (
acceleratorRequest &&
nvidiaAcceleratorRequests &&
tolerations?.some(
(toleration) =>
toleration.key === 'nvidia.com/gpu' &&
Expand All @@ -84,7 +83,7 @@ const useAcceleratorState = (
if (migratedAccelerator) {
setData('accelerator', migratedAccelerator);
setData('initialAccelerator', migratedAccelerator);
setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0));
setData('count', Number(nvidiaAcceleratorRequests.count ?? 0));
if (!migratedAccelerator.spec.enabled) {
setData('additionalOptions', { useDisabled: accelerator });
}
Expand Down Expand Up @@ -113,7 +112,7 @@ const useAcceleratorState = (
setData('accelerator', fakeAccelerator);
setData('accelerators', [fakeAccelerator, ...accelerators]);
setData('initialAccelerator', fakeAccelerator);
setData('count', Number(possibleAcceleratorsIdentifiers[0][1] ?? 0));
setData('count', Number(nvidiaAcceleratorRequests.count ?? 0));
}
} else {
// fallback to using the existing accelerator
Expand Down