diff --git a/backend/src/types.ts b/backend/src/types.ts index fecdc53fff..1674dccf37 100644 --- a/backend/src/types.ts +++ b/backend/src/types.ts @@ -247,6 +247,7 @@ export type KubeDecorator = KubeStatus & { customObjectsApi: k8s.CustomObjectsApi; rbac: k8s.RbacAuthorizationV1Api; currentToken: string; + }; export type KubeFastifyInstance = FastifyInstance & { @@ -806,12 +807,17 @@ export type NotebookData = { notebookSizeName: string; imageName: string; imageTagName: string; - gpus: number; + accelerator: AcceleratorState; envVars: EnvVarReducedTypeKeyValues; state: NotebookState; username?: string; }; +export type AcceleratorState = { + accelerator?: AcceleratorKind; + count: number; +}; + export const LIMIT_NOTEBOOK_IMAGE_GPU = 'nvidia.com/gpu'; type DisplayNameAnnotations = Partial<{ @@ -863,18 +869,20 @@ export type SupportedModelFormats = { autoSelect?: boolean; }; -export type GPUCount = string | number; + +export enum ContainerResourceAttributes { + CPU = 'cpu', + MEMORY = 'memory', +} export type ContainerResources = { requests?: { cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; limits?: { cpu?: string; memory?: string; - 'nvidia.com/gpu'?: GPUCount; }; }; @@ -917,4 +925,11 @@ export type AcceleratorKind = K8sResourceCommon & { description?: string; tolerations?: NotebookToleration[]; }; -}; \ No newline at end of file +}; + +export enum KnownLabels { + DASHBOARD_RESOURCE = 'opendatahub.io/dashboard', + PROJECT_SHARING = 'opendatahub.io/project-sharing', + MODEL_SERVING_PROJECT = 'modelmesh-enabled', + DATA_CONNECTION_AWS = 'opendatahub.io/managed', +} \ No newline at end of file diff --git a/backend/src/utils/constants.ts b/backend/src/utils/constants.ts index 2ac758bde0..b69bae89d2 100644 --- a/backend/src/utils/constants.ts +++ b/backend/src/utils/constants.ts @@ -1,6 +1,6 @@ import * as path from 'path'; import './dotenv'; -import { DashboardConfig, NotebookSize } from '../types'; +import { DashboardConfig, KnownLabels, NotebookSize } from '../types'; export const PORT = Number(process.env.PORT) || Number(process.env.BACKEND_PORT) || 8080; export const IP = process.env.IP || '0.0.0.0'; @@ -133,3 +133,5 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [ export const imageUrlRegex = /^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/; + + export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`; diff --git a/backend/src/utils/notebookUtils.ts b/backend/src/utils/notebookUtils.ts index 32512311bd..f87bceddf9 100644 --- a/backend/src/utils/notebookUtils.ts +++ b/backend/src/utils/notebookUtils.ts @@ -1,10 +1,10 @@ import { getDashboardConfig } from './resourceUtils'; import { + ContainerResourceAttributes, EnvironmentVariable, ImageInfo, ImageTag, KubeFastifyInstance, - LIMIT_NOTEBOOK_IMAGE_GPU, Notebook, NotebookAffinity, NotebookData, @@ -156,7 +156,7 @@ export const assembleNotebook = async ( envName: string, tolerationSettings: NotebookTolerationSettings, ): Promise => { - const { notebookSizeName, imageName, imageTagName, gpus, envVars } = data; + const { notebookSizeName, imageName, imageTagName, accelerator, envVars } = data; const notebookSize = getNotebookSize(notebookSizeName); @@ -186,39 +186,34 @@ export const assembleNotebook = async ( const tolerations: NotebookToleration[] = []; let affinity: NotebookAffinity = {}; - if (gpus > 0) { + if (accelerator.count > 0 && accelerator.accelerator) { if (!resources.limits) { resources.limits = {}; } if (!resources.requests) { resources.requests = {}; } - resources.limits[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus; - resources.requests[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus; - tolerations.push({ - effect: 'NoSchedule', - key: LIMIT_NOTEBOOK_IMAGE_GPU, - operator: 'Exists', - }); + resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count; + resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { - affinity = { - nodeAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - { - preference: { - matchExpressions: [ - { - key: 'nvidia.com/gpu.present', - operator: 'NotIn', - values: ['true'], - }, - ], - }, - weight: 1, - }, - ], - }, - }; + // step type down to string to avoid type errors + const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes); + + Object.keys(resources.limits || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.limits?.[key]; + } + }); + + Object.keys(resources.requests || {}).forEach((key) => { + if (!containerResourceKeys.includes(key)) { + delete resources.requests?.[key]; + } + }); + } + + if (accelerator.accelerator?.spec.tolerations) { + tolerations.push(...accelerator.accelerator.spec.tolerations); } if (tolerationSettings?.enabled) { @@ -266,6 +261,7 @@ export const assembleNotebook = async ( 'notebooks.opendatahub.io/last-image-selection': imageSelection, 'opendatahub.io/username': username, 'kubeflow-resource-stopped': null, + 'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '', }, name: name, namespace: namespace, diff --git a/backend/src/utils/resourceUtils.ts b/backend/src/utils/resourceUtils.ts index b5d83ac450..40101f1e82 100644 --- a/backend/src/utils/resourceUtils.ts +++ b/backend/src/utils/resourceUtils.ts @@ -33,6 +33,7 @@ import { } from './componentUtils'; import { createCustomError } from './requestUtils'; import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils'; +import { getNotebooks } from './notebookUtils'; const dashboardConfigMapName = 'odh-dashboard-config'; const consoleLinksGroup = 'console.openshift.io'; @@ -678,7 +679,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => }; try { - await await fastify.kube.customObjectsApi.createNamespacedCustomObject( + await fastify.kube.customObjectsApi.createNamespacedCustomObject( 'dashboard.opendatahub.io', 'v1alpha', fastify.kube.namespace, @@ -688,7 +689,34 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise => } catch (e) { // If bad detection — exit early and dont create config throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString() - } + } + + // update already running notebooks to use the new profile + const notebooks = await getNotebooks(fastify, fastify.kube.namespace) + notebooks.items.forEach(async (notebook) => { + const gpuCount = notebook.spec.template.spec.containers[0].resources?.limits?.['nvidia.com/gpu'] + if (gpuCount) { + notebook.metadata.annotations = { + ...notebook.metadata.annotations, + 'opendatahub.io/recommended-accelerators' : 'migrated-gpu' + } + await fastify.kube.customObjectsApi.patchNamespacedCustomObject( + 'kubeflow.org', + 'v1', + fastify.kube.namespace, + 'notebooks', + notebook.metadata.name, + notebook, + undefined, + undefined, + undefined, + { + headers: { 'Content-type': PatchUtils.PATCH_FORMAT_JSON_MERGE_PATCH }, + }, + ) + } + } + ) }; } diff --git a/frontend/src/api/k8s/utils.ts b/frontend/src/api/k8s/utils.ts index 883df66e5c..0b9498749f 100644 --- a/frontend/src/api/k8s/utils.ts +++ b/frontend/src/api/k8s/utils.ts @@ -31,7 +31,7 @@ export const assemblePodSpecOptions = ( resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count; } else { // step type down to string to avoid type errors - const containerResourceKeys: string[] = Object.keys(ContainerResourceAttributes); + const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes); Object.keys(resources.limits || {}).forEach((key) => { if (!containerResourceKeys.includes(key)) { diff --git a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx index dcb0d45f62..003dc0412c 100644 --- a/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx +++ b/frontend/src/pages/notebookController/screens/server/AcceleratorSelectField.tsx @@ -48,7 +48,7 @@ const AcceleratorSelectField: React.FC = ({ )?.[1]; if (detectedAcceleratorCount === undefined) { - return `No accelerator detected with the identifier ${accelerator?.spec.identifier} detected.`; + return `No accelerator detected with the identifier ${accelerator?.spec.identifier}.`; } else if (newSize > detectedAcceleratorCount) { return `Only ${detectedAcceleratorCount} accelerator${ detectedAcceleratorCount > 1 ? 's' : '' @@ -61,9 +61,7 @@ const AcceleratorSelectField: React.FC = ({ ); React.useEffect(() => { - if (acceleratorCount > 0) { - setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); - } + setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount)); }, [acceleratorCount, validateAcceleratorCount]); const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState( diff --git a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts index 1fcef275b0..57f3bbc048 100644 --- a/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts +++ b/frontend/src/pages/projects/screens/spawner/spawnerUtils.ts @@ -1,6 +1,6 @@ import * as React from 'react'; import compareVersions from 'compare-versions'; -import { BYONImage, K8sResourceCommon, NotebookSize, Volume, VolumeMount } from '~/types'; +import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types'; import { BuildKind, ImageStreamKind, @@ -414,7 +414,7 @@ export const isInvalidBYONImageStream = (imageStream: ImageStreamKind) => { ); }; -export const convertBYONImageToK8sResource = (image: BYONImage): K8sResourceCommon => ({ +export const convertBYONImageToK8sResource = (image: BYONImage) => ({ kind: 'ImageStream', apiVersion: 'image.openshift.io/v1', metadata: {