Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug fixes #1645

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions backend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ export type KubeDecorator = KubeStatus & {
customObjectsApi: k8s.CustomObjectsApi;
rbac: k8s.RbacAuthorizationV1Api;
currentToken: string;

};

export type KubeFastifyInstance = FastifyInstance & {
Expand Down Expand Up @@ -806,12 +807,17 @@ export type NotebookData = {
notebookSizeName: string;
imageName: string;
imageTagName: string;
gpus: number;
accelerator: AcceleratorState;
envVars: EnvVarReducedTypeKeyValues;
state: NotebookState;
username?: string;
};

/**
 * Accelerator selection attached to a notebook request (see the `accelerator`
 * field of `NotebookData`).
 */
export type AcceleratorState = {
// Selected accelerator profile. Optional: when it is absent no accelerator
// resources are requested, regardless of `count`.
accelerator?: AcceleratorKind;
// Number of accelerator units to request; written to both the resource limits
// and requests under the accelerator's `spec.identifier` when greater than 0.
count: number;
};

/** Container resource key under which NVIDIA GPU limits/requests are stored. */
export const LIMIT_NOTEBOOK_IMAGE_GPU = 'nvidia.com/gpu';

type DisplayNameAnnotations = Partial<{
Expand Down Expand Up @@ -863,18 +869,20 @@ export type SupportedModelFormats = {
autoSelect?: boolean;
};

/**
 * Value type of the `'nvidia.com/gpu'` entries in `ContainerResources`;
 * the count may be given either as a string or as a number.
 */
export type GPUCount = string | number;

/**
 * Standard (non-accelerator) container resource keys.
 *
 * The enum VALUES ('cpu', 'memory') are the strings that appear as keys in a
 * container's `requests`/`limits`; the member names (CPU, MEMORY) are not.
 * Code that filters resource keys must therefore compare against
 * `Object.values(ContainerResourceAttributes)`, not `Object.keys(...)`.
 */
export enum ContainerResourceAttributes {
CPU = 'cpu',
MEMORY = 'memory',
}

/**
 * Resource requests and limits of a container. Both sections are optional and
 * share the same shape: string quantities for cpu/memory plus an optional
 * NVIDIA GPU count.
 */
export type ContainerResources = {
// Resources the container asks for.
requests?: {
cpu?: string;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
// Upper bounds for the container.
limits?: {
cpu?: string;
memory?: string;
'nvidia.com/gpu'?: GPUCount;
};
};

Expand Down Expand Up @@ -917,4 +925,11 @@ export type AcceleratorKind = K8sResourceCommon & {
description?: string;
tolerations?: NotebookToleration[];
};
};
};

/**
 * Well-known label keys used on resources handled by the dashboard.
 * The enum values are the literal label keys; selectors are built by appending
 * `=<value>` (e.g. `${KnownLabels.DASHBOARD_RESOURCE}=true`).
 */
export enum KnownLabels {
DASHBOARD_RESOURCE = 'opendatahub.io/dashboard',
PROJECT_SHARING = 'opendatahub.io/project-sharing',
MODEL_SERVING_PROJECT = 'modelmesh-enabled',
// NOTE(review): the key is the generic 'opendatahub.io/managed' although the
// member name refers to AWS data connections — confirm this is intended.
DATA_CONNECTION_AWS = 'opendatahub.io/managed',
}
4 changes: 3 additions & 1 deletion backend/src/utils/constants.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import * as path from 'path';
import './dotenv';
import { DashboardConfig, NotebookSize } from '../types';
import { DashboardConfig, KnownLabels, NotebookSize } from '../types';

export const PORT = Number(process.env.PORT) || Number(process.env.BACKEND_PORT) || 8080;
export const IP = process.env.IP || '0.0.0.0';
Expand Down Expand Up @@ -133,3 +133,5 @@ export const DEFAULT_NOTEBOOK_SIZES: NotebookSize[] = [

/**
 * Splits a container image reference into its parts. The pattern is anchored
 * at the start only (no trailing `$`), so it matches a leading prefix.
 *
 * Capture groups:
 *  1. host part (word chars, dots, dashes) including the optional port, or
 *     empty; only captured when it is followed by at least two `/segment`
 *     path components
 *  2. the optional `:port` suffix of group 1 (may be empty)
 *  3. repository path: one or two lowercase segments separated by `/`
 *  4. optional tag after `:`, 1 to 127 characters
 */
export const imageUrlRegex =
/^([\w.\-_]+((?::\d+|)(?=\/[a-z0-9._-]+\/[a-z0-9._-]+))|)(?:\/|)([a-z0-9.\-_]+(?:\/[a-z0-9.\-_]+|))(?::([\w.\-_]{1,127})|)/;

/** Label selector string of the form `<dashboard resource label key>=true`. */
export const LABEL_SELECTOR_DASHBOARD_RESOURCE = `${KnownLabels.DASHBOARD_RESOURCE}=true`;
52 changes: 24 additions & 28 deletions backend/src/utils/notebookUtils.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import { getDashboardConfig } from './resourceUtils';
import {
ContainerResourceAttributes,
EnvironmentVariable,
ImageInfo,
ImageTag,
KubeFastifyInstance,
LIMIT_NOTEBOOK_IMAGE_GPU,
Notebook,
NotebookAffinity,
NotebookData,
Expand Down Expand Up @@ -156,7 +156,7 @@ export const assembleNotebook = async (
envName: string,
tolerationSettings: NotebookTolerationSettings,
): Promise<Notebook> => {
const { notebookSizeName, imageName, imageTagName, gpus, envVars } = data;
const { notebookSizeName, imageName, imageTagName, accelerator, envVars } = data;

const notebookSize = getNotebookSize(notebookSizeName);

Expand Down Expand Up @@ -186,39 +186,34 @@ export const assembleNotebook = async (
const tolerations: NotebookToleration[] = [];

let affinity: NotebookAffinity = {};
if (gpus > 0) {
if (accelerator.count > 0 && accelerator.accelerator) {
if (!resources.limits) {
resources.limits = {};
}
if (!resources.requests) {
resources.requests = {};
}
resources.limits[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
resources.requests[LIMIT_NOTEBOOK_IMAGE_GPU] = gpus;
tolerations.push({
effect: 'NoSchedule',
key: LIMIT_NOTEBOOK_IMAGE_GPU,
operator: 'Exists',
});
resources.limits[accelerator.accelerator.spec.identifier] = accelerator.count;
resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count;
} else {
affinity = {
nodeAffinity: {
preferredDuringSchedulingIgnoredDuringExecution: [
{
preference: {
matchExpressions: [
{
key: 'nvidia.com/gpu.present',
operator: 'NotIn',
values: ['true'],
},
],
},
weight: 1,
},
],
},
};
// step type down to string to avoid type errors
const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes);

Object.keys(resources.limits || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.limits?.[key];
}
});

Object.keys(resources.requests || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
delete resources.requests?.[key];
}
});
}

if (accelerator.accelerator?.spec.tolerations) {
tolerations.push(...accelerator.accelerator.spec.tolerations);
}

if (tolerationSettings?.enabled) {
Expand Down Expand Up @@ -266,6 +261,7 @@ export const assembleNotebook = async (
'notebooks.opendatahub.io/last-image-selection': imageSelection,
'opendatahub.io/username': username,
'kubeflow-resource-stopped': null,
'opendatahub.io/accelerator-name': accelerator.accelerator?.metadata.name || '',
},
name: name,
namespace: namespace,
Expand Down
32 changes: 30 additions & 2 deletions backend/src/utils/resourceUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
} from './componentUtils';
import { createCustomError } from './requestUtils';
import { getAcceleratorNumbers } from '../routes/api/accelerators/acceleratorUtils';
import { getNotebooks } from './notebookUtils';

const dashboardConfigMapName = 'odh-dashboard-config';
const consoleLinksGroup = 'console.openshift.io';
Expand Down Expand Up @@ -678,7 +679,7 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
};

try {
await await fastify.kube.customObjectsApi.createNamespacedCustomObject(
await fastify.kube.customObjectsApi.createNamespacedCustomObject(
Gkrumbach07 marked this conversation as resolved.
Show resolved Hide resolved
'dashboard.opendatahub.io',
'v1alpha',
fastify.kube.namespace,
Expand All @@ -688,7 +689,34 @@ export const cleanupGPU = async (fastify: KubeFastifyInstance): Promise<void> =>
} catch (e) {
// If bad detection — exit early and don't create config
throw 'Unable to add migrated-gpu accelerator profile: ' + e.toString()
}
}

// Update already running notebooks to use the new profile.
//
// The patches are collected with `map` and awaited through `Promise.all`.
// `forEach(async ...)` discards the promises returned by its callback, so the
// enclosing function would return before any patch finished and a failing
// patch would surface as an unhandled promise rejection instead of rejecting
// this function.
const notebooks = await getNotebooks(fastify, fastify.kube.namespace);
await Promise.all(
  notebooks.items.map(async (notebook) => {
    // Only the first container is inspected; `?.` guards against a notebook
    // whose container list is empty.
    const gpuCount =
      notebook.spec.template.spec.containers[0]?.resources?.limits?.['nvidia.com/gpu'];
    if (!gpuCount) {
      // No NVIDIA GPU limit set on this notebook: nothing to migrate.
      return;
    }

    // Keep the existing annotations and add the migrated profile marker.
    notebook.metadata.annotations = {
      ...notebook.metadata.annotations,
      'opendatahub.io/recommended-accelerators': 'migrated-gpu',
    };

    await fastify.kube.customObjectsApi.patchNamespacedCustomObject(
      'kubeflow.org',
      'v1',
      fastify.kube.namespace,
      'notebooks',
      notebook.metadata.name,
      notebook,
      undefined,
      undefined,
      undefined,
      {
        // Send the whole notebook object as a JSON merge patch.
        headers: { 'Content-type': PatchUtils.PATCH_FORMAT_JSON_MERGE_PATCH },
      },
    );
  }),
);
};
}

Expand Down
2 changes: 1 addition & 1 deletion frontend/src/api/k8s/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ export const assemblePodSpecOptions = (
resources.requests[accelerator.accelerator.spec.identifier] = accelerator.count;
} else {
// step type down to string to avoid type errors
const containerResourceKeys: string[] = Object.keys(ContainerResourceAttributes);
const containerResourceKeys: string[] = Object.values(ContainerResourceAttributes);

Object.keys(resources.limits || {}).forEach((key) => {
if (!containerResourceKeys.includes(key)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
)?.[1];

if (detectedAcceleratorCount === undefined) {
return `No accelerator detected with the identifier ${accelerator?.spec.identifier} detected.`;
return `No accelerator detected with the identifier ${accelerator?.spec.identifier}.`;
} else if (newSize > detectedAcceleratorCount) {
return `Only ${detectedAcceleratorCount} accelerator${
detectedAcceleratorCount > 1 ? 's' : ''
Expand All @@ -61,9 +61,7 @@ const AcceleratorSelectField: React.FC<AcceleratorSelectFieldProps> = ({
);

React.useEffect(() => {
if (acceleratorCount > 0) {
setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount));
}
setAcceleratorCountWarning(validateAcceleratorCount(acceleratorCount));
}, [acceleratorCount, validateAcceleratorCount]);

const [acceleratorCountWarning, setAcceleratorCountWarning] = React.useState(
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/pages/projects/screens/spawner/spawnerUtils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import * as React from 'react';
import compareVersions from 'compare-versions';
import { BYONImage, K8sResourceCommon, NotebookSize, Volume, VolumeMount } from '~/types';
import { BYONImage, NotebookSize, Volume, VolumeMount } from '~/types';
import {
BuildKind,
ImageStreamKind,
Expand Down Expand Up @@ -414,7 +414,7 @@ export const isInvalidBYONImageStream = (imageStream: ImageStreamKind) => {
);
};

export const convertBYONImageToK8sResource = (image: BYONImage): K8sResourceCommon => ({
export const convertBYONImageToK8sResource = (image: BYONImage) => ({
kind: 'ImageStream',
apiVersion: 'image.openshift.io/v1',
metadata: {
Expand Down
Loading