From 742286f6486c6d7157aaf850380d2f1118233676 Mon Sep 17 00:00:00 2001 From: Mike Turley Date: Fri, 26 Apr 2024 17:17:23 -0400 Subject: [PATCH] [RHOAIENG-6519] DW: Support workloads owned by either a RayCluster or a Job when querying CPU and Memory usage Signed-off-by: Mike Turley --- .../mockDWUsageByOwnerPrometheusResponse.ts | 16 ++ .../src/__mocks__/mockWorkloadK8sResource.ts | 16 +- .../GlobalDistributedWorkloads.cy.ts | 11 +- .../__tests__/distributedWorkloads.spec.ts | 189 ++++++++++-------- .../api/prometheus/distributedWorkloads.ts | 77 ++++--- .../__tests__/utils.spec.ts | 28 ++- .../concepts/distributedWorkloads/utils.tsx | 25 ++- frontend/src/k8sTypes.ts | 5 + 8 files changed, 240 insertions(+), 127 deletions(-) create mode 100644 frontend/src/__mocks__/mockDWUsageByOwnerPrometheusResponse.ts diff --git a/frontend/src/__mocks__/mockDWUsageByOwnerPrometheusResponse.ts b/frontend/src/__mocks__/mockDWUsageByOwnerPrometheusResponse.ts new file mode 100644 index 0000000000..8782b642c9 --- /dev/null +++ b/frontend/src/__mocks__/mockDWUsageByOwnerPrometheusResponse.ts @@ -0,0 +1,16 @@ +import { WorkloadMetricIndexedByOwner, WorkloadMetricPromQueryResponse } from '~/api'; +import { WorkloadOwnerType } from '~/k8sTypes'; +import { mockPrometheusQueryVectorResponse } from './mockPrometheusQueryVectorResponse'; + +export const mockDWUsageByOwnerPrometheusResponse = ( + usageByOwner: WorkloadMetricIndexedByOwner, +): WorkloadMetricPromQueryResponse => + mockPrometheusQueryVectorResponse({ + result: Object.values(WorkloadOwnerType).flatMap((ownerKind) => + Object.keys(usageByOwner[ownerKind]).map((ownerName) => ({ + // eslint-disable-next-line camelcase + metric: { owner_kind: ownerKind, owner_name: ownerName }, + value: [0, String(usageByOwner[ownerKind][ownerName])], + })), + ), + }); diff --git a/frontend/src/__mocks__/mockWorkloadK8sResource.ts b/frontend/src/__mocks__/mockWorkloadK8sResource.ts index c43d1f2964..b6fe6bb57e 100644 --- a/frontend/src/__mocks__/mockWorkloadK8sResource.ts +++ b/frontend/src/__mocks__/mockWorkloadK8sResource.ts @@ -1,6 +1,6 @@ import { genUID } from '~/__mocks__/mockUtils'; import { WorkloadStatusType } from '~/concepts/distributedWorkloads/utils'; -import { WorkloadCondition, WorkloadKind, WorkloadPodSet } from '~/k8sTypes'; +import { WorkloadCondition, WorkloadKind, WorkloadOwnerType, WorkloadPodSet } from '~/k8sTypes'; const mockWorkloadStatusConditions: Record = { Pending: [ @@ -127,14 +127,16 @@ const mockWorkloadStatusConditions: Record ({ @@ -150,16 +152,16 @@ export const mockWorkloadK8sResource = ({ namespace, resourceVersion: '9279356', uid: genUID('workload'), - ...(ownerJobName + ...(ownerName ? { ownerReferences: [ { apiVersion: 'batch/v1', blockOwnerDeletion: true, controller: true, - kind: 'Job', - name: ownerJobName, - uid: genUID('job'), + kind: ownerKind, + name: ownerName, + uid: genUID(ownerKind.toLowerCase()), }, ], } diff --git a/frontend/src/__tests__/cypress/cypress/e2e/distributedWorkloads/GlobalDistributedWorkloads.cy.ts b/frontend/src/__tests__/cypress/cypress/e2e/distributedWorkloads/GlobalDistributedWorkloads.cy.ts index 74b90229bd..8b2b368df5 100644 --- a/frontend/src/__tests__/cypress/cypress/e2e/distributedWorkloads/GlobalDistributedWorkloads.cy.ts +++ b/frontend/src/__tests__/cypress/cypress/e2e/distributedWorkloads/GlobalDistributedWorkloads.cy.ts @@ -6,7 +6,7 @@ import { mockK8sResourceList } from '~/__mocks__/mockK8sResourceList'; import { mockProjectK8sResource } from '~/__mocks__/mockProjectK8sResource'; import { mockPrometheusQueryVectorResponse } from '~/__mocks__/mockPrometheusQueryVectorResponse'; import { mockWorkloadK8sResource } from '~/__mocks__/mockWorkloadK8sResource'; -import { ClusterQueueKind, LocalQueueKind, WorkloadKind } from '~/k8sTypes'; +import { ClusterQueueKind, LocalQueueKind, WorkloadKind, WorkloadOwnerType } from '~/k8sTypes'; import { WorkloadStatusType } from '~/concepts/distributedWorkloads/utils'; import { mockClusterQueueK8sResource } from '~/__mocks__/mockClusterQueueK8sResource'; import { mockLocalQueueK8sResource } from '~/__mocks__/mockLocalQueueK8sResource'; @@ -35,9 +35,16 @@ const initIntercepts = ({ mockLocalQueueK8sResource({ name: 'test-local-queue', namespace: 'test-project' }), ], workloads = [ - mockWorkloadK8sResource({ k8sName: 'test-workload', mockStatus: WorkloadStatusType.Succeeded }), + mockWorkloadK8sResource({ + k8sName: 'test-workload', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-workload-job', + mockStatus: WorkloadStatusType.Succeeded, + }), mockWorkloadK8sResource({ k8sName: 'test-workload-2', + ownerKind: WorkloadOwnerType.RayCluster, + ownerName: 'test-workload-2-rc', mockStatus: WorkloadStatusType.Succeeded, }), ], diff --git a/frontend/src/api/prometheus/__tests__/distributedWorkloads.spec.ts b/frontend/src/api/prometheus/__tests__/distributedWorkloads.spec.ts index 7003f12bc0..e3058b59df 100644 --- a/frontend/src/api/prometheus/__tests__/distributedWorkloads.spec.ts +++ b/frontend/src/api/prometheus/__tests__/distributedWorkloads.spec.ts @@ -2,68 +2,70 @@ import { act } from '@testing-library/react'; import axios from 'axios'; import { mockPrometheusQueryVectorResponse } from '~/__mocks__/mockPrometheusQueryVectorResponse'; import { mockWorkloadK8sResource } from '~/__mocks__/mockWorkloadK8sResource'; -import { WorkloadKind } from '~/k8sTypes'; -import { getWorkloadOwnerJobName } from '~/concepts/distributedWorkloads/utils'; +import { WorkloadKind, WorkloadOwnerType } from '~/k8sTypes'; +import { getWorkloadOwner } from '~/concepts/distributedWorkloads/utils'; import { testHook } from '~/__tests__/unit/testUtils/hooks'; import { POLL_INTERVAL } from '~/utilities/const'; import { DWProjectCurrentMetrics, + EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER, TopWorkloadsByUsage, WorkloadCurrentUsage, + WorkloadMetricIndexedByOwner, WorkloadMetricPromQueryResponse, getTopResourceConsumingWorkloads, getTotalUsage, - indexNumericValuesByJobName, + indexWorkloadMetricByOwner, useDWProjectCurrentMetrics, } from '~/api/prometheus/distributedWorkloads'; const mockCpuUsageResults: WorkloadMetricPromQueryResponse['data']['result'] = [ { metric: { - workload: 'test-job-1', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-1', // eslint-disable-line camelcase }, value: [1711495542.368, '0.00000150000000000'], }, { metric: { - workload: 'test-job-2', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-2', // eslint-disable-line camelcase }, value: [1711495542.368, '0.00000163333333333'], }, { metric: { - workload: 'test-job-3', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-3', // eslint-disable-line camelcase }, value: [1711495542.368, '0.0120015'], }, { metric: { - workload: 'test-job-4', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-4', // eslint-disable-line camelcase }, value: [1711495542.368, '0.04300163333333333'], }, { metric: { - workload: 'test-job-5', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-1', // eslint-disable-line camelcase }, value: [1711495542.368, '0.01100163333333333'], }, { metric: { - workload: 'test-job-6', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-2', // eslint-disable-line camelcase }, value: [1711495542.368, '0.01300163333333333'], }, { metric: { - workload: 'test-job-7', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-3', // eslint-disable-line camelcase }, value: [1711495542.368, '0.01500163333333333'], }, @@ -72,50 +74,50 @@ const mockCpuUsageResults: WorkloadMetricPromQueryResponse['data']['result'] = [ const mockMemoryUsageResults: WorkloadMetricPromQueryResponse['data']['result'] = [ { metric: { - workload: 'test-job-1', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-1', // eslint-disable-line camelcase }, value: [1711495542.37, '8237056'], }, { metric: { - workload: 'test-job-2', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-2', // eslint-disable-line camelcase }, value: [1711495542.37, '8249344'], }, { metric: { - workload: 'test-job-3', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-3', // eslint-disable-line camelcase }, value: [1711495542.37, '9349344'], }, { metric: { - workload: 'test-job-4', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.Job, // eslint-disable-line camelcase + owner_name: 'test-job-4', // eslint-disable-line camelcase }, value: [1711495542.37, '82493440'], }, { metric: { - workload: 'test-job-5', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-1', // eslint-disable-line camelcase }, value: [1711495542.37, '42493440'], }, { metric: { - workload: 'test-job-6', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-2', // eslint-disable-line camelcase }, value: [1711495542.37, '8237036'], }, { metric: { - workload: 'test-job-7', - workload_type: 'job', // eslint-disable-line camelcase + owner_kind: WorkloadOwnerType.RayCluster, // eslint-disable-line camelcase + owner_name: 'test-rc-3', // eslint-disable-line camelcase }, value: [1711495542.37, '10337050'], }, @@ -125,67 +127,90 @@ const mockWorkloads = [ mockWorkloadK8sResource({ k8sName: 'test-job-1-wl', namespace: 'test-project', - ownerJobName: 'test-job-1', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-job-1', }), mockWorkloadK8sResource({ k8sName: 'test-job-2-wl', namespace: 'test-project', - ownerJobName: 'test-job-2', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-job-2', }), mockWorkloadK8sResource({ k8sName: 'test-job-3-wl', namespace: 'test-project', - ownerJobName: 'test-job-3', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-job-3', }), mockWorkloadK8sResource({ k8sName: 'test-job-4-wl', namespace: 'test-project', - ownerJobName: 'test-job-4', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-job-4', }), mockWorkloadK8sResource({ - k8sName: 'test-job-5-wl', + k8sName: 'test-rc-1-wl', namespace: 'test-project', - ownerJobName: 'test-job-5', + ownerKind: WorkloadOwnerType.RayCluster, + ownerName: 'test-rc-1', }), mockWorkloadK8sResource({ - k8sName: 'test-job-6-wl', + k8sName: 'test-rc-2-wl', namespace: 'test-project', - ownerJobName: 'test-job-6', + ownerKind: WorkloadOwnerType.RayCluster, + ownerName: 'test-rc-2', }), mockWorkloadK8sResource({ - k8sName: 'test-job-7-wl', + k8sName: 'test-rc-3-wl', namespace: 'test-project', - ownerJobName: 'test-job-7', + ownerKind: WorkloadOwnerType.RayCluster, + ownerName: 'test-rc-3', }), ]; const mockGetWorkloadCurrentUsage = (workload: WorkloadKind): WorkloadCurrentUsage => { - const jobName = getWorkloadOwnerJobName(workload); + const owner = getWorkloadOwner(workload); return { - cpuCoresUsed: jobName - ? Number(mockCpuUsageResults.find(({ metric }) => metric.workload === jobName)?.value[1]) + cpuCoresUsed: owner + ? Number( + mockCpuUsageResults.find( + ( + { metric: { owner_kind, owner_name } }, // eslint-disable-line camelcase + ) => owner_kind === owner.kind && owner_name === owner.name, // eslint-disable-line camelcase + )?.value[1], + ) : undefined, - memoryBytesUsed: jobName - ? Number(mockMemoryUsageResults.find(({ metric }) => metric.workload === jobName)?.value[1]) + memoryBytesUsed: owner + ? Number( + mockMemoryUsageResults.find( + ( + { metric: { owner_kind, owner_name } }, // eslint-disable-line camelcase + ) => owner_kind === owner.kind && owner_name === owner.name, // eslint-disable-line camelcase + )?.value[1], + ) : undefined, }; }; -describe('indexNumericValuesByJobName', () => { +describe('indexWorkloadMetricByOwner', () => { it('converts Prometheus response data to an indexed structure', () => { const promResponse = mockPrometheusQueryVectorResponse({ result: mockCpuUsageResults, }); - const indexedValues: Record = { - 'test-job-1': 0.0000015, - 'test-job-2': 0.00000163333333333, - 'test-job-3': 0.0120015, - 'test-job-4': 0.04300163333333333, - 'test-job-5': 0.01100163333333333, - 'test-job-6': 0.01300163333333333, - 'test-job-7': 0.01500163333333333, + const indexedValues: WorkloadMetricIndexedByOwner = { + [WorkloadOwnerType.Job]: { + 'test-job-1': 0.0000015, + 'test-job-2': 0.00000163333333333, + 'test-job-3': 0.0120015, + 'test-job-4': 0.04300163333333333, + }, + [WorkloadOwnerType.RayCluster]: { + 'test-rc-1': 0.01100163333333333, + 'test-rc-2': 0.01300163333333333, + 'test-rc-3': 0.01500163333333333, + }, }; - expect(indexNumericValuesByJobName(promResponse)).toEqual(indexedValues); + expect(indexWorkloadMetricByOwner(promResponse)).toEqual(indexedValues); }); }); @@ -308,14 +333,14 @@ describe('useDWProjectCurrentMetrics', () => { ); expect(renderResult).hookToStrictEqual({ data: { - cpuCoresUsedByJobName: { - data: {}, + cpuCoresUsedByWorkloadOwner: { + data: EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER, error: undefined, loaded: false, refresh: expect.any(Function), }, - memoryBytesUsedByJobName: { - data: {}, + memoryBytesUsedByWorkloadOwner: { + data: EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER, error: undefined, loaded: false, refresh: expect.any(Function), @@ -333,11 +358,11 @@ describe('useDWProjectCurrentMetrics', () => { expect(mockAxios).toHaveBeenCalledTimes(2); expect(mockAxios).toHaveBeenCalledWith('/api/prometheus/query', { query: - 'namespace=test-project&query=sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="", namespace="test-project"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="test-project", workload_type="job"}) by (workload, workload_type)', + 'namespace=test-project&query=sum by(owner_name, owner_kind) (kube_pod_owner{owner_kind=~"RayCluster|Job", namespace="test-project"} * on (namespace, pod) group_right(owner_name, owner_kind) node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate)', }); expect(mockAxios).toHaveBeenCalledWith('/api/prometheus/query', { query: - 'namespace=test-project&query=sum(container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", cluster="", namespace="test-project", container!="", image!=""} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="test-project", workload_type="job"}) by (workload, workload_type)', + 'namespace=test-project&query=sum by(owner_name, owner_kind) (kube_pod_owner{owner_kind=~"RayCluster|Job", namespace="test-project"} * on (namespace, pod) group_right(owner_name, owner_kind) node_namespace_pod_container:container_memory_working_set_bytes)', }); expect(renderResult).hookToHaveUpdateCount(1); @@ -345,29 +370,37 @@ describe('useDWProjectCurrentMetrics', () => { await renderResult.waitForNextUpdate(); const expectedResult: DWProjectCurrentMetrics = { data: { - cpuCoresUsedByJobName: { + cpuCoresUsedByWorkloadOwner: { data: { - 'test-job-1': 0.0000015, - 'test-job-2': 0.00000163333333333, - 'test-job-3': 0.0120015, - 'test-job-4': 0.04300163333333333, - 'test-job-5': 0.01100163333333333, - 'test-job-6': 0.01300163333333333, - 'test-job-7': 0.01500163333333333, + [WorkloadOwnerType.Job]: { + 'test-job-1': 0.0000015, + 'test-job-2': 0.00000163333333333, + 'test-job-3': 0.0120015, + 'test-job-4': 0.04300163333333333, + }, + [WorkloadOwnerType.RayCluster]: { + 'test-rc-1': 0.01100163333333333, + 'test-rc-2': 0.01300163333333333, + 'test-rc-3': 0.01500163333333333, + }, }, error: undefined, loaded: true, refresh: expect.any(Function), }, - memoryBytesUsedByJobName: { + memoryBytesUsedByWorkloadOwner: { data: { - 'test-job-1': 8237056, - 'test-job-2': 8249344, - 'test-job-3': 9349344, - 'test-job-4': 82493440, - 'test-job-5': 42493440, - 'test-job-6': 8237036, - 'test-job-7': 10337050, + [WorkloadOwnerType.Job]: { + 'test-job-1': 8237056, + 'test-job-2': 8249344, + 'test-job-3': 9349344, + 'test-job-4': 82493440, + }, + [WorkloadOwnerType.RayCluster]: { + 'test-rc-1': 42493440, + 'test-rc-2': 8237036, + 'test-rc-3': 10337050, + }, }, error: undefined, loaded: true, diff --git a/frontend/src/api/prometheus/distributedWorkloads.ts b/frontend/src/api/prometheus/distributedWorkloads.ts index 0b2d44f2f5..06e418cf07 100644 --- a/frontend/src/api/prometheus/distributedWorkloads.ts +++ b/frontend/src/api/prometheus/distributedWorkloads.ts @@ -2,33 +2,46 @@ import * as React from 'react'; import { FetchStateObject, PrometheusQueryResponse } from '~/types'; import { useMakeFetchObject } from '~/utilities/useMakeFetchObject'; import { DEFAULT_VALUE_FETCH_STATE } from '~/utilities/const'; -import { WorkloadKind } from '~/k8sTypes'; -import { getWorkloadOwnerJobName } from '~/concepts/distributedWorkloads/utils'; +import { WorkloadKind, WorkloadOwnerType } from '~/k8sTypes'; +import { getWorkloadOwner } from '~/concepts/distributedWorkloads/utils'; import usePrometheusQuery from './usePrometheusQuery'; +export type WorkloadMetricIndexedByOwner = Record< + WorkloadOwnerType, + { [ownerName: string]: number } +>; + +export const EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER: WorkloadMetricIndexedByOwner = { + [WorkloadOwnerType.RayCluster]: {}, + [WorkloadOwnerType.Job]: {}, +}; + export type WorkloadMetricPromQueryResponse = PrometheusQueryResponse<{ - metric: { workload: string; workload_type: string }; + metric: { owner_kind: WorkloadOwnerType; owner_name: string }; }>; -export const indexNumericValuesByJobName = ( +export const indexWorkloadMetricByOwner = ( promResponse: WorkloadMetricPromQueryResponse | null, -): Record => { +): WorkloadMetricIndexedByOwner => { if (!promResponse) { - return {}; + return EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER; } return promResponse.data.result.reduce((acc, { metric, value }) => { const valueStr = value[1]; if (valueStr && !Number.isNaN(Number(valueStr))) { - acc[metric.workload] = Number(valueStr); + return { + ...acc, + [metric.owner_kind]: { ...acc[metric.owner_kind], [metric.owner_name]: Number(valueStr) }, + }; } return acc; - }, {} as Record); + }, EMPTY_WORKLOAD_METRIC_INDEXED_BY_OWNER); }; -const useWorkloadMetricIndexedByJobName = ( +const useWorkloadMetricIndexedByOwner = ( query?: string, refreshRate = 0, -): FetchStateObject> => { +): FetchStateObject => { const promQueryFetchObj = useMakeFetchObject( usePrometheusQuery('/api/prometheus/query', query, { refreshRate, @@ -37,7 +50,7 @@ const useWorkloadMetricIndexedByJobName = ( return React.useMemo( () => ({ ...promQueryFetchObj, - data: indexNumericValuesByJobName(promQueryFetchObj.data), + data: indexWorkloadMetricByOwner(promQueryFetchObj.data), }), [promQueryFetchObj], ); @@ -100,8 +113,8 @@ export const getTopResourceConsumingWorkloads = ( }; export type DWProjectCurrentMetricsValues = { - cpuCoresUsedByJobName: Record; - memoryBytesUsedByJobName: Record; + cpuCoresUsedByWorkloadOwner: WorkloadMetricIndexedByOwner; + memoryBytesUsedByWorkloadOwner: WorkloadMetricIndexedByOwner; }; export type DWProjectCurrentMetricType = keyof DWProjectCurrentMetricsValues; export type DWProjectCurrentMetrics = FetchStateObject<{ @@ -116,8 +129,8 @@ export type DWProjectCurrentMetrics = FetchStateObject<{ export const DEFAULT_DW_PROJECT_CURRENT_METRICS: DWProjectCurrentMetrics = { ...DEFAULT_VALUE_FETCH_STATE, data: { - cpuCoresUsedByJobName: DEFAULT_VALUE_FETCH_STATE, - memoryBytesUsedByJobName: DEFAULT_VALUE_FETCH_STATE, + cpuCoresUsedByWorkloadOwner: DEFAULT_VALUE_FETCH_STATE, + memoryBytesUsedByWorkloadOwner: DEFAULT_VALUE_FETCH_STATE, }, getWorkloadCurrentUsage: () => ({ cpuCoresUsed: undefined, memoryBytesUsed: undefined }), topWorkloadsByUsage: { @@ -129,8 +142,8 @@ export const DEFAULT_DW_PROJECT_CURRENT_METRICS: DWProjectCurrentMetrics = { const getDWProjectCurrentMetricsQueries = ( namespace: string, ): Record => ({ - cpuCoresUsedByJobName: `namespace=${namespace}&query=sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="", namespace="${namespace}"} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="${namespace}", workload_type="job"}) by (workload, workload_type)`, - memoryBytesUsedByJobName: `namespace=${namespace}&query=sum(container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", cluster="", namespace="${namespace}", container!="", image!=""} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="${namespace}", workload_type="job"}) by (workload, workload_type)`, + cpuCoresUsedByWorkloadOwner: `namespace=${namespace}&query=sum by(owner_name, owner_kind) (kube_pod_owner{owner_kind=~"RayCluster|Job", namespace="${namespace}"} * on (namespace, pod) group_right(owner_name, owner_kind) node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate)`, + memoryBytesUsedByWorkloadOwner: `namespace=${namespace}&query=sum by(owner_name, owner_kind) (kube_pod_owner{owner_kind=~"RayCluster|Job", namespace="${namespace}"} * on (namespace, pod) group_right(owner_name, owner_kind) node_namespace_pod_container:container_memory_working_set_bytes)`, }); export const useDWProjectCurrentMetrics = ( @@ -140,26 +153,30 @@ export const useDWProjectCurrentMetrics = ( ): DWProjectCurrentMetrics => { const queries = getDWProjectCurrentMetricsQueries(namespace); const data: DWProjectCurrentMetrics['data'] = { - cpuCoresUsedByJobName: useWorkloadMetricIndexedByJobName( - queries.cpuCoresUsedByJobName, + cpuCoresUsedByWorkloadOwner: useWorkloadMetricIndexedByOwner( + queries.cpuCoresUsedByWorkloadOwner, refreshRate, ), - memoryBytesUsedByJobName: useWorkloadMetricIndexedByJobName( - queries.memoryBytesUsedByJobName, + memoryBytesUsedByWorkloadOwner: useWorkloadMetricIndexedByOwner( + queries.memoryBytesUsedByWorkloadOwner, refreshRate, ), }; - const cpuCoresUsedByJobNameRefresh = data.cpuCoresUsedByJobName.refresh; - const memoryBytesUsedByJobNameRefresh = data.memoryBytesUsedByJobName.refresh; + const cpuCoresUsedByWorkloadOwnerRefresh = data.cpuCoresUsedByWorkloadOwner.refresh; + const memoryBytesUsedByWorkloadOwnerRefresh = data.memoryBytesUsedByWorkloadOwner.refresh; const getWorkloadCurrentUsage = React.useCallback( (workload: WorkloadKind) => { - const jobName = getWorkloadOwnerJobName(workload); + const owner = getWorkloadOwner(workload); return { - cpuCoresUsed: jobName ? data.cpuCoresUsedByJobName.data?.[jobName] : undefined, - memoryBytesUsed: jobName ? data.memoryBytesUsedByJobName.data?.[jobName] : undefined, + cpuCoresUsed: owner + ? data.cpuCoresUsedByWorkloadOwner.data?.[owner.kind][owner.name] + : undefined, + memoryBytesUsed: owner + ? data.memoryBytesUsedByWorkloadOwner.data?.[owner.kind][owner.name] + : undefined, }; }, - [data.cpuCoresUsedByJobName, data.memoryBytesUsedByJobName], + [data.cpuCoresUsedByWorkloadOwner, data.memoryBytesUsedByWorkloadOwner], ); const topWorkloadsByUsage: TopWorkloadsByUsage = React.useMemo( () => getTopResourceConsumingWorkloads(workloads, getWorkloadCurrentUsage), @@ -168,9 +185,9 @@ export const useDWProjectCurrentMetrics = ( return { data, refresh: React.useCallback(() => { - cpuCoresUsedByJobNameRefresh(); - memoryBytesUsedByJobNameRefresh(); - }, [cpuCoresUsedByJobNameRefresh, memoryBytesUsedByJobNameRefresh]), + cpuCoresUsedByWorkloadOwnerRefresh(); + memoryBytesUsedByWorkloadOwnerRefresh(); + }, [cpuCoresUsedByWorkloadOwnerRefresh, memoryBytesUsedByWorkloadOwnerRefresh]), loaded: Object.values(data).every(({ loaded }) => loaded), error: Object.values(data).find(({ error }) => !!error)?.error, getWorkloadCurrentUsage, diff --git a/frontend/src/concepts/distributedWorkloads/__tests__/utils.spec.ts b/frontend/src/concepts/distributedWorkloads/__tests__/utils.spec.ts index 78f6de1259..0924e63951 100644 --- a/frontend/src/concepts/distributedWorkloads/__tests__/utils.spec.ts +++ b/frontend/src/concepts/distributedWorkloads/__tests__/utils.spec.ts @@ -2,7 +2,7 @@ import { mockClusterQueueK8sResource } from '~/__mocks__/mockClusterQueueK8sReso import { mockLocalQueueK8sResource } from '~/__mocks__/mockLocalQueueK8sResource'; import { mockWorkloadK8sResource } from '~/__mocks__/mockWorkloadK8sResource'; import { - getWorkloadOwnerJobName, + getWorkloadOwner, WorkloadStatusColorAndIcon, WorkloadStatusType, getStatusCounts, @@ -12,7 +12,7 @@ import { getQueueRequestedResources, getTotalSharedQuota, } from '~/concepts/distributedWorkloads/utils'; -import { WorkloadPodSet } from '~/k8sTypes'; +import { WorkloadOwnerType, WorkloadPodSet } from '~/k8sTypes'; import { PodContainer } from '~/types'; describe('getStatusInfo', () => { @@ -96,14 +96,28 @@ describe('getStatusCounts', () => { }); }); -describe('getWorkloadOwnerJobName', () => { - it('returns the name of the job found in ownerReferences of a workload if present', () => { +describe('getWorkloadOwner', () => { + it('returns the name of a job found in ownerReferences of a workload if present', () => { const mockWorkload = mockWorkloadK8sResource({ k8sName: 'test-workload', namespace: 'test-project', - ownerJobName: 'test-job', + ownerKind: WorkloadOwnerType.Job, + ownerName: 'test-job', + }); + expect(getWorkloadOwner(mockWorkload)).toStrictEqual({ kind: 'Job', name: 'test-job' }); + }); + + it('returns the name of a raycluster found in ownerReferences of a workload if present', () => { + const mockWorkload = mockWorkloadK8sResource({ + k8sName: 'test-workload', + namespace: 'test-project', + ownerKind: WorkloadOwnerType.RayCluster, + ownerName: 'test-raycluster', + }); + expect(getWorkloadOwner(mockWorkload)).toStrictEqual({ + kind: 'RayCluster', + name: 'test-raycluster', }); - expect(getWorkloadOwnerJobName(mockWorkload)).toBe('test-job'); }); it('returns undefined if there is no job in ownerReferences', () => { @@ -111,7 +125,7 @@ describe('getWorkloadOwnerJobName', () => { k8sName: 'test-workload', namespace: 'test-project', }); - expect(getWorkloadOwnerJobName(mockWorkload)).toBe(undefined); + expect(getWorkloadOwner(mockWorkload)).toBe(undefined); }); }); diff --git a/frontend/src/concepts/distributedWorkloads/utils.tsx b/frontend/src/concepts/distributedWorkloads/utils.tsx index 32b1397e21..d4541a1a57 100644 --- a/frontend/src/concepts/distributedWorkloads/utils.tsx +++ b/frontend/src/concepts/distributedWorkloads/utils.tsx @@ -19,7 +19,13 @@ import { chart_color_green_300 as chartColorGreen, chart_color_red_100 as chartColorRed, } from '@patternfly/react-tokens'; -import { ClusterQueueKind, LocalQueueKind, WorkloadCondition, WorkloadKind } from '~/k8sTypes'; +import { + ClusterQueueKind, + LocalQueueKind, + WorkloadCondition, + WorkloadKind, + WorkloadOwnerType, +} from '~/k8sTypes'; import { ContainerResourceAttributes } from '~/types'; import { CPU_UNITS, @@ -149,8 +155,21 @@ export const getStatusCounts = (workloads: WorkloadKind[]): WorkloadStatusCounts return statusCounts; }; -export const getWorkloadOwnerJobName = (workload: WorkloadKind): string | undefined => - workload.metadata?.ownerReferences?.find((ref) => ref.kind === 'Job')?.name; +export const isKnownWorkloadOwnerType = (s: string): s is WorkloadOwnerType => + (Object.values(WorkloadOwnerType) as string[]).includes(s); + +export const getWorkloadOwner = ( + workload: WorkloadKind, +): { kind: WorkloadOwnerType; name: string } | undefined => { + const owner = workload.metadata?.ownerReferences?.find((ref) => + isKnownWorkloadOwnerType(ref.kind), + ); + if (!owner || !isKnownWorkloadOwnerType(owner.kind)) { + return undefined; + } + const { kind, name } = owner; + return { kind, name }; +}; export type WorkloadRequestedResources = { cpuCoresRequested: number; diff --git a/frontend/src/k8sTypes.ts b/frontend/src/k8sTypes.ts index a3a4ff9c27..1955029eed 100644 --- a/frontend/src/k8sTypes.ts +++ b/frontend/src/k8sTypes.ts @@ -919,6 +919,11 @@ export type WorkloadPodSet = { }; }; +export enum WorkloadOwnerType { + RayCluster = 'RayCluster', + Job = 'Job', +} + // https://kueue.sigs.k8s.io/docs/reference/kueue.v1beta1/#kueue-x-k8s-io-v1beta1-Workload export type WorkloadKind = K8sResourceCommon & { apiVersion: 'kueue.x-k8s.io/v1beta1';