88import type { IScopedClusterClient } from '@kbn/core/server' ;
99import { ALL_VALUE } from '@kbn/slo-schema' ;
1010import type { TransformGetTransformStatsTransformStats } from 'elasticsearch-8.x/lib/api/types' ;
11- import { groupBy , keyBy , type Dictionary } from 'lodash' ;
12- import moment from 'moment' ;
11+ import { keyBy , type Dictionary } from 'lodash' ;
1312import {
14- SUMMARY_DESTINATION_INDEX_PATTERN ,
1513 getSLOSummaryTransformId ,
1614 getSLOTransformId ,
1715 getWildcardTransformId ,
1816} from '../../../common/constants' ;
19- import type { EsSummaryDocument } from '../../services/summary_transform_generator/helpers/create_temp_summary' ;
20- import type { HealthStatus , State , TransformHealth } from '../models/health' ;
17+ import type { TransformHealth } from '../models/health' ;
2118
2219interface Item {
2320 id : string ;
@@ -36,63 +33,28 @@ export interface SLOHealth {
3633 sloInstanceId : string ;
3734 sloRevision : number ;
3835 sloName : string ;
39- state : State ;
4036 health : {
41- overall : HealthStatus ;
37+ isProblematic : boolean ;
4238 rollup : TransformHealth ;
4339 summary : TransformHealth ;
4440 } ;
4541}
4642
47- const LAG_THRESHOLD_MINUTES = 10 ;
48- const STALE_THRESHOLD_MINUTES = 2 * 24 * 60 ;
49-
/**
 * Computes the health of every SLO item in `list`.
 *
 * Transform stats are fetched once for the whole list and then looked up per
 * item, so the number of requests does not grow with the list size.
 *
 * @param list the SLO items to evaluate
 * @param deps dependencies forwarded to the transform stats lookup
 * @returns one health entry per input item, in the same order as `list`
 */
export async function computeHealth(list: Item[], deps: Dependencies): Promise<SLOHealth[]> {
  const transformStatsById = await getTransformStatsById(list, deps);

  return list.map((item) => {
    const health = computeItemHealth(transformStatsById, item);

    return {
      sloId: item.id,
      sloName: item.name,
      // Items without an instance id are reported under ALL_VALUE.
      sloInstanceId: item.instanceId ?? ALL_VALUE,
      sloRevision: item.revision,
      health,
    };
  });
}
7057
71- async function getSummaryDocsById ( list : Item [ ] , deps : Dependencies ) {
72- const summaryDocs = await deps . scopedClusterClient . asCurrentUser . search < EsSummaryDocument > ( {
73- index : SUMMARY_DESTINATION_INDEX_PATTERN ,
74- query : {
75- bool : {
76- should : list . map ( ( item ) => ( {
77- bool : {
78- must : [
79- { term : { 'slo.id' : item . id } } ,
80- { term : { 'slo.instanceId' : item . instanceId ?? ALL_VALUE } } ,
81- { term : { 'slo.revision' : item . revision } } ,
82- ] ,
83- } ,
84- } ) ) ,
85- } ,
86- } ,
87- } ) ;
88-
89- const summaryDocsById = groupBy (
90- summaryDocs . hits . hits . map ( ( hit ) => hit . _source ! ) ,
91- ( doc : EsSummaryDocument ) => buildSummaryKey ( doc . slo . id , doc . slo . instanceId )
92- ) ;
93- return summaryDocsById ;
94- }
95-
9658async function getTransformStatsById (
9759 list : Item [ ] ,
9860 deps : Dependencies
@@ -109,89 +71,84 @@ async function getTransformStatsById(
10971 return keyBy ( stats . transforms , ( transform ) => transform . id ) ;
11072}
11173
/**
 * Derives the health of a single SLO item from the stats of its two transforms.
 *
 * @param transformStatsById transform stats keyed by transform id
 * @param item the SLO item whose transforms are inspected
 * @returns the health of the rollup and summary transforms, plus
 * `isProblematic`, which is true as soon as either transform is problematic
 */
function computeItemHealth(
  transformStatsById: Dictionary<TransformGetTransformStatsTransformStats>,
  item: Item
): { rollup: TransformHealth; summary: TransformHealth; isProblematic: boolean } {
  // A missing dictionary entry is passed through as `undefined`; getTransformHealth
  // reports that case as a missing transform.
  const rollup = getTransformHealth(
    item,
    transformStatsById[getSLOTransformId(item.id, item.revision)]
  );
  const summary = getTransformHealth(
    item,
    transformStatsById[getSLOSummaryTransformId(item.id, item.revision)]
  );

  const isProblematic = rollup.isProblematic || summary.isProblematic;

  return { isProblematic, rollup, summary };
}
14591
/**
 * Translates the raw stats of one transform into a health summary.
 *
 * @param item the SLO item owning the transform; only `enabled` is read here
 * @param transformStat the transform's stats, or `undefined` when no stats
 * were found for it
 * @returns the normalized state and status, whether the state is the one
 * expected for the item's `enabled` flag, and the resulting `isProblematic` flag
 */
function getTransformHealth(
  item: Item,
  transformStat?: TransformGetTransformStatsTransformStats
): TransformHealth {
  if (!transformStat) {
    // No stats at all: the transform is reported as missing and problematic.
    return {
      isProblematic: true,
      missing: true,
      status: 'unavailable',
      state: 'unavailable',
    };
  }

  const state = toTransformState(transformStat.state?.toLowerCase());
  const status = toTransformStatus(transformStat.health?.status?.toLowerCase());

  // An enabled item expects a running transform, a disabled item a stopped one.
  // Selecting the expected list first always yields a real boolean, even when
  // `item.enabled` is not strictly true/false.
  const expectedStates = item.enabled
    ? ['started', 'indexing']
    : ['stopped', 'stopping', 'aborting'];
  const stateMatches = expectedStates.includes(state);

  const isProblematic = status === 'unhealthy' || state === 'failed' || !stateMatches;

  return {
    isProblematic,
    missing: false,
    status,
    state,
    stateMatches,
  };
}
169122
/**
 * Normalizes a raw transform state string into one of the known states.
 *
 * The comparison is exact, so callers are expected to lowercase the value
 * first. Anything unrecognized, including `undefined`, maps to `'unavailable'`.
 *
 * @param state the raw transform state, if any
 * @returns the matching known state, or `'unavailable'`
 */
function toTransformState(
  state?: string
): 'started' | 'indexing' | 'stopped' | 'stopping' | 'failed' | 'aborting' | 'unavailable' {
  switch (state) {
    case 'started':
      return 'started';
    case 'indexing':
      return 'indexing';
    case 'stopped':
      return 'stopped';
    case 'stopping':
      return 'stopping';
    case 'failed':
      return 'failed';
    case 'aborting':
      return 'aborting';
    default:
      return 'unavailable';
  }
}
194143
/**
 * Normalizes a raw transform health status into a health label.
 *
 * The comparison is exact, so callers are expected to lowercase the value
 * first. `'green'` is healthy, `'red'` and `'yellow'` are unhealthy, and
 * anything else, including `undefined`, maps to `'unavailable'`.
 *
 * @param status the raw health status, if any
 * @returns the matching health label
 */
function toTransformStatus(status?: string): 'healthy' | 'unhealthy' | 'unavailable' {
  switch (status) {
    case 'green':
      return 'healthy';
    case 'red':
    case 'yellow':
      return 'unhealthy';
    default:
      return 'unavailable';
  }
}
0 commit comments