From d939670cbf1b168499ec689ff470d94135fea1cf Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Tue, 23 Sep 2025 17:12:08 +0100 Subject: [PATCH 1/5] feat(operator): Added LokistackPendingComponents alert --- operator/docs/lokistack/sop.md | 45 +++++++++++++++++++ .../internal/alerts/prometheus-alerts.yaml | 11 +++++ .../internal/alerts/testdata/test.yaml | 16 +++++++ 3 files changed, 72 insertions(+) diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md index 51048360f3374..ab12235f783c1 100644 --- a/operator/docs/lokistack/sop.md +++ b/operator/docs/lokistack/sop.md @@ -365,3 +365,48 @@ The schema configuration does not contain the most recent schema version and nee ### Steps - Add a new object storage schema V13 with a future EffectiveDate + +## Lokistack Components Not Ready + +### Impact + +One or more LokiStack components are not ready, which can disrupt ingestion or querying and lead to degraded service. + +### Summary + +The LokiStack reports that some components have not reached the `Ready` state. This might be related to Kubernetes resources (Pods/Deployments), configuration, or external dependencies. + +### Severity + +`Critical` + +### Access Required + +- Console access to the cluster +- Edit or view access in the namespace where the LokiStack is deployed: + - OpenShift + - `openshift-logging` (LokiStack) + +### Steps + +- Inspect the LokiStack conditions and events + - Describe the LokiStack resource and review status conditions: + - `kubectl -n describe lokistack ` + - Check for conditions that would lead to some pods not being in the `Ready` state +- Check operator and reconciliation status + - Ensure the Loki Operator is running and not reporting errors: + - `kubectl -n logs deploy/loki-operator` + - Look for reconcile errors related to missing permissions, invalid fields, or failed rollouts. 
+- Verify component Pods and Deployments + - Ensure all core components are running and Ready in the LokiStack namespace: + - `distributor`, `ingester`, `querier`, `query-frontend`, `index-gateway`, `compactor`, `gateway` + - Check Pod readiness and recent restarts: + - `kubectl -n get pods` + - `kubectl -n describe pod ` +- Examine Kubernetes events for failures + - `kubectl -n get events --sort-by=.lastTimestamp` + - Common causes: image pull backoffs, failed mounts, readiness probe failures, or insufficient resources +- Validate configuration and referenced resources + - Confirm referenced `Secrets`, `ConfigMaps`, exist and have correct keys +- Look into the Pod logs of the component that is still not `Ready`: + - `kubectl -n logs ` \ No newline at end of file diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index abd62b9198e65..2273305ab849f 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -209,3 +209,14 @@ groups: for: 1m labels: severity: warning + - alert: LokistackComponentsNotReady + annotations: + description: |- + The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" has components that are not ready. + summary: "One or more LokiStack components are not ready." 
+ runbook_url: "[[ .RunbookURL ]]#Lokistack-Pending-Components" + expr: | + sum (lokistack_status_condition{reason="ReadyComponents", status="false"}) by (stack_name, stack_namespace, reason, status) == 1 + for: 10m + labels: + severity: critical diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index 8d89813933af4..dcd2f7f9523eb 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -66,6 +66,9 @@ tests: - series: 'loki_discarded_samples_total{namespace="my-ns", tenant="application", reason="line_too_long"}' values: '0x5 0+120x25 3000' + - series: 'lokistack_status_condition{stack_name="mystack", stack_namespace="my-ns", reason="ReadyComponents", status="false"}' + values: '0+0x15 1+0x10' + alert_rule_test: - eval_time: 16m alertname: LokiRequestErrors @@ -194,3 +197,16 @@ tests: Samples are discarded because of "line_too_long" at a rate of 2 samples per second. summary: Loki is discarding samples during ingestion because they fail validation. runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + - eval_time: 26m + alertname: LokistackComponentsNotReady + exp_alerts: + - exp_labels: + stack_namespace: my-ns + stack_name: mystack + reason: ReadyComponents + status: "false" + severity: critical + exp_annotations: + description: 'The LokiStack "mystack" in namespace "my-ns" has components that are not ready.' + summary: "One or more LokiStack components are not ready." 
+ runbook_url: "[[ .RunbookURL ]]#Lokistack-Pending-Components" From f2133fc573f76e5578f59c39c30629a4f6e2040f Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Tue, 30 Sep 2025 11:03:53 +0100 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Robert Jacob Signed-off-by: Joao Marcal --- operator/docs/lokistack/sop.md | 6 +++--- .../manifests/internal/alerts/prometheus-alerts.yaml | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md index ab12235f783c1..fd1507f48c43a 100644 --- a/operator/docs/lokistack/sop.md +++ b/operator/docs/lokistack/sop.md @@ -395,7 +395,7 @@ The LokiStack reports that some components have not reached the `Ready` state. T - Check for conditions that would lead to some pods not being in the `Ready` state - Check operator and reconciliation status - Ensure the Loki Operator is running and not reporting errors: - - `kubectl -n logs deploy/loki-operator` + - `kubectl -n logs deploy/loki-operator-controller-manager` - Look for reconcile errors related to missing permissions, invalid fields, or failed rollouts. - Verify component Pods and Deployments - Ensure all core components are running and Ready in the LokiStack namespace: @@ -407,6 +407,6 @@ The LokiStack reports that some components have not reached the `Ready` state. 
T - `kubectl -n get events --sort-by=.lastTimestamp` - Common causes: image pull backoffs, failed mounts, readiness probe failures, or insufficient resources - Validate configuration and referenced resources - - Confirm referenced `Secrets`, `ConfigMaps`, exist and have correct keys + - Confirm referenced `Secrets` and `ConfigMaps` exist and have correct keys - Look into the Pod logs of the component that is still not `Ready`: - - `kubectl -n logs ` \ No newline at end of file + - `kubectl -n logs ` \ No newline at end of file diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index 2273305ab849f..95164975e23ac 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -216,7 +216,10 @@ groups: summary: "One or more LokiStack components are not ready." runbook_url: "[[ .RunbookURL ]]#Lokistack-Pending-Components" expr: | - sum (lokistack_status_condition{reason="ReadyComponents", status="false"}) by (stack_name, stack_namespace, reason, status) == 1 + sum ( + lokistack_status_condition{reason="ReadyComponents", status="false"} + ) by (stack_name, stack_namespace) + > 0 for: 10m labels: severity: critical From b1251873f20359aa6179477f6903ad1dbacd065c Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Tue, 30 Sep 2025 11:15:58 +0100 Subject: [PATCH 3/5] fix test --- operator/internal/manifests/internal/alerts/testdata/test.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index dcd2f7f9523eb..975ee2f880d94 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -203,8 +203,6 @@ tests: - exp_labels: stack_namespace: my-ns stack_name: mystack - reason: 
ReadyComponents - status: "false" severity: critical exp_annotations: description: 'The LokiStack "mystack" in namespace "my-ns" has components that are not ready.' From 4bf64d793190b70434802b44b06537b02d2033bf Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Tue, 30 Sep 2025 15:58:54 +0100 Subject: [PATCH 4/5] renamed alert and downgraded to warning --- operator/docs/lokistack/sop.md | 4 ++-- .../manifests/internal/alerts/prometheus-alerts.yaml | 8 ++++---- .../manifests/internal/alerts/testdata/test.yaml | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md index fd1507f48c43a..25472103e029f 100644 --- a/operator/docs/lokistack/sop.md +++ b/operator/docs/lokistack/sop.md @@ -366,7 +366,7 @@ The schema configuration does not contain the most recent schema version and nee - Add a new object storage schema V13 with a future EffectiveDate -## Lokistack Components Not Ready +## Lokistack Components Not Ready Warning ### Impact @@ -378,7 +378,7 @@ The LokiStack reports that some components have not reached the `Ready` state. T ### Severity -`Critical` +`Warning` ### Access Required diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index 95164975e23ac..c3a90af066eb7 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -209,17 +209,17 @@ groups: for: 1m labels: severity: warning - - alert: LokistackComponentsNotReady + - alert: LokistackComponentsNotReadyWarning annotations: description: |- The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" has components that are not ready. summary: "One or more LokiStack components are not ready." 
- runbook_url: "[[ .RunbookURL ]]#Lokistack-Pending-Components" + runbook_url: "[[ .RunbookURL ]]#Lokistack-Components-Not-Ready-Warning" expr: | sum ( lokistack_status_condition{reason="ReadyComponents", status="false"} ) by (stack_name, stack_namespace) > 0 - for: 10m + for: 15m labels: - severity: critical + severity: warning diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index 975ee2f880d94..2a1ce7741bee6 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -67,7 +67,7 @@ tests: values: '0x5 0+120x25 3000' - series: 'lokistack_status_condition{stack_name="mystack", stack_namespace="my-ns", reason="ReadyComponents", status="false"}' - values: '0+0x15 1+0x10' + values: '1+0x25' alert_rule_test: - eval_time: 16m @@ -197,14 +197,14 @@ tests: Samples are discarded because of "line_too_long" at a rate of 2 samples per second. summary: Loki is discarding samples during ingestion because they fail validation. runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" - - eval_time: 26m - alertname: LokistackComponentsNotReady + - eval_time: 16m + alertname: LokistackComponentsNotReadyWarning exp_alerts: - exp_labels: stack_namespace: my-ns stack_name: mystack - severity: critical + severity: warning exp_annotations: description: 'The LokiStack "mystack" in namespace "my-ns" has components that are not ready.' summary: "One or more LokiStack components are not ready." 
- runbook_url: "[[ .RunbookURL ]]#Lokistack-Pending-Components" + runbook_url: "[[ .RunbookURL ]]#Lokistack-Components-Not-Ready-Warning" From 5c02bb1791dd170c827a13d11b6a70d1b6573086 Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Wed, 1 Oct 2025 14:09:00 +0100 Subject: [PATCH 5/5] moved to namespace label --- .../manifests/internal/alerts/prometheus-alerts.yaml | 9 ++++++--- .../manifests/internal/alerts/testdata/test.yaml | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index c3a90af066eb7..860ca000e4210 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -212,13 +212,16 @@ groups: - alert: LokistackComponentsNotReadyWarning annotations: description: |- - The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" has components that are not ready. + The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.namespace }}" has components that are not ready. summary: "One or more LokiStack components are not ready." 
runbook_url: "[[ .RunbookURL ]]#Lokistack-Components-Not-Ready-Warning" expr: | sum ( - lokistack_status_condition{reason="ReadyComponents", status="false"} - ) by (stack_name, stack_namespace) + label_replace( + lokistack_status_condition{reason="ReadyComponents", status="false"}, + "namespace", "$1", "stack_namespace", "(.+)" + ) + ) by (stack_name, namespace) > 0 for: 15m labels: diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index 2a1ce7741bee6..8984c90be44b9 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -201,7 +201,7 @@ tests: alertname: LokistackComponentsNotReadyWarning exp_alerts: - exp_labels: - stack_namespace: my-ns + namespace: my-ns stack_name: mystack severity: warning exp_annotations: