diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md index ca96fcc7d0a6e..b032c7c3e0687 100644 --- a/operator/docs/lokistack/sop.md +++ b/operator/docs/lokistack/sop.md @@ -411,3 +411,48 @@ The schema configuration does not contain the most recent schema version and nee ### Steps - Add a new object storage schema V13 with a future EffectiveDate + +## Lokistack Components Not Ready Warning + +### Impact + +One or more LokiStack components are not ready, which can disrupt ingestion or querying and lead to degraded service. + +### Summary + +The LokiStack reports that some components have not reached the `Ready` state. This might be related to Kubernetes resources (Pods/Deployments), configuration, or external dependencies. + +### Severity + +`Warning` + +### Access Required + +- Console access to the cluster +- Edit or view access in the namespace where the LokiStack is deployed: + - OpenShift + - `openshift-logging` (LokiStack) + +### Steps + +- Inspect the LokiStack conditions and events + - Describe the LokiStack resource and review status conditions: + - `kubectl -n <namespace> describe lokistack <name>` + - Check for conditions that would lead to some pods not being in the `Ready` state +- Check operator and reconciliation status + - Ensure the Loki Operator is running and not reporting errors: + - `kubectl -n <operator-namespace> logs deploy/loki-operator-controller-manager` + - Look for reconcile errors related to missing permissions, invalid fields, or failed rollouts. 
+- Verify component Pods and Deployments + - Ensure all core components are running and Ready in the LokiStack namespace: + - `distributor`, `ingester`, `querier`, `query-frontend`, `index-gateway`, `compactor`, `gateway` + - Check Pod readiness and recent restarts: + - `kubectl -n <namespace> get pods` + - `kubectl -n <namespace> describe pod <pod-name>` +- Examine Kubernetes events for failures + - `kubectl -n <namespace> get events --sort-by=.lastTimestamp` + - Common causes: image pull backoffs, failed mounts, readiness probe failures, or insufficient resources +- Validate configuration and referenced resources + - Confirm referenced `Secrets` and `ConfigMaps` exist and have correct keys +- Look into the Pod logs of the component that is still not `Ready`: + - `kubectl -n <namespace> logs <pod-name>` \ No newline at end of file diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index c9b78fb83b111..a4f5298316815 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -227,3 +227,20 @@ groups: for: 1m labels: severity: warning + - alert: LokistackComponentsNotReadyWarning + annotations: + description: |- + The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.namespace }}" has components that are not ready. + summary: "One or more LokiStack components are not ready." 
+ runbook_url: "[[ .RunbookURL ]]#Lokistack-Components-Not-Ready-Warning" + expr: | + sum ( + label_replace( + lokistack_status_condition{reason="ReadyComponents", status="false"}, + "namespace", "$1", "stack_namespace", "(.+)" + ) + ) by (stack_name, namespace) + > 0 + for: 15m + labels: + severity: warning diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index 16d174bc73cfe..0af360273fb8c 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -66,6 +66,9 @@ tests: - series: 'loki_discarded_samples_total{namespace="my-ns", tenant="application", reason="line_too_long"}' values: '0x5 0+120x25 3000' + - series: 'lokistack_status_condition{stack_name="mystack", stack_namespace="my-ns", reason="ReadyComponents", status="false"}' + values: '1+0x25' + - series: 'loki_ingester_chunks_flush_failures_total{namespace="my-ns", pod="ingester-0"}' values: '0+25x20' - series: 'loki_ingester_chunks_flush_requests_total{namespace="my-ns", pod="ingester-0"}' @@ -200,6 +203,17 @@ tests: summary: Loki is discarding samples during ingestion because they fail validation. runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" - eval_time: 16m + alertname: LokistackComponentsNotReadyWarning + exp_alerts: + - exp_labels: + namespace: my-ns + stack_name: mystack + severity: warning + exp_annotations: + description: 'The LokiStack "mystack" in namespace "my-ns" has components that are not ready.' + summary: "One or more LokiStack components are not ready." + runbook_url: "[[ .RunbookURL ]]#Lokistack-Components-Not-Ready-Warning" + - eval_time: 16m alertname: LokiIngesterFlushFailureRateCritical exp_alerts: - exp_labels: