Skip to content

Commit d21d00e

Browse files
committed
Add prometheus Helm chart
This is currently a separate chart because of the order in which the manifests are applied. If we change that order, it could be folded into the main Helm chart. Signed-off-by: Todd Short <[email protected]> Assisted-by: Gemini (research) Assisted-by: Claude Code (analysis)
1 parent 714f3b7 commit d21d00e

15 files changed

+352
-8
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ test-experimental-e2e: run-internal image-registry prometheus experimental-e2e e
294294
# Deploy the Prometheus operator plus the OLMv1 Prometheus Helm chart via
# hack/test/install-prometheus.sh. The script invokes both kustomize and helm,
# so both tool binaries are listed as prerequisites to make sure they exist
# before the script runs.
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
295295
prometheus: PROMETHEUS_VERSION := v0.83.0
296296
prometheus: $(KUSTOMIZE) $(HELM) #EXHELP Deploy Prometheus into specified namespace
297-
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION)
297+
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION)
298298

299299
.PHONY: test-extension-developer-e2e
300300
test-extension-developer-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)

hack/test/install-prometheus.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
#!/bin/bash
22

3+
source ".bingo/variables.env"
4+
35
set -euo pipefail
46

57
help="install-prometheus.sh is used to set up prometheus monitoring for e2e testing.
68
Usage:
7-
install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [KUSTOMIZE] [GIT_VERSION]
9+
install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION]
810
"
911

10-
if [[ "$#" -ne 4 ]]; then
12+
if [[ "$#" -ne 3 ]]; then
1113
echo "Illegal number of arguments passed"
1214
echo "${help}"
1315
exit 1
1416
fi
1517

1618
PROMETHEUS_NAMESPACE="$1"
1719
PROMETHEUS_VERSION="$2"
18-
KUSTOMIZE="$3"
19-
GIT_VERSION="$4"
20+
GIT_VERSION="$3"
2021

2122
TMPDIR="$(mktemp -d)"
2223
trap 'echo "Cleaning up $TMPDIR"; rm -rf "$TMPDIR"' EXIT
@@ -26,16 +27,16 @@ curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operat
2627
curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/${PROMETHEUS_VERSION}/bundle.yaml" > "${TMPDIR}/bundle.yaml"
2728

2829
echo "Patching namespace to ${PROMETHEUS_NAMESPACE}..."
29-
(cd "$TMPDIR" && $KUSTOMIZE edit set namespace "$PROMETHEUS_NAMESPACE")
30+
(cd "$TMPDIR" && ${KUSTOMIZE} edit set namespace "$PROMETHEUS_NAMESPACE")
3031

3132
echo "Applying Prometheus base..."
3233
kubectl apply -k "$TMPDIR" --server-side
3334

3435
echo "Waiting for Prometheus Operator pod to become ready..."
3536
kubectl wait --for=condition=Ready pod -n "$PROMETHEUS_NAMESPACE" -l app.kubernetes.io/name=prometheus-operator
3637

37-
echo "Applying overlay config..."
38-
$KUSTOMIZE build config/overlays/prometheus | sed "s/cert-git-version/cert-${VERSION}/g" | kubectl apply -f -
38+
# Render the chart and stamp the cert name with the version passed as the
# third script argument (GIT_VERSION). Using the parsed argument rather than an
# ambient VERSION variable keeps the script working under `set -u` even when
# the caller has not exported VERSION.
echo "Applying prometheus Helm chart..."
39+
${HELM} template prometheus helm/prometheus | sed "s/cert-git-version/cert-${GIT_VERSION}/g" | kubectl apply -f -
3940

4041
echo "Waiting for metrics scraper to become ready..."
4142
kubectl wait --for=create pods -n "$PROMETHEUS_NAMESPACE" prometheus-prometheus-0 --timeout=60s

helm/prometheus/Chart.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v2
2+
name: prometheus
3+
description: A Helm chart of Prometheus resources for OLMv1
4+
5+
# A chart can be either an 'application' or a 'library' chart.
6+
#
7+
# Application charts are a collection of templates that can be packaged into versioned archives
8+
# to be deployed.
9+
#
10+
# Library charts provide useful utilities or functions for the chart developer. They're included as
11+
# a dependency of application charts to inject those utilities and functions into the rendering
12+
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
13+
type: application
14+
15+
# This is the chart version. This version number should be incremented each time you make changes
16+
# to the chart and its templates, including the app version.
17+
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18+
version: 0.1.0
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
---
2+
# Read-only cluster-wide permissions for the `prometheus` ClusterRole:
#   - get/list/watch on core nodes, nodes/metrics, services, endpoints and pods
#   - get on core configmaps
#   - get/list/watch on discovery.k8s.io endpointslices
#   - get/list/watch on networking.k8s.io ingresses
#   - get on the /metrics non-resource URL
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRole
4+
metadata:
5+
name: prometheus
6+
rules:
7+
- apiGroups:
8+
- ""
9+
resources:
10+
- nodes
11+
- nodes/metrics
12+
- services
13+
- endpoints
14+
- pods
15+
verbs:
16+
- get
17+
- list
18+
- watch
19+
- apiGroups:
20+
- ""
21+
resources:
22+
- configmaps
23+
verbs:
24+
- get
25+
- apiGroups:
26+
- discovery.k8s.io
27+
resources:
28+
- endpointslices
29+
verbs:
30+
- get
31+
- list
32+
- watch
33+
- apiGroups:
34+
- networking.k8s.io
35+
resources:
36+
- ingresses
37+
verbs:
38+
- get
39+
- list
40+
- watch
41+
- nonResourceURLs:
42+
- /metrics
43+
verbs:
44+
- get
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
# Binds the `prometheus` ClusterRole to the `prometheus` ServiceAccount that
# lives in the namespace named by .Values.namespaces.olmv1.name.
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRoleBinding
4+
metadata:
5+
name: prometheus
6+
roleRef:
7+
apiGroup: rbac.authorization.k8s.io
8+
kind: ClusterRole
9+
name: prometheus
10+
subjects:
11+
- kind: ServiceAccount
12+
name: prometheus
13+
namespace: {{ .Values.namespaces.olmv1.name }}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
---
2+
# Network policy for pods labelled app.kubernetes.io/name=prometheus.
# Both policy types are declared and each carries a single empty rule, which
# allows all ingress and all egress traffic for the selected pods.
apiVersion: networking.k8s.io/v1
3+
kind: NetworkPolicy
4+
metadata:
5+
name: prometheus
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
egress:
9+
- {}
10+
ingress:
11+
- {}
12+
podSelector:
13+
matchLabels:
14+
app.kubernetes.io/name: prometheus
15+
policyTypes:
16+
- Egress
17+
- Ingress
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
---
2+
# Prometheus custom resource (monitoring.coreos.com/v1) named `prometheus`.
# It runs as the `prometheus` ServiceAccount, as non-root UID 65534 with the
# RuntimeDefault seccomp profile, scrapes every 1m with a 30s timeout, logs at
# debug level and discovers targets through EndpointSlices.
# The empty ruleSelector and serviceMonitorSelector place no label restriction
# on which PrometheusRule / ServiceMonitor objects are picked up; namespace
# scope is controlled by the separate namespace selectors, which are not set
# here.
apiVersion: monitoring.coreos.com/v1
3+
kind: Prometheus
4+
metadata:
5+
name: prometheus
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
logLevel: debug
9+
ruleSelector: {}
10+
scrapeInterval: 1m
11+
scrapeTimeout: 30s
12+
securityContext:
13+
runAsNonRoot: true
14+
runAsUser: 65534
15+
seccompProfile:
16+
type: RuntimeDefault
17+
serviceAccountName: prometheus
18+
serviceDiscoveryRole: EndpointSlice
19+
serviceMonitorSelector: {}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
# Alerting rules for the operator-controller and catalogd pods.
#   controller-panic: fires when the controller-runtime reconcile or webhook
#     panic counters are above zero.
#   resource-usage: fires on container OOM events, on working-set memory growth
#     or absolute usage, on CPU usage, and on client API request rate, each
#     with its own threshold for operator-controller and catalogd.
# The alert descriptions contain Prometheus template expressions. Each one is
# wrapped in a Helm raw-string action (a backtick-quoted string) so that Helm
# emits the expression literally instead of trying to evaluate it.
apiVersion: monitoring.coreos.com/v1
3+
kind: PrometheusRule
4+
metadata:
5+
name: controller-alerts
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
groups:
9+
- name: controller-panic
10+
rules:
11+
- alert: reconciler-panic
12+
annotations:
13+
description: controller of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}
14+
expr: controller_runtime_reconcile_panics_total{} > 0
15+
- alert: webhook-panic
16+
annotations:
17+
description: controller webhook of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}
18+
expr: controller_runtime_webhook_panics_total{} > 0
19+
- name: resource-usage
20+
rules:
21+
- alert: oom-events
22+
annotations:
23+
description: container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.pod }}`}} experienced OOM event(s); count={{`{{ $value }}`}}
24+
expr: container_oom_events_total > 0
25+
- alert: operator-controller-memory-growth
26+
annotations:
27+
description: 'operator-controller pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
28+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
29+
for: 5m
30+
keep_firing_for: 1d
31+
- alert: catalogd-memory-growth
32+
annotations:
33+
description: 'catalogd pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
34+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
35+
for: 5m
36+
keep_firing_for: 1d
37+
- alert: operator-controller-memory-usage
38+
annotations:
39+
description: 'operator-controller pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
40+
expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
41+
for: 5m
42+
keep_firing_for: 1d
43+
- alert: catalogd-memory-usage
44+
annotations:
45+
description: 'catalogd pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
46+
expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
47+
for: 5m
48+
keep_firing_for: 1d
49+
- alert: operator-controller-cpu-usage
50+
annotations:
51+
description: 'operator-controller using high cpu resource for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
52+
expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
53+
for: 5m
54+
keep_firing_for: 1d
55+
- alert: catalogd-cpu-usage
56+
annotations:
57+
description: 'catalogd using high cpu resources for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
58+
expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
59+
for: 5m
60+
keep_firing_for: 1d
61+
- alert: operator-controller-api-call-rate
62+
annotations:
63+
description: 'operator-controller making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
64+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
65+
for: 5m
66+
keep_firing_for: 1d
67+
- alert: catalogd-api-call-rate
68+
annotations:
69+
description: 'catalogd making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
70+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
71+
for: 5m
72+
keep_firing_for: 1d
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
# Secret of type kubernetes.io/service-account-token, tied to the `prometheus`
# ServiceAccount through the kubernetes.io/service-account.name annotation.
# Kubernetes fills a Secret of this type with a token for the annotated
# ServiceAccount.
apiVersion: v1
3+
kind: Secret
4+
metadata:
5+
annotations:
6+
kubernetes.io/service-account.name: prometheus
7+
name: prometheus-metrics-token
8+
namespace: {{ .Values.namespaces.olmv1.name }}
9+
type: kubernetes.io/service-account-token
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
---
2+
# NodePort Service exposing the Prometheus web port: node port 30900 maps to
# service port 9090, which targets the container port named `web`.
# A Service only selects pods in its own namespace, so it is rendered into the
# same namespace value as the Prometheus custom resource
# (.Values.namespaces.olmv1.name), keeping it consistent with every other
# namespaced resource in this chart.
# NOTE(review): the `prometheus: prometheus` selector is presumably the label
# the Prometheus operator applies to the pods it creates; verify.
apiVersion: v1
3+
kind: Service
4+
metadata:
5+
name: prometheus-service
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
ports:
9+
- name: web
10+
nodePort: 30900
11+
port: 9090
12+
protocol: TCP
13+
targetPort: web
14+
selector:
15+
prometheus: prometheus
16+
type: NodePort

0 commit comments

Comments
 (0)