Skip to content

Commit a292afa

Browse files
Merge pull request #33 from mia-platform/feat/update-prometheus-rules
Update prometheus rules
2 parents 5ce1ce9 + 0c7971f commit a292afa

34 files changed

+873
-650
lines changed

charts/monitoring/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
apiVersion: v2
22
name: monitoring
3-
version: 2.2.1
3+
version: 2.2.2
44
kubeVersion: ">= 1.20.0-0"
5-
appVersion: v0.73.2
5+
appVersion: v0.74.0
66
description: "A Kubernetes monitoring stack based on Prometheus Operator"
77
type: application
88
keywords:

charts/monitoring/templates/default-monitors/alertmanager.yaml

+14-4
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,17 @@ spec:
1313
- {{ .Release.Namespace | quote }}
1414
jobLabel: "app.kubernetes.io/name"
1515
podMetricsEndpoints:
16-
- port: "web"
17-
path: "/metrics"
18-
- port: "reloader-web"
19-
path: "/metrics"
16+
- path: /metrics
17+
port: web
18+
metricRelabelings:
19+
- action: keep
20+
regex: alertmanager.*
21+
sourceLabels:
22+
- __name__
23+
- path: /metrics
24+
port: reloader-web
25+
metricRelabelings:
26+
- action: keep
27+
regex: reloader.*
28+
sourceLabels:
29+
- __name__

charts/monitoring/templates/default-monitors/dns.yaml

+13-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,19 @@ spec:
1111
k8s-app: "kube-dns"
1212
namespaceSelector:
1313
matchNames:
14-
- "kube-system"
14+
- kube-system
1515
jobLabel: "k8s-app"
1616
podMetricsEndpoints:
17-
- port: "metrics"
17+
- path: /metrics
18+
port: metrics
19+
metricRelabelings:
20+
- action: keep
21+
regex: (coredns.*|skydns.*|kubedns.*)
22+
sourceLabels:
23+
- __name__
24+
- action: drop
25+
# deprecated metrics, don't count on them
26+
regex: coredns_cache_misses_total
27+
sourceLabels:
28+
- __name__
1829
{{- end }}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: {{ include "mia-monitoring.kubelet.fullname" . | quote }}
5+
labels:
6+
{{- include "mia-monitoring.labels" . | nindent 4 }}
7+
spec:
8+
selector:
9+
matchLabels:
10+
app.kubernetes.io/name: kubelet
11+
namespaceSelector:
12+
matchNames:
13+
- kube-system
14+
jobLabel: "app.kubernetes.io/name"
15+
endpoints:
16+
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
17+
honorLabels: true
18+
interval: 30s
19+
metricRelabelings:
20+
- action: drop
21+
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
22+
sourceLabels:
23+
- __name__
24+
- action: drop
25+
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
26+
sourceLabels:
27+
- __name__
28+
- action: drop
29+
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes)
30+
sourceLabels:
31+
- __name__
32+
- action: drop
33+
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
34+
sourceLabels:
35+
- __name__
36+
- action: drop
37+
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
38+
sourceLabels:
39+
- __name__
40+
- action: drop
41+
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
42+
sourceLabels:
43+
- __name__
44+
- action: drop
45+
regex: transformation_(transformation_latencies_microseconds|failures_total)
46+
sourceLabels:
47+
- __name__
48+
- action: drop
49+
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
50+
sourceLabels:
51+
- __name__
52+
port: https-metrics
53+
relabelings:
54+
- action: replace
55+
sourceLabels:
56+
- __metrics_path__
57+
targetLabel: metrics_path
58+
scheme: https
59+
tlsConfig:
60+
insecureSkipVerify: true
61+
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
62+
honorLabels: true
63+
honorTimestamps: false
64+
interval: 30s
65+
metricRelabelings:
66+
- action: drop
67+
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
68+
sourceLabels:
69+
- __name__
70+
- action: drop
71+
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
72+
sourceLabels:
73+
- __name__
74+
- pod
75+
- namespace
76+
- action: drop
77+
regex: (container_blkio_device_usage_total);.+
78+
sourceLabels:
79+
- __name__
80+
- container
81+
path: /metrics/cadvisor
82+
port: https-metrics
83+
relabelings:
84+
- action: replace
85+
sourceLabels:
86+
- __metrics_path__
87+
targetLabel: metrics_path
88+
scheme: https
89+
tlsConfig:
90+
insecureSkipVerify: true
91+
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
92+
honorLabels: true
93+
interval: 30s
94+
path: /metrics/probes
95+
port: https-metrics
96+
relabelings:
97+
- action: replace
98+
sourceLabels:
99+
- __metrics_path__
100+
targetLabel: metrics_path
101+
scheme: https
102+
tlsConfig:
103+
insecureSkipVerify: true

charts/monitoring/templates/default-monitors/kubestatemerics.yaml

+24-4
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,27 @@ spec:
1313
- {{ .Release.Namespace | quote }}
1414
jobLabel: "app.kubernetes.io/name"
1515
podMetricsEndpoints:
16-
- port: "web"
17-
honorLabels: true
18-
- port: "metrics"
19-
honorLabels: true
16+
- path: /metrics
17+
port: self-metrics
18+
honorLabels: true
19+
interval: 30s
20+
metricRelabelings:
21+
- action: keep
22+
regex: kube.*
23+
sourceLabels:
24+
- __name__
25+
- path: /metrics
26+
port: k8s-metrics
27+
interval: 30s
28+
relabelings:
29+
- action: labeldrop
30+
regex: (pod|service|endpoint|namespace)
31+
metricRelabelings:
32+
- action: drop
33+
regex: (kube_endpoint_address_not_ready|kube_endpoint_address_available)
34+
sourceLabels:
35+
- __name__
36+
- action: keep
37+
regex: kube.*
38+
sourceLabels:
39+
- __name__

charts/monitoring/templates/default-monitors/nodeexporter.yaml

+14-2
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,17 @@ spec:
1313
- {{ .Release.Namespace | quote }}
1414
jobLabel: "app.kubernetes.io/name"
1515
podMetricsEndpoints:
16-
- port: "metrics"
17-
scrapeTimeout: "10s"
16+
- path: /metrics
17+
port: metrics
18+
relabelings:
19+
- action: replace
20+
regex: (.*)
21+
replacement: $1
22+
sourceLabels:
23+
- __meta_kubernetes_pod_node_name
24+
targetLabel: instance
25+
metricRelabelings:
26+
- action: keep
27+
regex: node.*
28+
sourceLabels:
29+
- __name__

charts/monitoring/templates/default-monitors/operator.yaml

+15-10
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@ spec:
1313
- {{ .Release.Namespace | quote }}
1414
jobLabel: "app.kubernetes.io/name"
1515
endpoints:
16-
- port: "websecure"
17-
scheme: "https"
18-
tlsConfig:
19-
serverName: {{ include "mia-monitoring.fullname" . | quote }}
20-
ca:
21-
secret:
22-
name: {{ include "mia-monitoring.tlsSecretName" . | quote }}
23-
key: ca
24-
optional: false
25-
honorLabels: true
16+
- port: websecure
17+
path: /metrics
18+
scheme: https
19+
tlsConfig:
20+
serverName: {{ include "mia-monitoring.fullname" . | quote }}
21+
ca:
22+
secret:
23+
name: {{ include "mia-monitoring.tlsSecretName" . | quote }}
24+
key: ca
25+
optional: false
26+
metricRelabelings:
27+
- action: keep
28+
regex: prometheus_operator.*
29+
sourceLabels:
30+
- __name__

charts/monitoring/templates/default-monitors/prometheus.yaml

+14-4
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,17 @@ spec:
1313
- {{ .Release.Namespace | quote }}
1414
jobLabel: "app.kubernetes.io/name"
1515
podMetricsEndpoints:
16-
- port: "web"
17-
path: "/metrics"
18-
- port: "reloader-web"
19-
path: "/metrics"
16+
- path: /metrics
17+
port: web
18+
metricRelabelings:
19+
- action: keep
20+
regex: prometheus.*
21+
sourceLabels:
22+
- __name__
23+
- path: /metrics
24+
port: reloader-web
25+
metricRelabelings:
26+
- action: keep
27+
regex: reloader.*
28+
sourceLabels:
29+
- __name__

0 commit comments

Comments
 (0)