Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 560fdc6

Browse files
committed Jul 26, 2023
ROX-17469: implemented sli/alerts for central api latencies
fixed conflicts
1 parent 33c2f15 commit 560fdc6

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed
 

‎resources/prometheus/prometheus-rules.yaml

+48
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,28 @@ spec:
482482
(1 - central:sli:availability:extended_avg_over_time1h) / (1 - scalar(central:slo:availability))
483483
record: central:slo:availability:burnrate1h
484484
485+
- name: rhacs-central-grpc-api-latency.sli
486+
rules:
487+
# SLI: 90th percentile of central's gRPC request handling latency per namespace, estimated from bucket rates over the last 10 minutes.
488+
# The grpc_method matcher excludes long-running and synchronous gRPC APIs (e.g. ScanImage, RunReport).
489+
- expr: |
490+
histogram_quantile(0.9, sum by(le, namespace) (rate(grpc_server_handling_seconds_bucket{container="central", grpc_method!~"ScanImageInternal|DeleteImages|EnrichLocalImageInternal|RunReport|ScanImage|TriggerExternalBackup"}[10m])))
491+
record: central:grpc_server_handling_seconds:rate10m:quantile90
492+
493+
- name: rhacs-central-http-api-latency.sli
494+
rules:
495+
# SLI: 90th percentile of central's HTTP API request latency per namespace, estimated from bucket rates over the last 10 minutes.
496+
- expr: |
497+
histogram_quantile(0.9, sum by(le, namespace) (rate(http_incoming_request_duration_histogram_seconds_bucket{container="central"}[10m])))
498+
record: central:http_incoming_request_duration_seconds:rate10m:quantile90
499+
500+
- name: rhacs-central-graphql-api-latency.sli
501+
rules:
502+
# SLI: 90th percentile of central's GraphQL query latency per namespace, estimated from bucket rates over the last 10 minutes.
# NOTE(review): this metric name carries no `_seconds` suffix - confirm its unit before comparing the recorded value against a seconds-based threshold.
503+
- expr: |
504+
histogram_quantile(0.9, sum by(namespace, le) (rate(rox_central_graphql_query_duration_bucket{container="central"}[10m])))
505+
record: central:rox_central_graphql_query_duration:rate10m:quantile90
506+
485507
- name: rhacs-central.alerts
486508
rules:
487509
- alert: Central availability error budget exhaustion - 90%
@@ -533,6 +555,32 @@ spec:
533555
severity: critical
534556
namespace: "{{ $labels.namespace }}"
535557
rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
560+
rhacs_org_name: "{{ $labels.rhacs_org_name }}"
561+
rhacs_org_id: "{{ $labels.rhacs_org_id }}"
562+
rhacs_cluster_name: "{{ $labels.rhacs_cluster_name }}"
563+
rhacs_environment: "{{ $labels.rhacs_environment }}"
564+
565+
# API latency alerts. Each expression keeps only the recorded p90 series whose value exceeds 0.1
# and counts them without grouping, so a single label-less alert fires when at least one series is above the threshold.
- alert: The 90th percentile of GRPC request latencies is greater than 100ms
566+
expr: |
567+
count(central:grpc_server_handling_seconds:rate10m:quantile90 > 0.1) > 0
568+
annotations:
569+
message: "The 90th percentile of grpc request latencies is greater than 100ms."
570+
571+
- alert: The 90th percentile of HTTP request latencies is greater than 100ms.
572+
expr: |
573+
count(central:http_incoming_request_duration_seconds:rate10m:quantile90 > 0.1) > 0
574+
annotations:
575+
message: "The 90th percentile of HTTP request latencies is greater than 100ms."
576+
577+
# NOTE(review): the GraphQL metric name has no `_seconds` suffix - confirm that 0.1 really corresponds to 100ms for this series.
- alert: The 90th percentile of GraphQL request latencies is greater than 100ms.
578+
expr: |
579+
count(central:rox_central_graphql_query_duration:rate10m:quantile90 > 0.1) > 0
580+
annotations:
581+
message: "The 90th percentile of GraphQL request latencies is greater than 100ms."
582+
536584
- name: az-resources
537585
rules:
538586
- record: strictly_worker_nodes

‎resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml

+58
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ evaluation_interval: 30s
66
# Rule groups are evaluated in the order listed here. The latency SLI groups come before
# rhacs-central.alerts because the latency alerts read the series those groups record.
group_eval_order:
77
- rhacs-central.sli
88
- rhacs-central.slo
9+
- rhacs-central-grpc-api-latency.sli
10+
- rhacs-central-http-api-latency.sli
11+
- rhacs-central-graphql-api-latency.sli
912
- rhacs-central.alerts
1013

1114
tests:
@@ -186,3 +189,58 @@ tests:
186189
exp_annotations:
187190
message: "High availability burn rate for central. Current burn rate per hour: 59.17."
188191
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
192+
193+
194+
# Test the recording rules and threshold expressions behind the alerts "The 90th percentile of GRPC/HTTP/GraphQL request latencies is greater than 100ms."
195+
- interval: 1m
196+
input_series:
197+
- series: http_incoming_request_duration_histogram_seconds_bucket{container="central", namespace="rhacs-abc", le="10"}
198+
values: "1+1x10"
199+
- series: http_incoming_request_duration_histogram_seconds_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
200+
values: "1+1x10"
201+
- series: grpc_server_handling_seconds_bucket{container="central", namespace="rhacs-abc", le="10"}
202+
values: "1+1x10"
203+
- series: grpc_server_handling_seconds_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
204+
values: "1+1x10"
205+
- series: rox_central_graphql_query_duration_bucket{container="central", namespace="rhacs-abc", le="10"}
206+
values: "1+1x10"
207+
- series: rox_central_graphql_query_duration_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
208+
values: "1+1x10"
209+
# Rather than crafting realistic histogram samples and firing the alerts end-to-end,
210+
# we evaluate the building blocks of the alert expressions, which is much simpler.
# NOTE(review): the threshold checks below use `count by(namespace)`, whereas the alert rules use an ungrouped `count(...) > 0` - consider adding alert_rule_test cases.
211+
promql_expr_test:
212+
# GRPC
213+
- expr: central:grpc_server_handling_seconds:rate10m:quantile90
214+
eval_time: 10m
215+
exp_samples:
216+
# Obtained by trial-and-error; consistent with the le="10" and le="+Inf" buckets growing at the same rate, so the 0.9 quantile interpolates to 0.9 * 10 = 9.
217+
- labels: '{__name__="central:grpc_server_handling_seconds:rate10m:quantile90", namespace="rhacs-abc"}'
218+
value: 9
219+
- expr: count by(namespace) (central:grpc_server_handling_seconds:rate10m:quantile90 > 0.1)
220+
eval_time: 10m
221+
exp_samples:
222+
- value: 1
223+
labels: '{namespace="rhacs-abc"}'
224+
# HTTP
225+
- expr: central:http_incoming_request_duration_seconds:rate10m:quantile90
226+
eval_time: 10m
227+
exp_samples:
228+
# Obtained by trial-and-error; consistent with the le="10" and le="+Inf" buckets growing at the same rate, so the 0.9 quantile interpolates to 0.9 * 10 = 9.
229+
- value: 9
230+
labels: '{__name__="central:http_incoming_request_duration_seconds:rate10m:quantile90", namespace="rhacs-abc"}'
231+
- expr: count by(namespace) (central:http_incoming_request_duration_seconds:rate10m:quantile90 > 0.1)
232+
eval_time: 10m
233+
exp_samples:
234+
- value: 1
235+
labels: '{namespace="rhacs-abc"}'
236+
# GraphQL
- expr: central:rox_central_graphql_query_duration:rate10m:quantile90
237+
eval_time: 10m
238+
exp_samples:
239+
# Obtained by trial-and-error; consistent with the le="10" and le="+Inf" buckets growing at the same rate, so the 0.9 quantile interpolates to 0.9 * 10 = 9.
240+
- value: 9
241+
labels: '{__name__="central:rox_central_graphql_query_duration:rate10m:quantile90", namespace="rhacs-abc"}'
242+
- expr: count by(namespace) (central:rox_central_graphql_query_duration:rate10m:quantile90 > 0.1)
243+
eval_time: 10m
244+
exp_samples:
245+
- value: 1
246+
labels: '{namespace="rhacs-abc"}'

0 commit comments

Comments
 (0)
Please sign in to comment.