ROX-17469: implemented sli/alerts for central api latencies

pepedocs · pepedocs · commit 560fdc608293 · 2023-07-27T09:52:06.000+10:00
fixed conflicts
diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
@@ -482,6 +482,28 @@ spec:
             (1 - central:sli:availability:extended_avg_over_time1h) / (1 - scalar(central:slo:availability))
           record: central:slo:availability:burnrate1h
 
+    - name: rhacs-central-grpc-api-latency.sli
+      rules:
+        # 90th-percentile latency of the GRPC requests handled by central, per namespace, over the last 10 minutes.
+        # Long-running and synchronous GRPC methods are excluded through the grpc_method filter.
+        - expr: |
+            histogram_quantile(0.9, sum  by(le, namespace) (rate(grpc_server_handling_seconds_bucket{container="central", grpc_method!~"ScanImageInternal|DeleteImages|EnrichLocalImageInternal|RunReport|ScanImage|TriggerExternalBackup"}[10m])))
+          record: central:grpc_server_handling_seconds:rate10m:quantile90
+
+    - name: rhacs-central-http-api-latency.sli
+      rules:
+        # 90th-percentile latency of the HTTP requests handled by central, per namespace, over the last 10 minutes.
+        - expr: |
+            histogram_quantile(0.9, sum by(le, namespace) (rate(http_incoming_request_duration_histogram_seconds_bucket{container="central"}[10m])))
+          record: central:http_incoming_request_duration_seconds:rate10m:quantile90
+
+    - name: rhacs-central-graphql-api-latency.sli
+      rules:
+        # 90th-percentile latency of the GraphQL queries handled by central, per namespace, over the last 10 minutes.
+        - expr: |
+            histogram_quantile(0.9, sum by(namespace, le) (rate(rox_central_graphql_query_duration_bucket{container="central"}[10m])))
+          record: central:rox_central_graphql_query_duration:rate10m:quantile90
+
     - name: rhacs-central.alerts
       rules:
         - alert: Central availability error budget exhaustion - 90%
@@ -533,6 +555,37 @@ spec:
             severity: critical
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
+            # NOTE(review): these four labels sat on the incoming side of an unresolved merge
+            # conflict whose HEAD side was empty - confirm HEAD did not intentionally drop them.
+            rhacs_org_name: "{{ $labels.rhacs_org_name }}"
+            rhacs_org_id: "{{ $labels.rhacs_org_id }}"
+            rhacs_cluster_name: "{{ $labels.rhacs_cluster_name }}"
+            rhacs_environment: "{{ $labels.rhacs_environment }}"
+
+        # Fires when at least one recorded GRPC p90 latency series is above 0.1 (seconds, per the metric name).
+        # NOTE(review): count() without by(namespace) discards the namespace label, and no severity
+        # label or `for:` duration is set - confirm this is intended.
+        - alert: The 90th percentile of GRPC request latencies is greater than 100ms
+          expr: |
+            count(central:grpc_server_handling_seconds:rate10m:quantile90 > 0.1) > 0
+          annotations:
+            message: "The 90th percentile of grpc request latencies is greater than 100ms."
+
+        # Fires when at least one recorded HTTP p90 latency series is above 0.1 (seconds, per the metric name).
+        - alert: The 90th percentile of HTTP request latencies is greater than 100ms.
+          expr: |
+            count(central:http_incoming_request_duration_seconds:rate10m:quantile90 > 0.1) > 0
+          annotations:
+            message: "The 90th percentile of HTTP request latencies is greater than 100ms."
+
+        # Fires when at least one recorded GraphQL p90 latency series is above 0.1.
+        # NOTE(review): the source metric name carries no unit suffix - confirm it is in seconds so 0.1 means 100ms.
+        - alert: The 90th percentile of GraphQL request latencies is greater than 100ms.
+          expr: |
+            count(central:rox_central_graphql_query_duration:rate10m:quantile90 > 0.1) > 0
+          annotations:
+            message: "The 90th percentile of GraphQL request latencies is greater than 100ms."
+
     - name: az-resources
       rules:
         - record: strictly_worker_nodes
diff --git a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
@@ -6,6 +6,9 @@ evaluation_interval: 30s
 group_eval_order:
   - rhacs-central.sli
   - rhacs-central.slo
+  - rhacs-central-grpc-api-latency.sli  # latency SLI groups are listed ahead of the alerts group that reads their recordings
+  - rhacs-central-http-api-latency.sli
+  - rhacs-central-graphql-api-latency.sli
   - rhacs-central.alerts
 
 tests:
@@ -186,3 +189,58 @@ tests:
             exp_annotations:
               message: "High availability burn rate for central. Current burn rate per hour: 59.17."
               sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+
+
+  # Test the recording rules and threshold comparison behind the "90th percentile of GRPC/HTTP/GraphQL request latencies is greater than 100ms" alerts.
+  - interval: 1m
+    input_series:
+      - series: http_incoming_request_duration_histogram_seconds_bucket{container="central", namespace="rhacs-abc", le="10"}
+        values: "1+1x10"
+      - series: http_incoming_request_duration_histogram_seconds_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
+        values: "1+1x10"
+      - series: grpc_server_handling_seconds_bucket{container="central", namespace="rhacs-abc", le="10"}
+        values: "1+1x10"
+      - series: grpc_server_handling_seconds_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
+        values: "1+1x10"
+      - series: rox_central_graphql_query_duration_bucket{container="central", namespace="rhacs-abc", le="10"}
+        values: "1+1x10"
+      - series: rox_central_graphql_query_duration_bucket{container="central", namespace="rhacs-abc", le="+Inf"}
+        values: "1+1x10"
+    # Rather than crafting realistic histogram samples and predicting Prometheus's quantile
+    # estimate for them, we test the components of the alerts' expressions, which is much simpler.
+    promql_expr_test:
+      # GRPC
+      - expr: central:grpc_server_handling_seconds:rate10m:quantile90
+        eval_time: 10m
+        exp_samples:
+          # Obtained by trial and error; matches linear interpolation within the [0, 10] bucket (0.9 * 10).
+          - labels: '{__name__="central:grpc_server_handling_seconds:rate10m:quantile90", namespace="rhacs-abc"}'
+            value: 9
+      - expr: count by(namespace) (central:grpc_server_handling_seconds:rate10m:quantile90 > 0.1)
+        eval_time: 10m
+        exp_samples:
+          - value: 1
+            labels: '{namespace="rhacs-abc"}'
+      # HTTP
+      - expr: central:http_incoming_request_duration_seconds:rate10m:quantile90
+        eval_time: 10m
+        exp_samples:
+          # Obtained by trial and error; matches linear interpolation within the [0, 10] bucket (0.9 * 10).
+          - value: 9
+            labels: '{__name__="central:http_incoming_request_duration_seconds:rate10m:quantile90", namespace="rhacs-abc"}'
+      - expr: count by(namespace) (central:http_incoming_request_duration_seconds:rate10m:quantile90 > 0.1)
+        eval_time: 10m
+        exp_samples:
+          - value: 1
+            labels: '{namespace="rhacs-abc"}'
+      - expr: central:rox_central_graphql_query_duration:rate10m:quantile90
+        eval_time: 10m
+        exp_samples:
+          # Obtained by trial and error; matches linear interpolation within the [0, 10] bucket (0.9 * 10).
+          - value: 9
+            labels: '{__name__="central:rox_central_graphql_query_duration:rate10m:quantile90", namespace="rhacs-abc"}'
+      - expr: count by(namespace) (central:rox_central_graphql_query_duration:rate10m:quantile90 > 0.1)
+        eval_time: 10m
+        exp_samples:
+          - value: 1
+            labels: '{namespace="rhacs-abc"}'