From 17b52d572a96eb2ed378563b9695f0c501633306 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 3 Apr 2024 10:30:10 +0100 Subject: [PATCH] Use 5m instead 1m range Signed-off-by: Michel Hollands --- .../meta-monitoring/src/rules/loki-rules.yaml | 40 +- .../src/rules/mimir-rules.yaml | 374 +++++++++--------- .../src/rules/tempo-rules.yaml | 12 +- 3 files changed, 205 insertions(+), 221 deletions(-) diff --git a/charts/meta-monitoring/src/rules/loki-rules.yaml b/charts/meta-monitoring/src/rules/loki-rules.yaml index d25cd98..8816cd1 100644 --- a/charts/meta-monitoring/src/rules/loki-rules.yaml +++ b/charts/meta-monitoring/src/rules/loki-rules.yaml @@ -1,53 +1,53 @@ groups: - name: "loki_rules" rules: - - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:loki_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:loki_request_duration_seconds:50quantile" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:loki_request_duration_seconds:avg" - - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)" + - expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)" + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:loki_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)" + - expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:loki_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:loki_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:loki_request_duration_seconds:50quantile" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)" + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:loki_request_duration_seconds:avg" - - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + - expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job, route)" record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)" + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)" record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)" + - expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:loki_request_duration_seconds:avg" - - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + - expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)" record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + - expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate" diff --git a/charts/meta-monitoring/src/rules/mimir-rules.yaml b/charts/meta-monitoring/src/rules/mimir-rules.yaml index 1a565fb..f98ab2b 100644 --- a/charts/meta-monitoring/src/rules/mimir-rules.yaml +++ b/charts/meta-monitoring/src/rules/mimir-rules.yaml @@ -1,322 +1,317 @@ groups: - name: "mimir_api_1" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_request_duration_seconds:avg" - - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)" + - expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_request_duration_seconds_count:sum_rate" - name: "mimir_api_2" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:cortex_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:cortex_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)" + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route) + / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_request_duration_seconds:avg" - - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + - expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job, route)" record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)" + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)" + - expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate" - name: "mimir_api_3" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace, + job, route) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg" - - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + - expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + - expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate" - name: "mimir_querier_api" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_querier_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_querier_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, + job) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_querier_request_duration_seconds:avg" - - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job, route))" record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, + job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_querier_request_duration_seconds:avg" - - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, job, route)" record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, job, route)" record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, + namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg" - - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate" - name: "mimir_cache" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method))" record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method))" record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, - job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) + - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster, + job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg" - - expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method)" record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_cache_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_cache_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job) + / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_cache_request_duration_seconds:avg" - - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method))" record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method))" record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, - method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job, + method) / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_cache_request_duration_seconds:avg" - - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster, job, method)" record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job, method)" record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate" - name: "mimir_storage" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_kv_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_kv_request_duration_seconds:50quantile" - - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job) + / sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_kv_request_duration_seconds:avg" - - expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate" - name: "mimir_queries" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_query_frontend_retries:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_query_frontend_retries:50quantile" - - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) + - expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_retries:avg" - - expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)" + - expr: "sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate" - - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate" - - expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_retries_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile" - - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster, + job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg" - - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate" - - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster, job)" record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate" - name: "mimir_ingester_queries" rules: - - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_series:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_series:50quantile" - - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) + - expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_series:avg" - - expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_series_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_samples:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_samples:50quantile" - - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) + - expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_samples:avg" - - expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate" - - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_exemplars:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster, job))" record: "cluster_job:cortex_ingester_queried_exemplars:50quantile" - - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job) / + sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_exemplars:avg" - - expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, + - expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster, job)" record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate" - - expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)" + - expr: "sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)" record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate" - name: "mimir_received_samples" rules: - - expr: "| - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))" + - expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))" record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m" - name: "mimir_exemplars_in" rules: - - expr: "| - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))" + - expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))" record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m" - name: "mimir_received_exemplars" rules: - - expr: "| - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))" + - expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))" record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m" - name: "mimir_exemplars_ingested" rules: - - expr: "| - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))" + - expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))" record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m" - name: "mimir_exemplars_appended" rules: - - expr: "| - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))" + - expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))" record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m" - name: "mimir_scaling_rules" rules: - - expr: "| + - expr: | # Convenience rule to get the number of replicas for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. sum by (cluster, namespace, deployment) ( label_replace( kube_deployment_spec_replicas, - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) or sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\") - )" + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") + ) record: "cluster_namespace_deployment:actual_replicas:count" - - expr: "| + - expr: | ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( @@ -324,21 +319,21 @@ groups: )[24h:] ) / 240000 - )" + ) labels: deployment: "distributor" reason: "sample_rate" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) * 0.59999999999999998 / 240000 - )" + ) labels: deployment: "distributor" reason: "sample_rate_limits" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( @@ -346,12 +341,12 @@ groups: )[24h:] ) * 3 / 80000 - )" + ) labels: deployment: "ingester" reason: "sample_rate" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( quantile_over_time(0.99, sum by(cluster, namespace) ( @@ -359,59 +354,59 @@ groups: )[24h:] ) / 1500000 - )" + ) labels: deployment: "ingester" reason: "active_series" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) * 3 * 0.59999999999999998 / 1500000 - )" + ) labels: deployment: "ingester" reason: "active_series_limits" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) * 0.59999999999999998 / 80000 - )" + ) labels: deployment: "ingester" reason: "sample_rate_limits" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | ceil( (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"} + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} ) / 4) / avg by (cluster, namespace) ( - memcached_limit_bytes{job=~\".+/memcached\"} + memcached_limit_bytes{job=~".+/memcached"} ) - )" + ) labels: deployment: "memcached" reason: "active_series" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | sum by (cluster, namespace, deployment) ( label_replace( label_replace( - sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[5m])), + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) - )" + ) record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate" - - expr: "| + - expr: | # Convenience rule to get the CPU request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 @@ -424,11 +419,11 @@ groups: label_replace( label_replace( kube_pod_container_resource_requests_cpu_cores, - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -439,17 +434,17 @@ groups: sum by (cluster, namespace, deployment) ( label_replace( label_replace( - kube_pod_container_resource_requests{resource=\"cpu\"}, - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) - )" + ) record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum" - - expr: "| + - expr: | # Jobs should be sized to their CPU usage. # We do this by comparing 99th percentile usage over the last 24hrs to # their current provisioned #replicas and resource requests. @@ -459,28 +454,28 @@ groups: quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - )" + ) labels: reason: "cpu_usage" record: "cluster_namespace_deployment_reason:required_replicas:count" - - expr: "| + - expr: | # Convenience rule to get the Memory utilization for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. sum by (cluster, namespace, deployment) ( label_replace( label_replace( - container_memory_usage_bytes{image!=\"\"}, - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + container_memory_usage_bytes{image!=""}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) - )" + ) record: "cluster_namespace_deployment:container_memory_usage_bytes:sum" - - expr: "| + - expr: | # Convenience rule to get the Memory request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 @@ -493,11 +488,11 @@ groups: label_replace( label_replace( kube_pod_container_resource_requests_memory_bytes, - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -508,17 +503,17 @@ groups: sum by (cluster, namespace, deployment) ( label_replace( label_replace( - kube_pod_container_resource_requests{resource=\"memory\"}, - \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), - # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) - )" + ) record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum" - - expr: "| + - expr: | # Jobs should be sized to their Memory usage. # We do this by comparing 99th percentile usage over the last 24hrs to # their current provisioned #replicas and resource requests. @@ -528,44 +523,33 @@ groups: quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - )" + ) labels: reason: "memory_usage" record: "cluster_namespace_deployment_reason:required_replicas:count" - name: "mimir_alertmanager_rules" rules: - - expr: "| - sum by (cluster, job, pod) (cortex_alertmanager_alerts)" + - expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)" record: "cluster_job_pod:cortex_alertmanager_alerts:sum" - - expr: "| - sum by (cluster, job, pod) (cortex_alertmanager_silences)" + - expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)" record: "cluster_job_pod:cortex_alertmanager_silences:sum" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))" record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))" record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m" - - expr: "| - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))" + - expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))" record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m" - - expr: "| - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))" + - expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))" record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))" record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))" record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))" record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m" - - expr: "| - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))" + - expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))" record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m" - name: "mimir_ingester_rules" rules: - - expr: "| - sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))" + - expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))" record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m" diff --git a/charts/meta-monitoring/src/rules/tempo-rules.yaml b/charts/meta-monitoring/src/rules/tempo-rules.yaml index 27ac873..f3c33af 100644 --- a/charts/meta-monitoring/src/rules/tempo-rules.yaml +++ b/charts/meta-monitoring/src/rules/tempo-rules.yaml @@ -1,15 +1,15 @@ groups: - name: "tempo_rules" rules: - - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))" record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile" - - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg" - - expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"