From 60b0feebac56972e826879ac9f15f547a67da73d Mon Sep 17 00:00:00 2001 From: Eugene Jahn Date: Tue, 5 May 2026 15:26:15 -0400 Subject: [PATCH 1/5] monitoring: add consolidated workloads CPU/memory dashboard Adds a single GCP Monitoring dashboard that surfaces CPU and memory across all Sigstore GKE workloads (grouped by namespace / container), so oncall does not have to navigate multiple metric pages while investigating resource issues. The dashboard includes: - CPU usage in cores (rate of core_usage_time) - Memory used (non-evictable bytes) - CPU/memory limit utilization (REDUCE_MAX so a hot replica is visible) - CPU/memory request utilization (REDUCE_MAX) - Container restart deltas - Node CPU allocatable utilization Resolves sigstore/public-good-instance#1122 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Eugene Jahn --- gcp/modules/monitoring/infra/dashboards.tf | 8 + gcp/modules/monitoring/infra/workloads.json | 350 ++++++++++++++++++++ 2 files changed, 358 insertions(+) create mode 100644 gcp/modules/monitoring/infra/workloads.json diff --git a/gcp/modules/monitoring/infra/dashboards.tf b/gcp/modules/monitoring/infra/dashboards.tf index a5070b30..f7e5f8a0 100644 --- a/gcp/modules/monitoring/infra/dashboards.tf +++ b/gcp/modules/monitoring/infra/dashboards.tf @@ -72,3 +72,11 @@ resource "google_monitoring_dashboard" "rekor_v1" { rekor_url = var.rekor_url }) } + +# Consolidated CPU/memory dashboard for all Sigstore GKE workloads. 
+# See https://github.com/sigstore/public-good-instance/issues/1122 +resource "google_monitoring_dashboard" "workloads" { + project = var.project_id + + dashboard_json = file("${path.module}/workloads.json") +} diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json new file mode 100644 index 00000000..5407d41c --- /dev/null +++ b/gcp/modules/monitoring/infra/workloads.json @@ -0,0 +1,350 @@ +{ + "displayName": "Workloads CPU & Memory", + "mosaicLayout": { + "columns": 12, + "tiles": [ + { + "xPos": 0, + "yPos": 0, + "width": 12, + "height": 4, + "widget": { + "title": "Overview", + "text": { + "content": "Consolidated CPU and memory view for all Sigstore GKE workloads (Fulcio, Rekor, CTLog, Trillian, Dex, prober, monitoring, etc.). Charts are grouped by `namespace` / `container_name`. Use this dashboard as the first stop when investigating high resource usage during oncall.\n\nMetric source: GKE container metrics (`kubernetes.io/container/*`).", + "format": "MARKDOWN", + "style": { + "fontSize": "FS_LARGE", + "padding": "P_EXTRA_SMALL" + } + } + } + }, + { + "xPos": 0, + "yPos": 4, + "width": 6, + "height": 16, + "widget": { + "title": "CPU usage (cores) by container", + "xyChart": { + "yAxis": { + "label": "cores", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "1" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 4, + "width": 6, + "height": 16, + "widget": { + "title": "Memory used (bytes) by 
container", + "xyChart": { + "yAxis": { + "label": "bytes", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "By" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 20, + "width": 6, + "height": 16, + "widget": { + "title": "CPU limit utilization (% of container limit)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.8, + "color": "YELLOW", + "direction": "ABOVE" + }, + { + "value": 0.95, + "color": "RED", + "direction": "ABOVE" + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/limit_utilization\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 20, + "width": 6, + "height": 16, + "widget": { + "title": "Memory limit utilization (% of container limit)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.8, + "color": "YELLOW", + "direction": "ABOVE" + }, + { + "value": 0.95, + 
"color": "RED", + "direction": "ABOVE" + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/limit_utilization\" resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 36, + "width": 6, + "height": 16, + "widget": { + "title": "CPU request utilization (% of container request)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/request_utilization\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 36, + "width": 6, + "height": 16, + "widget": { + "title": "Memory request utilization (% of container request)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/request_utilization\" 
resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 52, + "width": 6, + "height": 16, + "widget": { + "title": "Container restarts (delta, 5m)", + "xyChart": { + "yAxis": { + "label": "restarts", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "STACKED_BAR", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/restart_count\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "300s", + "perSeriesAligner": "ALIGN_DELTA", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + } + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 52, + "width": 6, + "height": 16, + "widget": { + "title": "Node CPU allocatable utilization", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.9, + "color": "RED", + "direction": "ABOVE" + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.node_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/node/cpu/allocatable_utilization\" resource.type=\"k8s_node\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MEAN", + "groupByFields": [ + "resource.label.node_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + } + ] + } +} \ No newline at end of file From f9ed726d50d4b34d9c60eba15ab0862a82326835 Mon Sep 
17 00:00:00 2001 From: Eugene Jahn Date: Wed, 6 May 2026 14:00:26 -0400 Subject: [PATCH 2/5] monitoring: drop unsupported threshold color/direction fields The threshold color and direction fields are not allowed in an xyChart, and the dashboard create call rejects them. Keep just the value. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Eugene Jahn --- gcp/modules/monitoring/infra/workloads.json | 22 ++++++--------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json index 5407d41c..94cfd204 100644 --- a/gcp/modules/monitoring/infra/workloads.json +++ b/gcp/modules/monitoring/infra/workloads.json @@ -108,14 +108,10 @@ }, "thresholds": [ { - "value": 0.8, - "color": "YELLOW", - "direction": "ABOVE" + "value": 0.8 }, { - "value": 0.95, - "color": "RED", - "direction": "ABOVE" + "value": 0.95 } ], "dataSets": [ @@ -157,14 +153,10 @@ }, "thresholds": [ { - "value": 0.8, - "color": "YELLOW", - "direction": "ABOVE" + "value": 0.8 }, { - "value": 0.95, - "color": "RED", - "direction": "ABOVE" + "value": 0.95 } ], "dataSets": [ @@ -316,9 +308,7 @@ }, "thresholds": [ { - "value": 0.9, - "color": "RED", - "direction": "ABOVE" + "value": 0.9 } ], "dataSets": [ @@ -347,4 +337,4 @@ } ] } -} \ No newline at end of file +} From 939aeede2bcbbd49c3af73c9d1ff53c561fce775 Mon Sep 17 00:00:00 2001 From: Eugene Jahn Date: Wed, 6 May 2026 14:59:41 -0400 Subject: [PATCH 3/5] monitoring: fix workloads dashboard tile sizing Heights of 16 in a 12-column mosaic produced very tall narrow tiles. Use h=4 (standard) for charts and keep h=4 for the overview banner.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Eugene Jahn --- gcp/modules/monitoring/infra/workloads.json | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json index 94cfd204..707151ff 100644 --- a/gcp/modules/monitoring/infra/workloads.json +++ b/gcp/modules/monitoring/infra/workloads.json @@ -24,7 +24,7 @@ "xPos": 0, "yPos": 4, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "CPU usage (cores) by container", "xyChart": { @@ -61,7 +61,7 @@ "xPos": 6, "yPos": 4, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "Memory used (bytes) by container", "xyChart": { @@ -96,9 +96,9 @@ }, { "xPos": 0, - "yPos": 20, + "yPos": 8, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "CPU limit utilization (% of container limit)", "xyChart": { @@ -141,9 +141,9 @@ }, { "xPos": 6, - "yPos": 20, + "yPos": 8, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "Memory limit utilization (% of container limit)", "xyChart": { @@ -186,9 +186,9 @@ }, { "xPos": 0, - "yPos": 36, + "yPos": 12, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "CPU request utilization (% of container request)", "xyChart": { @@ -223,9 +223,9 @@ }, { "xPos": 6, - "yPos": 36, + "yPos": 12, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "Memory request utilization (% of container request)", "xyChart": { @@ -260,9 +260,9 @@ }, { "xPos": 0, - "yPos": 52, + "yPos": 16, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "Container restarts (delta, 5m)", "xyChart": { @@ -296,9 +296,9 @@ }, { "xPos": 6, - "yPos": 52, + "yPos": 16, "width": 6, - "height": 16, + "height": 4, "widget": { "title": "Node CPU allocatable utilization", "xyChart": { From bc09031151c12034e660c067be37c02d0d17707c Mon Sep 17 00:00:00 2001 From: Eugene Jahn Date: Thu, 7 May 2026 10:46:49 
-0400 Subject: [PATCH 4/5] monitoring: add network, ephemeral storage, node memory, and uptime tiles to workloads dashboard Mirrors the standard GKE Workloads dashboard so oncall does not have to navigate to multiple pages to find resource usage charts: - Pod network received / sent (per namespace) - Ephemeral storage used (per container) - Node memory allocatable utilization (sibling of node CPU) - Running containers per namespace (uptime count) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Eugene Jahn --- gcp/modules/monitoring/infra/workloads.json | 185 ++++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json index 707151ff..311dd1b4 100644 --- a/gcp/modules/monitoring/infra/workloads.json +++ b/gcp/modules/monitoring/infra/workloads.json @@ -334,6 +334,191 @@ ] } } + }, + { + "xPos": 0, + "yPos": 20, + "width": 6, + "height": 4, + "widget": { + "title": "Node memory allocatable utilization", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.9 + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.node_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/node/memory/allocatable_utilization\" resource.type=\"k8s_node\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MEAN", + "groupByFields": [ + "resource.label.node_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 20, + "width": 6, + "height": 4, + "widget": { + "title": "Ephemeral storage used (bytes) by container", + "xyChart": { + "yAxis": { + "label": "bytes", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + 
"legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/ephemeral_storage/used_bytes\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "By" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 24, + "width": 6, + "height": 4, + "widget": { + "title": "Pod network received (bytes/s) by namespace", + "xyChart": { + "yAxis": { + "label": "bytes/s", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + }, + "unitOverride": "By/s" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 24, + "width": 6, + "height": 4, + "widget": { + "title": "Pod network sent (bytes/s) by namespace", + "xyChart": { + "yAxis": { + "label": "bytes/s", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + }, + "unitOverride": "By/s" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 28, + "width": 12, + 
"height": 4, + "widget": { + "title": "Running containers per namespace (uptime samples)", + "xyChart": { + "yAxis": { + "label": "containers", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "STACKED_AREA", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/uptime\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_COUNT", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + } + } + } + ] + } + } } ] } From 0c4b7c1376da058ee4b3baf68827f5fb22669aec Mon Sep 17 00:00:00 2001 From: Eugene Jahn Date: Thu, 7 May 2026 10:52:42 -0400 Subject: [PATCH 5/5] monitoring: count running containers exactly via REDUCE_COUNT Previous tile used ALIGN_COUNT + REDUCE_SUM, which sums sample counts within the alignment window and is an approximation of container count. Switch to ALIGN_MEAN per series + REDUCE_COUNT across series so the y-axis is the exact number of running containers per namespace. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Eugene Jahn --- gcp/modules/monitoring/infra/workloads.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json index 311dd1b4..3c99fd54 100644 --- a/gcp/modules/monitoring/infra/workloads.json +++ b/gcp/modules/monitoring/infra/workloads.json @@ -491,7 +491,7 @@ "width": 12, "height": 4, "widget": { - "title": "Running containers per namespace (uptime samples)", + "title": "Running containers per namespace", "xyChart": { "yAxis": { "label": "containers", @@ -507,8 +507,8 @@ "filter": "metric.type=\"kubernetes.io/container/uptime\" resource.type=\"k8s_container\"", "aggregation": { "alignmentPeriod": "60s", - "perSeriesAligner": "ALIGN_COUNT", - "crossSeriesReducer": "REDUCE_SUM", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_COUNT", "groupByFields": [ "resource.label.namespace_name" ]