diff --git a/.github/actions/run-e2e/action.yaml b/.github/actions/run-e2e/action.yaml index ce60c6b84f8..f7ddc26aee4 100644 --- a/.github/actions/run-e2e/action.yaml +++ b/.github/actions/run-e2e/action.yaml @@ -35,7 +35,9 @@ runs: env: ARTIFACTS_DIR: ${{ runner.temp }}/e2e-artifacts run: | - set -x + set -euExo pipefail + shopt -s inherit_errexit + mkdir "${ARTIFACTS_DIR}" echo "ARTIFACTS_DIR=${ARTIFACTS_DIR}" | tee -a ${GITHUB_ENV} - uses: actions/download-artifact@v3 @@ -45,7 +47,9 @@ runs: - name: Load image shell: bash run: | - set -x + set -euExo pipefail + shopt -s inherit_errexit + unlz4 ~/operatorimage.tar.lz4 - | docker load # docker looses the registry part on save/load docker tag "$( echo "${image_repo_ref}:ci" | sed -E -e 's~[^/]+/(.*)~\1~' )" "${image_repo_ref}:ci" @@ -57,7 +61,9 @@ runs: - name: Install tools shell: bash run: | - set -x + set -euExo pipefail + shopt -s inherit_errexit + go install github.com/mikefarah/yq/v4@v4.6.1 - name: Setup minikube uses: ./go/src/github.com/scylladb/scylla-operator/.github/actions/setup-minikube @@ -67,7 +73,8 @@ runs: env: SCYLLA_OPERATOR_FEATURE_GATES: '${{ inputs.featureGates }}' run: | - set -x + set -euExo pipefail + shopt -s inherit_errexit timeout 10m ./hack/ci-deploy.sh '${{ env.image_repo_ref }}:ci' @@ -83,11 +90,15 @@ runs: shell: bash if: ${{ github.event_name != 'pull_request' }} run: | + set -euExo pipefail + shopt -s inherit_errexit + echo "FLAKE_ATTEMPTS=5" | tee -a ${GITHUB_ENV} - name: Run e2e shell: bash run: | set -euExo pipefail + shopt -s inherit_errexit e2e_timeout_minutes='${{ inputs.baseTimeoutMinutes }}' flake_attempts=0 @@ -96,7 +107,10 @@ runs: e2e_timeout_minutes="$(( ${e2e_timeout_minutes} + ${flake_attempts} * 10 ))" fi - docker run --user="$( id -u ):$( id -g )" --rm \ + user="$( id -u )" + group="$( id -g )" + ingress_address="$( kubectl -n haproxy-ingress get svc haproxy-ingress --template='{{ .spec.clusterIP }}' )" + docker run --user="${user}:${group}" --rm \ 
--entrypoint=/usr/bin/scylla-operator-tests \ -v="${ARTIFACTS_DIR}:${ARTIFACTS_DIR}:rw" \ -v="${HOME}/.kube/config:/kubeconfig:ro" -e='KUBECONFIG=/kubeconfig' \ @@ -106,13 +120,18 @@ runs: --artifacts-dir="${ARTIFACTS_DIR}" \ --flake-attempts="${flake_attempts}" \ --timeout="${e2e_timeout_minutes}m" \ - --feature-gates='${{ inputs.featureGates }}' + --feature-gates='${{ inputs.featureGates }}' \ + --override-ingress-address="${ingress_address}" \ ${{ inputs.extraArgs }} - name: Dump cluster state if: ${{ always() }} working-directory: ${{ runner.temp }} shell: bash - run: timeout 10m ${{ inputs.repositoryPath }}/hack/ci-gather-artifacts.sh + run: | + set -euExo pipefail + shopt -s inherit_errexit + + timeout 10m ${{ inputs.repositoryPath }}/hack/ci-gather-artifacts.sh - name: Get machine logs and info if: ${{ always() }} working-directory: ${{ runner.temp }}/e2e-artifacts @@ -142,7 +161,9 @@ runs: working-directory: ${{ runner.temp }}/e2e-artifacts shell: bash run: | - set -euEx -o pipefail + set -euExo pipefail + shopt -s inherit_errexit + sudo cat $( ls /var/log/kube-apiserver-audit*.log | sort -n ) > ./kube-apiserver-audit.log jq -s 'group_by(.user.username) | map({"user": .[0].user.username, "total": length, "verbs": (group_by(.verb) | map({"key":.[0].verb, "value": length}) | from_entries)}) | sort_by(.total) | reverse' ./kube-apiserver-audit.log > ./api-call-stats.json - name: Compress artifacts @@ -150,7 +171,9 @@ runs: working-directory: ${{ runner.temp }} shell: bash run: | - set -x + set -euExo pipefail + shopt -s inherit_errexit + tar -c --use-compress-program=lz4 -f ./e2e-artifacts.tar.lz4 "e2e-artifacts/" - name: Upload artifacts if: ${{ always() }} diff --git a/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml b/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml new file mode 100644 index 00000000000..133951f521d --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml @@ -0,0 +1,7 @@ 
+apiVersion: v1 +kind: Secret +metadata: + name: "{{ .name }}" +data: + username: {{ "admin" | toBytes | toBase64 }} + password: {{ .password | toBase64 }} diff --git a/assets/monitoring/grafana/v1alpha1/configs.cm.yaml b/assets/monitoring/grafana/v1alpha1/configs.cm.yaml new file mode 100644 index 00000000000..522020b8742 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/configs.cm.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .scyllaDBMonitoringName }}-grafana-configs" +data: + grafana.ini: | + [auth] + disable_login_form = false + disable_signout_menu = false + + {{ if .enableAnonymousAccess -}} + [auth.anonymous] + enabled = true + {{- end }} + + [dashboards] + default_home_dashboard_path = /var/run/dashboards/scylladb/overview.json + + [log] + level = error + mode = console + + [log.frontend] + enabled = true + + [paths] + data = /var/lib/grafana + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + provisioning = /var/run/configmaps/grafana-provisioning + + [security] + admin_user = $__file{/var/run/secrets/grafana-admin-credentials/username} + admin_password = $__file{/var/run/secrets/grafana-admin-credentials/password} + + [server] + protocol = https + cert_file = /var/run/secrets/grafana-serving-certs/tls.crt + cert_key = /var/run/secrets/grafana-serving-certs/tls.key + + [panels] + disable_sanitize_html = true diff --git a/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml b/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml new file mode 100644 index 00000000000..015759f59f0 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml @@ -0,0 +1,4686 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: '{{ .scyllaDBMonitoringName }}-grafana-scylladb-dashboards' +data: + overview.json: |- + {{`{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + 
"type": "dashboard" + }, + { + "class": "annotation_manager_task", + "datasource": "prometheus", + "enable": true, + "expr": "scylla_manager_task_active_count{type=~\"repair|backup\",cluster=~\"$cluster|$^\"}>0", + "hide": false, + "iconColor": "#73BF69", + "limit": 100, + "name": "Task", + "showIn": 0, + "tagKeys": "type", + "tags": [], + "titleFormat": "Running", + "type": "tags" + }, + { + "class": "mv_building", + "datasource": "prometheus", + "enable": true, + "expr": "sum(scylla_view_builder_builds_in_progress)>0", + "hide": false, + "iconColor": "rgb(50, 176, 0, 128)", + "limit": 100, + "name": "MV", + "showIn": 0, + "tagKeys": "instance,dc,cluster", + "tags": [], + "titleFormat": "Materialized View built", + "type": "tags" + }, + { + "class": "ops_annotation", + "datasource": "prometheus", + "enable": true, + "expr": "10*min(scylla_node_ops_finished_percentage) by (ops, dc,instance) < 10", + "hide": false, + "iconColor": "rgb(50, 176, 0, 128)", + "limit": 100, + "name": "ops", + "showIn": 0, + "tagKeys": "ops,dc,instance", + "tags": [], + "titleFormat": "Operation", + "type": "tags" + }, + { + "class": "annotation_schema_changed", + "datasource": "prometheus", + "enable": false, + "expr": "changes(scylla_database_schema_changed[$__rate_interval])>0", + "hide": false, + "iconColor": "rgba(255, 96, 96, 1)", + "limit": 100, + "name": "Schema Changed", + "showIn": 0, + "tagKeys": "instance,dc,cluster", + "tags": [], + "titleFormat": "schema changed", + "type": "tags" + } + ] + }, + "class": "dashboard", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": true, + "id": null, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [], + "type": "dashboards" + } + ], + "originalTitle": "Scylla Cluster Metrics", + "overwrite": true, + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], 
+ "title": "Cluster overview $cluster", + "type": "row" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "si:" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "sum(rate(scylla_transport_requests_served{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[60s])) + (sum(rate(scylla_thrift_served{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[60s])) or on() vector(0))", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests/s", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "Average Write Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 50000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 3, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(wlatencya{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + 
], + "timeFrom": null, + "timeShift": null, + "title": "Avg Write", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "95% write Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 5, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(wlatencyp95{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "95% Write", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "99% write Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 7, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(wlatencyp99{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + 
"legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "99% Write", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "Average Read Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 50000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 9, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(rlatencya{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg Read", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "95% read Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 11, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(rlatencyp95{by=\"cluster\", 
cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "95% Read", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "99% read Latency", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100000 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 13, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(rlatencyp99{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "99% Read", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "description": "The percentage of the time during which Scylla utilized the CPU. Note that because Scylla does busy polling for some time before going idle, CPU utilization as seen by the operating system may be much higher. 
Your system is not yet CPU-bottlenecked until this metric is high.", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 15, + "y": 1 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"} )", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 17, + "y": 1 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "$func(rate(scylla_database_total_reads_failed{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "R Failed", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 19, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "$func(rate(scylla_database_total_writes_failed{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "W Failed", + "type": "stat" + }, + { + "class": "small_stat", + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 21, + "y": 1 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "sum(rate(scylla_storage_proxy_coordinator_write_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[1m]))", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Timeouts", + "type": "stat" + }, + { + "class": "alert_table", + "columns": [], + "datasource": "alertmanager", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": null, + "filterable": false + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute" + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + } + ] + } + ] + }, + "fontSize": "100%", + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 5 + }, + "id": 13, + "links": [], + "options": { + "showHeader": true + }, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "span": 4, + "targets": [ + { + "active": true, + "annotations": true, + "filters": "job!=\"scylla_manager\",advisor=\"\"", + "legendFormat": "{{description}}", + "refId": "A", + "target": "Query" + } + ], + "title": "Active Alerts", + "transform": "table", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Time", + "summary", + "instance" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "Time": 0, + "instance": 1, + "summary": 2 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Write attempts - include all writes that reached the coordinator node, even if they will eventually fail", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 5 + }, + "id": 14, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes", + "type": "timeseries" + }, + { + "class": "us_panel", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 12, + "y": 5 + }, + "id": 15, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + 
"tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "avg(wlatencyp95{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "A", + "step": 1 + }, + { + "expr": "avg(wlatencyp99{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "B", + "step": 1 + } + ], + "title": "Write Latencies", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Read attempts - include all reads that reached the coordinator node, even if they will eventually fail", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 16, + "y": 5 + }, + "id": 16, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))", + "intervalFactor": 1, 
+ "legendFormat": "Reads", + "refId": "A", + "step": 1 + } + ], + "title": "Reads", + "type": "timeseries" + }, + { + "class": "us_panel", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 5 + }, + "id": 17, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "avg(rlatencyp95{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "A", + "step": 1 + }, + { + "expr": "avg(rlatencyp99{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "B", + "step": 1 + } + ], + "title": "Read Latencies", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + 
"h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 18, + "panels": [], + "title": "", + "type": "row" + }, + { + "class": "plain_text", + "content": "

Advisor

", + "datasource": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 19, + "isNew": true, + "links": [], + "mode": "html", + "options": {}, + "span": 12, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 20, + "options": { + "content": "

Balance

\nAn Imbalance between shards or nodes may indicate a potential problem", + "mode": "html" + }, + "pluginVersion": "7.3.4", + "targets": [ + { + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "", + "type": "text" + }, + { + "class": "advisor_table", + "dashversion": ">4.1", + "datasource": "alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "dashboard" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "/d/${__data.fields.dashboard}-[[dash_version]]?refresh=30s&orgId=1&var-by=instance&from=${__from}&to=${__to}" + } + ] + }, + { + "id": "custom.width", + "value": 100 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "advisor" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Category" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "severity" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Open an issue", + "url": "https://github.com/scylladb/scylla/issues/new?body=description%3D${__data.fields[4]}%0ASource%3DAdvisor%0AScylla-versions%3D${all_scyllas_versions}%0Ascylla-monitoring%3D${monitoring_version}%0Acluster%3D${count_dc}%0Aname%3D${cluster}%0A%0A" + } + ] + }, + { + "id": "mappings", + "value": [ + { + "from": "0", + "id": 1, + "text": "\ud83d\udd14", + "to": "10", + "type": 2, + "value": "" + } + ] + }, + { + "id": "displayName", + "value": "Report" + }, + { + "id": "custom.width", + "value": 65 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "summary" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, 
+ "title": "${__data.fields.description}\n\n click for more information", + "url": "https://monitoring.docs.scylladb.com/branch-master/use-monitoring/advisor/${__data.fields.alertname}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "alertname" + }, + "properties": [ + { + "id": "displayName", + "value": "." + }, + { + "id": "custom.width", + "value": 1 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "description" + }, + "properties": [ + { + "id": "displayName", + "value": "." + }, + { + "id": "custom.width", + "value": 1 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 150 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 21, + "links": [], + "options": { + "showHeader": true + }, + "pluginVersion": "7.3.4", + "targets": [ + { + "active": true, + "annotations": true, + "filters": "advisor!=\"\"", + "legendFormat": "{{description}}", + "refId": "A", + "target": "Query" + } + ], + "title": "", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "advisor", + "dashboard", + "description", + "severity", + "alertname", + "summary", + "Time" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "Time": 1, + "advisor": 2, + "dashboard": 3, + "severity": 0, + "summary": 4 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + "id": 1, + "text": "\u2713", + "to": "0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + 
"value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 16 + }, + "id": 22, + "links": [ + { + "title": "The number of connections per shard should be balanced" + } + ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "max(abs(sum(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by (instance,shard)-scalar(avg(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"})))) - 8", + "hide": false, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Connections", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + "id": 1, + "text": "\u2713", + "to": "0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 14, + "y": 16 + }, + "id": 23, + "links": [ + { + "title": "Indicates that the number of CQL operations (inserts, updates, deletes, reads) is not balance between shards in one of the nodes" + } + ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + 
"textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "max(abs(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3", + "format": "time_series", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "max(abs(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "expr": "max(abs(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "expr": "max(abs(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CQL OPs", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + "id": 1, + "text": "\u2713", + "to": 
"0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 16, + "y": 16 + }, + "id": 24, + "links": [ + { + "title": "A single node with higher latency is an indication for a node related issue" + } + ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "((max(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "((max(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Node Latency", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + 
"id": 1, + "text": "\u2713", + "to": "0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 18, + "y": 16 + }, + "id": 25, + "links": [ + { + "title": "A single shard with high latency is an indication of a hot-partition, or a large row/cell/partition" + } + ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "((max(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "((max(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Shard Latency", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": 
"A shard that reads more from the cache could be an indication for hot partition", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + "id": 1, + "text": "\u2713", + "to": "0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 20, + "y": 16 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "((rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))- scalar(avg(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))))/scalar(stddev(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))+100)-3", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Cache", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "class": "small_stat_error", + "datasource": "prometheus", + "description": "A single shard that reads more from sstables, could be that a node is slow", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "from": "-1000", + "id": 1, + "text": 
"\u2713", + "to": "0.001", + "type": 2, + "value": "" + }, + { + "from": "0.001", + "id": 2, + "text": "\u26a0", + "to": "10000", + "type": 2, + "value": "0.001" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.001 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 22, + "y": 16 + }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "max(abs(scylla_database_active_reads{ dc=~\"$dc\"} - scalar(avg(scylla_database_active_reads{ dc=~\"$dc\"})))/scalar(stddev(scylla_database_active_reads{ dc=~\"$dc\"})+0.001))-3", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "SSTable", + "transformations": [ + { + "id": "calculateField", + "options": { + "mode": "reduceRow", + "reduce": { + "reducer": "max" + }, + "replaceFields": true + } + } + ], + "type": "stat" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 28, + "panels": [], + "repeat": "dc", + "title": "", + "type": "row" + }, + { + "class": "plain_text", + "content": "

Information for $dc

", + "datasource": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 29, + "isNew": true, + "links": [], + "mode": "html", + "options": {}, + "span": 12, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }, + { + "class": "vertical_lcd", + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 1, + "x": 0, + "y": 25 + }, + "id": 30, + "options": { + "displayMode": "lcd", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "avg(scylla_reactor_utilization{cluster=~\"$cluster\", dc=~\"$dc\"} )", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load", + "type": "bargauge" + }, + { + "class": "bytes_panel", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 1, + "y": 25 + }, + "id": 31, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "span": 5, + "targets": [ + { + "expr": "Avg(node_filesystem_size_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])-avg(node_filesystem_avail_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Avg Usage {{[[by]]}}", + "metric": "", + "refId": "A", + "step": 1 + }, + { + "expr": "avg(node_filesystem_size_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])", + "interval": "", + "legendFormat": "Size {{[[by]]}}", + "refId": "B" + } + ], + "title": "Disk Size by $by", + "type": "timeseries" + }, + { + "class": "graph_panel_int", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 4, + "y": 25 + }, + "id": 32, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": 
{ + "mode": "multi", + "sort": "asc" + } + }, + "span": 2, + "targets": [ + { + "expr": "$func(scylla_compaction_manager_compactions{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Running Compactions", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "The Hits and Misses", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 8, + "y": 25 + }, + "id": 33, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "$func(rate(scylla_cache_row_hits{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Hit {{[[by]]}}", + "refId": "A", + "step": 10 + }, + { + "expr": 
"$func(rate(scylla_cache_row_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Misses {{[[by]]}}", + "refId": "B", + "step": 10 + } + ], + "title": "Cache Rows Hits/Misses", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Write attempts - include all writes that reached the coordinator node, even if they will eventually fail", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 14, + "y": 25 + }, + "id": 34, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + { + "alias": "1 Day Ago", + "dashLength": 4, + "dashes": true + }, + { + "alias": "1 Week Ago", + "dashLength": 2, + "dashes": true + } + ], + "span": 3, + "targets": [ + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", 
dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A", + "step": 1 + }, + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m] offset 1d))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "1 Day Ago", + "refId": "B", + "step": 1 + }, + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m] offset 1w))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "1 Week Ago", + "refId": "C", + "step": 1 + } + ], + "title": "Writes", + "type": "timeseries" + }, + { + "class": "us_panel", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 25 + }, + "id": 35, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + 
"span": 2, + "targets": [ + { + "expr": "avg(wlatencyp95{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "95% {{[[by]]}}", + "refId": "A", + "step": 1 + }, + { + "expr": "avg(wlatencyp99{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "99% {{[[by]]}}", + "refId": "B", + "step": 1 + } + ], + "title": "Write Latencies", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Requests that Scylla tried to write but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 31 + }, + "id": 36, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": 
"multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_write_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Writes {{[[by]]}}", + "refId": "A", + "step": 10 + } + ], + "title": "Write Timeouts by [[by]]", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Requests that Scylla tried to read but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 4, + "y": 31 + }, + "id": 37, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": 
"$func(rate(scylla_storage_proxy_coordinator_read_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])+rate(scylla_storage_proxy_coordinator_cas_read_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])+rate(scylla_storage_proxy_coordinator_range_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Read {{[[by]]}}", + "refId": "A", + "step": 10 + } + ], + "title": "Read Timeouts by [[by]]", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "The Hits and Misses", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 8, + "y": 31 + }, + "id": 38, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "$func(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by 
([[by]])-$func(rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Hit {{[[by]]}}", + "refId": "A", + "step": 10 + }, + { + "expr": "$func(rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Misses {{[[by]]}}", + "refId": "B", + "step": 10 + } + ], + "title": "Cache Reads Hits/Misses", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Read attempts - include all reads that reached the coordinator node, even if they will eventually fail", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 14, + "y": 31 + }, + "id": 39, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + { + "alias": "1 Day Ago", + "dashLength": 4, + "dashes": 
true + }, + { + "alias": "1 Week Ago", + "dashLength": 2, + "dashes": true + } + ], + "span": 3, + "targets": [ + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A", + "step": 1 + }, + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m] offset 1d))", + "intervalFactor": 1, + "legendFormat": "1 Day Ago", + "refId": "B", + "step": 1 + }, + { + "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m] offset 1w))", + "intervalFactor": 1, + "legendFormat": "1 Week Ago", + "refId": "C", + "step": 1 + } + ], + "title": "Reads", + "type": "timeseries" + }, + { + "class": "us_panel", + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 31 + }, + "id": 40, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "class": "show_legend", + "current": false, + "max": false, + "min": false, + 
"rightSide": false, + "show": true, + "total": false, + "values": false + }, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 2, + "targets": [ + { + "expr": "avg(rlatencyp95{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by([[by]])", + "intervalFactor": 1, + "legendFormat": "95% {{[[by]]}}", + "refId": "A", + "step": 1 + }, + { + "expr": "avg(rlatencyp99{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by([[by]])", + "intervalFactor": 1, + "legendFormat": "99% {{[[by]]}}", + "refId": "B", + "step": 1 + } + ], + "title": "Read Latencies", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Number of CQL INSERT commands generated by the user", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 37 + }, + "id": 41, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + 
}, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_inserts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]]) - sum(rate(scylla_cql_inserts_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Insert", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Number of CQL SELECT commands generated by the user", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 37 + }, + "id": 42, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]]) - sum(rate(scylla_cql_reads_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Reads", + "type": 
"timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Number of CQL DELETE commands generated by the user", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 37 + }, + "id": 43, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_deletes{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])-sum(rate(scylla_cql_deletes_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Deletes", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Number of CQL UPDATE commands generated by the user", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + 
"class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 37 + }, + "id": 44, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_updates{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])-sum(rate(scylla_cql_updates_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Updates", + "type": "timeseries" + }, + { + "class": "graph_panel", + "datasource": "prometheus", + "description": "amount of CQL connections currently established", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 43 + }, + "id": 45, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pointradius": 1, + "span": 3, + "targets": [ + { + "expr": "sum(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Client CQL connections by [[by]]", + "type": "timeseries" + }, + { + "class": "graph_panel", + "datasource": "prometheus", + "description": "Number of CQL batches command, each batched command is counted once", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 43 + }, + "id": 46, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pointradius": 1, + "span": 3, + "targets": [ + { + "expr": 
"sum(rate(scylla_cql_batches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "CQL Batches by [[by]]", + "type": "timeseries" + }, + { + "class": "graph_panel", + "datasource": "prometheus", + "description": "Number of CQL command batched. Each batch would add the number of commands inside the batch", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 43 + }, + "id": 47, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pointradius": 1, + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_statements_in_batches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "CQL Command In Batches by [[by]]", + "type": "timeseries" + }, + { + "class": "ops_panel", + "datasource": "prometheus", + "description": "Counts the number of SELECT statements with BYPASS CACHE option", + "editable": true, + "error": false, + "fieldConfig": { + 
"defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 43 + }, + "id": 48, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_select_bypass_caches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "BYPASS CACHE", + "type": "timeseries" + }, + { + "class": "graph_panel", + "dashversion": [ + ">4.4", + ">2021.1" + ], + "datasource": "prometheus", + "description": "CQL errors by type, only active errors are shown", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 49 + }, + "id": 49, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pointradius": 1, + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_transport_cql_errors_total{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]],type) >0", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "CQL Errors [[by]]", + "type": "timeseries" + }, + { + "class": "graph_panel", + "datasource": "prometheus", + "description": "Number of CQL row reads", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 49 + }, + "id": 50, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + 
"pointradius": 1, + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_rows_read{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "CQL Row Reads [[by]]", + "type": "timeseries" + }, + { + "class": "graph_panel", + "datasource": "prometheus", + "description": "Number of reads using secondary indexes", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 49 + }, + "id": 51, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pointradius": 1, + "span": 3, + "targets": [ + { + "expr": "sum(rate(scylla_cql_secondary_index_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Secondary indexes Reads [[by]]", + "type": "timeseries" + }, + { + "class": "collapsible_row_panel", + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 55 + }, + "id": 52, + "panels": [], + "repeat": "", + 
"title": "Your panels", + "type": "row" + }, + { + "class": "plain_text", + "datasource": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 53, + "isNew": true, + "links": [], + "mode": "html", + "options": { + "content": "

Your Panels

", + "mode": "html" + }, + "span": 12, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }, + { + "class": "user_panel", + "datasource": "prometheus", + "description": "This graph panel was left empty on purpose for ad-hoc usage. Change it when needed. Pay attention that changes to the panel will not be saved.\n\nIf you do need a panel that can be saved, create a new dashboard, or edit the panel inside the json file", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 54, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 6, + "title": "Your Graph here", + "type": "timeseries" + }, + { + "class": "user_panel", + "datasource": "prometheus", + "description": "This graph panel was left empty on purpose for ad-hoc usage. Change it when needed. 
Pay attention that changes to the panel will not be saved.\n\nIf you do need a panel that can be saved, create a new dashboard, or edit the panel inside the json file", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "class": "fieldConfig_defaults", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "class": "fieldConfig_defaults_custom", + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "si:ops/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 55, + "isNew": true, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "seriesOverrides": [ + {} + ], + "span": 6, + "title": "Your Graph here", + "type": "timeseries" + }, + { + "class": "plain_text", + "datasource": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 56, + "isNew": true, + "links": [], + "mode": "html", + "options": { + "content": "
Scylla Monitoring version - master
\n    
", + "mode": "html" + }, + "span": 12, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + } + ], + "refresh": "30s", + "schemaVersion": 26, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "class": "by_template_var", + "current": { + "tags": [], + "text": "DC", + "value": "dc" + }, + "error": null, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": true, + "text": "DC", + "value": "dc" + } + ], + "query": "Cluster,DC,Instance,Shard", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": null, + "class": "template_variable_single", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": "prometheus", + "definition": "", + "error": null, + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": "label_values(scylla_reactor_utilization, cluster)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "class": "template_variable_all", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "prometheus", + "definition": "", + "error": null, + "hide": 0, + "includeAll": true, + "label": "dc", + "multi": true, + "name": "dc", + "options": [], + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "class": "template_variable_single", + "current": { + "text": "/var/lib/scylla", + "value": "/var/lib/scylla" + }, + "datasource": "prometheus", + 
"definition": "", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Mount path", + "multi": false, + "name": "mount_point", + "options": [ + { + "selected": true, + "text": "/var/lib/scylla", + "value": "/var/lib/scylla" + } + ], + "query": "/var/lib/scylla", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "custom", + "useTags": false + }, + { + "allValue": null, + "class": "aggregation_function", + "current": { + "tags": [], + "text": "sum", + "value": "sum" + }, + "hide": 0, + "includeAll": false, + "label": "Function", + "multi": false, + "name": "func", + "options": [ + { + "selected": true, + "text": "sum", + "value": "sum" + }, + { + "selected": false, + "text": "avg", + "value": "avg" + }, + { + "selected": false, + "text": "max", + "value": "max" + }, + { + "selected": false, + "text": "min", + "value": "min" + }, + { + "selected": false, + "text": "stddev", + "value": "stddev" + }, + { + "selected": false, + "text": "stdvar", + "value": "stdvar" + } + ], + "query": "sum,avg,max,min,stddev,stdvar", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": null, + "class": "template_variable_all", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "prometheus", + "definition": "", + "error": null, + "hide": 2, + "includeAll": true, + "multi": true, + "name": "all_scyllas_versions", + "options": [], + "query": "label_values(scylla_scylladb_current_version{cluster=~\"$cluster|$^\"}, version)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "class": "template_variable_all", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "prometheus", + "definition": "query_result(count(up{job=\"scylla\"}) by (dc))", + "error": null, + 
"hide": 2, + "includeAll": true, + "multi": true, + "name": "count_dc", + "options": [], + "query": { + "query": "query_result(count(up{job=\"scylla\"}) by (dc))", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/(?\\{dc=\"[^\"]+\".* \\d+) .*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "class": "monitor_version_var", + "current": { + "text": "master", + "value": "master" + }, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "monitoring_version", + "options": [ + { + "selected": true, + "text": "master", + "value": "master" + } + ], + "query": "master", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "CQL Overview", + "uid": "cql-overview", + "version": 1 + }`}} diff --git a/assets/monitoring/grafana/v1alpha1/deployment.yaml b/assets/monitoring/grafana/v1alpha1/deployment.yaml new file mode 100644 index 00000000000..000d0af9233 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/deployment.yaml @@ -0,0 +1,124 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .scyllaDBMonitoringName }}-grafana" +spec: + selector: + matchLabels: + scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana" + strategy: + type: RollingUpdate + template: + metadata: + annotations: + scylla-operator.scylladb.com/inputs-hash: "{{ .restartTriggerHash }}" + labels: + scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana" + spec: + securityContext: + fsGroup: 472 + supplementalGroups: + - 0 + 
affinity: + {{- .affinity | toYAML | nindent 8 }} + tolerations: + {{- .tolerations | toYAML | nindent 8 }} + containers: + - name: grafana + image: docker.io/grafana/grafana:9.3.1 + command: + - grafana-server + - --packaging=docker + - --homepath=/usr/share/grafana + - --config=/var/run/configmaps/grafana-configs/grafana.ini + env: + - name: GF_PATHS_PROVISIONING + - name: GF_PATHS_HOME + - name: GF_PATHS_DATA + - name: GF_PATHS_LOGS + - name: GF_PATHS_PLUGINS + - name: GF_PATHS_CONFIG + ports: + - containerPort: 3000 + name: grafana + protocol: TCP + readinessProbe: + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 1 + httpGet: + path: /api/health + port: 3000 + scheme: HTTPS + livenessProbe: + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 10 + httpGet: + path: /api/health + port: 3000 + scheme: HTTPS + resources: + {{- .resources | toYAML | nindent 10 }} + volumeMounts: + - name: grafana-configs + mountPath: /var/run/configmaps/grafana-configs + - name: grafana-scylladb-dashboards + mountPath: /var/run/dashboards/scylladb + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/access-control/access-control.yaml + subPath: access-control.yaml + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/alerting/alerting.yaml + subPath: alerting.yaml + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/dashboards/dashboards.yaml + subPath: dashboards.yaml + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/datasources/datasources.yaml + subPath: datasources.yaml + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/notifiers/notifiers.yaml + subPath: notifiers.yaml + - name: grafana-provisioning + mountPath: /var/run/configmaps/grafana-provisioning/plugins/plugins.yaml + subPath: plugins.yaml + - 
name: grafana-admin-credentials + mountPath: /var/run/secrets/grafana-admin-credentials + - name: grafana-serving-certs + mountPath: /var/run/secrets/grafana-serving-certs + - name: prometheus-client-certs + mountPath: /var/run/secrets/prometheus-client-certs + - name: prometheus-serving-ca + mountPath: /var/run/configmaps/prometheus-serving-ca + - name: grafana-storage + mountPath: /var/lib/grafana + volumes: + - name: grafana-configs + configMap: + name: "{{ .scyllaDBMonitoringName }}-grafana-configs" + - name: grafana-scylladb-dashboards + configMap: + name: "{{ .scyllaDBMonitoringName }}-grafana-scylladb-dashboards" + - name: grafana-provisioning + configMap: + name: "{{ .scyllaDBMonitoringName }}-grafana-provisioning" + - name: grafana-admin-credentials + secret: + secretName: "{{ .scyllaDBMonitoringName }}-grafana-admin-credentials" + - name: grafana-serving-certs + secret: + secretName: "{{ .servingCertSecretName }}" + - name: prometheus-client-certs + secret: + secretName: "{{ .scyllaDBMonitoringName }}-prometheus-client-grafana" + - name: prometheus-serving-ca + configMap: + name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-ca" + - name: grafana-storage + emptyDir: + sizeLimit: 100Mi diff --git a/assets/monitoring/grafana/v1alpha1/ingress.yaml b/assets/monitoring/grafana/v1alpha1/ingress.yaml new file mode 100644 index 00000000000..f589f0cf07b --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/ingress.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "{{ .scyllaDBMonitoringName }}-grafana" + annotations: + {{- .ingressAnnotations | toYAML | nindent 4 }} +spec: + ingressClassName: {{ or .ingressClassName "null" }} + rules: + {{- range $_, $dnsDomain := .dnsDomains }} + - host: "{{ $dnsDomain }}" + http: + paths: + - backend: + service: + name: "{{ $.scyllaDBMonitoringName }}-grafana" + port: + number: 3000 + path: / + pathType: Prefix + {{- end }} diff --git 
a/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml b/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml new file mode 100644 index 00000000000..91fc8662913 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .scyllaDBMonitoringName }}-grafana-provisioning" +data: + access-control.yaml: "" + alerting.yaml: "" + dashboards.yaml: | + apiVersion: 1 + providers: + - name: dashboards + type: file + updateIntervalSeconds: 30 + options: + path: /var/run/dashboards + foldersFromFilesStructure: true + datasources.yaml: | + apiVersion: 1 + datasources: + - name: prometheus + type: prometheus + access: proxy + url: "https://{{ .scyllaDBMonitoringName }}-prometheus:9090" + isDefault: true + version: 1 + editable: false + jsonData: + timeInterval: "5s" + tlsAuthWithCACert: true + secureJsonData: + tlsCACert: "$__file{/var/run/configmaps/prometheus-serving-ca/ca-bundle.crt}" + tlsClientCert: "$__file{/var/run/secrets/prometheus-client-certs/tls.crt}" + tlsClientKey: "$__file{/var/run/secrets/prometheus-client-certs/tls.key}" + notifiers.yaml: "" + plugins.yaml: "" diff --git a/assets/monitoring/grafana/v1alpha1/registry.go b/assets/monitoring/grafana/v1alpha1/registry.go new file mode 100644 index 00000000000..c56f4b24d58 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/registry.go @@ -0,0 +1,50 @@ +package v1alpha1 + +import ( + _ "embed" + + "github.com/scylladb/scylla-operator/pkg/assets" + "github.com/scylladb/scylla-operator/pkg/scheme" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] { + return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, scheme.Codecs.UniversalDeserializer()) +} + +var ( + //go:embed "deployment.yaml" + 
grafanaDeploymentTemplateString string + GrafanaDeploymentTemplate = ParseObjectTemplateOrDie[*appsv1.Deployment]("grafana-deployment", grafanaDeploymentTemplateString) + + //go:embed "serviceaccount.yaml" + grafanaSATemplateString string + GrafanaSATemplate = ParseObjectTemplateOrDie[*corev1.ServiceAccount]("grafana-sa", grafanaSATemplateString) + + //go:embed "configs.cm.yaml" + grafanaConfigsTemplateString string + GrafanaConfigsTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-configs-cm", grafanaConfigsTemplateString) + + //go:embed "admin-credentials.secret.yaml" + grafanaAdminCredentialsSecretTemplateString string + GrafanaAdminCredentialsSecretTemplate = ParseObjectTemplateOrDie[*corev1.Secret]("grafana-access-credentials-secret", grafanaAdminCredentialsSecretTemplateString) + + //go:embed "provisioning.cm.yaml" + grafanaProvisioningConfigMapTemplateString string + GrafanaProvisioningConfigMapTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-provisioning-cm", grafanaProvisioningConfigMapTemplateString) + + //go:embed "dashboards.cm.yaml" + grafanaDashboardsConfigMapTemplateString string + GrafanaDashboardsConfigMapTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-dashboard-cm", grafanaDashboardsConfigMapTemplateString) + + //go:embed "service.yaml" + grafanaServiceTemplateString string + GrafanaServiceTemplate = ParseObjectTemplateOrDie[*corev1.Service]("grafana-service", grafanaServiceTemplateString) + + //go:embed "ingress.yaml" + grafanaIngressTemplateString string + GrafanaIngressTemplate = ParseObjectTemplateOrDie[*networkingv1.Ingress]("grafana-ingress", grafanaIngressTemplateString) +) diff --git a/assets/monitoring/grafana/v1alpha1/service.yaml b/assets/monitoring/grafana/v1alpha1/service.yaml new file mode 100644 index 00000000000..895913892e9 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .scyllaDBMonitoringName 
}}-grafana" +spec: + type: ClusterIP + selector: + scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana" + ports: + - port: 3000 + protocol: TCP + targetPort: grafana diff --git a/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml b/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml new file mode 100644 index 00000000000..53ba45d54e2 --- /dev/null +++ b/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .scyllaDBMonitoringName }}-grafana" diff --git a/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml b/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml new file mode 100644 index 00000000000..66a7151997d --- /dev/null +++ b/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml @@ -0,0 +1,295 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: "{{ .scyllaDBMonitoringName }}-alerts" + labels: + scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}" +spec: + groups: + - name: scylla.rules + rules: {{` + - alert: cqlNonPrepared + expr: cql:non_prepared > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + annotations: + description: 'Some queries are non-prepared' + summary: non prepared statements + - alert: cql:non_paged_no_system + expr: cql:non_paged > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + status: "1" + annotations: + description: 'Some SELECT queries are non-paged' + summary: non paged statements + - alert: cqlNoTokenAware + expr: cql:non_token_aware > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + annotations: + description: 'Some queries are not token-aware' + summary: non token aware statements + - alert: cqlReverseOrder + expr: cql:reverse_queries > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" +
annotations: + description: 'Some queries use reverse order' + summary: reverse order queries + - alert: cqlAllowFiltering + expr: cql:allow_filtering > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + annotations: + description: 'Some queries use ALLOW FILTERING' + summary: Allow filtering queries + - alert: cqlCLAny + expr: cql:any_queries > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + annotations: + description: 'Some queries use Consistency Level: ANY' + summary: Consistency Level ANY queries + - alert: cqlCLAll + expr: cql:all_queries > 0 + for: 10s + labels: + severity: "1" + advisor: "cqlOptimization" + dashboard: "cql" + annotations: + description: 'Some queries use Consistency Level: ALL' + summary: Consistency Level ALL queries + - alert: nonBalancedcqlTraffic + expr: abs(rate(scylla_cql_updates{conditional="no"}[1m]) - scalar(avg(rate(scylla_cql_updates{conditional="no"}[1m]))))/scalar(stddev(rate(scylla_cql_updates{conditional="no"}[1m]))+100) > 2 + for: 10s + labels: + severity: "1" + status: "1" + advisor: "balanced" + dashboard: "cql" + annotations: + description: 'CQL queries are not balanced among shards {{ $labels.instance }} shard {{ $labels.shard }}' + summary: CQL queries are not balanced + - alert: nodeLocalErrors + expr: sum(errors:local_failed) by (cluster, instance) > 0 + for: 10s + labels: + severity: "1" + advisor: "operationError" + dashboard: "scylla-detailed" + annotations: + description: 'Some operation failed at the replica side' + summary: Replica side Level error + - alert: nodeIOErrors + expr: sum(rate(scylla_reactor_aio_errors[60s])) by (cluster, instance) > 0 + for: 10s + labels: + severity: "1" + advisor: "operationError" + dashboard: "OS-master" + annotations: + description: 'IO Errors can indicate a node with a faulty disk {{ $labels.instance }}' + summary: IO Disk Error + - alert: nodeCLErrors + expr: sum(errors:operation_unavailable) by (cluster) > 0 +
for: 10s + labels: + severity: "1" + advisor: "operationError" + dashboard: "scylla-detailed" + annotations: + description: 'Some operation failed due to consistency level' + summary: Consistency Level error + - alert: preparedCacheEviction + expr: sum(rate(scylla_cql_prepared_cache_evictions[2m])) by (cluster) + sum(rate(scylla_cql_authorized_prepared_statements_cache_evictions[2m])) by (cluster) > 100 + for: 5m + labels: + severity: "1" + advisor: "preparedEviction" + dashboard: "scylla-detailed" + annotations: + description: 'The prepared-statement cache is being continuously evicted, which could indicate a problem in your prepared-statement usage logic.' + summary: Prepared cache eviction + - alert: heavyCompaction + expr: max(scylla_scheduler_shares{group="compaction"}) by (cluster) >= 1000 + for: 20m + labels: + severity: "1" + advisor: "heavyCompaction" + dashboard: "scylla-detailed" + annotations: + description: 'Compaction load increases to a level it can interfere with the system behaviour. If this persists set the compaction share to a static level.' + summary: Heavy compaction load + - alert: shedRequests + expr: max(sum(rate(scylla_transport_requests_shed[60s])) by (instance,cluster)/sum(rate(scylla_transport_requests_served{}[60s])) by (instance, cluster)) by(cluster) > 0.01 + for: 5m + labels: + severity: "1" + advisor: "systemOverload" + dashboard: "scylla-detailed" + annotations: + description: 'More than 1% of the requests got shed, this is an indication of an overload, consider system resize.' 
+ summary: System is overloaded + - alert: cappedTombstone + expr: changes(scylla_sstables_capped_tombstone_deletion_time[1h]) > 0 + for: 1m + labels: + severity: "1" + advisor: "cappedTombstone" + dashboard: "scylla-detailed" + annotations: + description: 'Tombstone delete time was set too far in the future and was capped' + summary: Tombstone delete time is capped + - alert: InstanceDown + expr: up{job="scylla"} == 0 + for: 30s + labels: + severity: "2" + annotations: + description: '{{ $labels.instance }} has been down for more than 30 seconds.' + summary: Instance {{ $labels.instance }} down + - alert: InstanceDown + expr: absent(scylla_transport_requests_served{job="scylla", shard="0"}) + for: 1m + labels: + severity: "2" + annotations: + description: '{{ $labels.instance }} instance is shutting down.' + summary: Instance {{ $labels.instance }} down + - alert: InstanceDown + expr: scylla_node_operation_mode > 3 + for: 30s + labels: + severity: "2" + annotations: + description: '{{ $labels.instance }} instance is shutting down.' + summary: Instance {{ $labels.instance }} down + - alert: DiskFull + expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"} + * 100 < 35 + for: 30s + labels: + severity: "2" + annotations: + description: '{{ $labels.instance }} has less than 35% free disk space.' + summary: Instance {{ $labels.instance }} low disk space + - alert: DiskFull + expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"} + * 100 < 25 + for: 30s + labels: + severity: "3" + annotations: + description: '{{ $labels.instance }} has less than 25% free disk space.'
+ summary: Instance {{ $labels.instance }} low disk space + - alert: DiskFull + expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"} + * 100 < 15 + for: 30s + labels: + severity: "4" + annotations: + description: '{{ $labels.instance }} has less than 15% free disk space.' + summary: Instance {{ $labels.instance }} low disk space + - alert: DiskFull + expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} + * 100 < 20 + for: 30s + labels: + severity: "3" + annotations: + description: '{{ $labels.instance }} has less than 20% free disk space on the root partition.' + summary: Instance {{ $labels.instance }} low disk space + - alert: NoCql + expr: scylla_manager_healthcheck_cql_status == -1 + for: 30s + labels: + severity: "2" + annotations: + description: '{{ $labels.host }} has denied CQL connection for more than 30 seconds.' + summary: Instance {{ $labels.host }} no CQL connection + - alert: HighLatencies + expr: wlatencyp95{by="instance"} > 100000 + for: 5m + labels: + severity: "1" + annotations: + description: '{{ $labels.instance }} has 95% high latency for more than 5 minutes.' + summary: Instance {{ $labels.instance }} High Write Latency + - alert: HighLatencies + expr: wlatencya{by="instance"} >10000 + for: 5m + labels: + severity: "1" + annotations: + description: '{{ $labels.instance }} has average high latency for more than 5 minutes.' + summary: Instance {{ $labels.instance }} High Write Latency + - alert: HighLatencies + expr: rlatencyp95{by="instance"} > 100000 + for: 5m + labels: + severity: "1" + annotations: + description: '{{ $labels.instance }} has 95% high latency for more than 5 minutes.' 
+ summary: Instance {{ $labels.instance }} High Read Latency + - alert: HighLatencies + expr: rlatencya{by="instance"} >10000 + for: 5m + labels: + severity: "1" + annotations: + description: '{{ $labels.instance }} has average high latency for more than 5 minutes.' + summary: Instance {{ $labels.instance }} High Read Latency + - alert: BackupFailed + expr: (sum(scylla_manager_scheduler_run_total{type=~"backup", status="ERROR"}) or vector(0)) - (sum(scylla_manager_scheduler_run_total{type=~"backup", status="ERROR"} offset 3m) or vector(0)) > 0 + for: 10s + labels: + severity: "1" + annotations: + description: 'Backup failed' + summary: Backup task failed + - alert: RepairFailed + expr: (sum(scylla_manager_scheduler_run_total{type=~"repair", status="ERROR"}) or vector(0)) - (sum(scylla_manager_scheduler_run_total{type=~"repair", status="ERROR"} offset 3m) or vector(0)) > 0 + for: 10s + labels: + severity: "1" + annotations: + description: 'Repair failed' + summary: Repair task failed + - alert: restart + expr: resets(scylla_gossip_heart_beat[1h])>0 + for: 10s + labels: + severity: "1" + annotations: + description: 'Node restarted' + summary: Instance {{ $labels.instance }} restarted + - alert: oomKill + expr: changes(node_vmstat_oom_kill[1h])>0 + for: 10s + labels: + severity: "2" + annotations: + description: 'OOM Kill on {{ $labels.instance }}' + summary: A process was terminated on Instance {{ $labels.instance }} +`}} diff --git a/assets/monitoring/prometheus/v1/ingress.yaml b/assets/monitoring/prometheus/v1/ingress.yaml new file mode 100644 index 00000000000..f318487fd94 --- /dev/null +++ b/assets/monitoring/prometheus/v1/ingress.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "{{ .scyllaDBMonitoringName }}-prometheus" + annotations: + {{- .ingressAnnotations | toYAML | nindent 4 }} +spec: + ingressClassName: {{ or .ingressClassName "null" }} + rules: + {{- range $_, $dnsDomain := .dnsDomains }} + - host: "{{ $dnsDomain 
}}" + http: + paths: + - backend: + service: + name: "{{ $.scyllaDBMonitoringName }}-prometheus" + port: + number: 9090 + path: / + pathType: Prefix + {{- end }} diff --git a/assets/monitoring/prometheus/v1/prometheus.yaml b/assets/monitoring/prometheus/v1/prometheus.yaml new file mode 100644 index 00000000000..7765d380f90 --- /dev/null +++ b/assets/monitoring/prometheus/v1/prometheus.yaml @@ -0,0 +1,50 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: "{{ .scyllaDBMonitoringName }}" +spec: + serviceAccountName: "{{ .scyllaDBMonitoringName }}-prometheus" + securityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + web: + pageTitle: "ScyllaDB Prometheus" + tlsConfig: + cert: + secret: + name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-certs" + key: "tls.crt" + keySecret: + name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-certs" + key: "tls.key" +# clientAuthType: "RequireAndVerifyClientCert" +# TODO: we need the prometheus-operator not to require certs only for /-/readyz or to do exec probes that can read certs + clientAuthType: "RequestClientCert" + client_ca: + configMap: + name: "{{ .scyllaDBMonitoringName }}-prometheus-client-ca" + key: "ca-bundle.crt" + httpConfig: + http2: true + serviceMonitorSelector: + matchLabels: {} + affinity: + {{- .affinity | toYAML | nindent 4 }} + tolerations: + {{- .tolerations | toYAML | nindent 4 }} + resources: + {{- .resources | toYAML | nindent 4 }} + alerting: + alertmanagers: + - namespace: "{{ .namespace }}" + name: "{{ .scyllaDBMonitoringName }}" + port: web + ruleSelector: + matchLabels: + scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}" + {{- if .volumeClaimTemplate }} + storage: + volumeClaimTemplate: + {{- .volumeClaimTemplate | toYAML | nindent 6 }} + {{- end }} diff --git a/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml b/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml new file mode 100644 index 
00000000000..3386f9d730a --- /dev/null +++ b/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml @@ -0,0 +1,229 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: "{{ .scyllaDBMonitoringName }}-recoding" + labels: + scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}" +spec: + groups: + - name: scylla.rules + rules: + - record: cql:all_shardrate1m + expr: sum(rate(scylla_cql_reads[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_cql_inserts[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_updates[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_deletes[60s])) by (cluster, dc, instance, shard) + - record: cql:all_system_shardrate1m + expr: sum(rate(scylla_cql_reads_per_ks{ks="system"}[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_cql_inserts_per_ks{ks="system"}[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_updates_per_ks{ks="system"}[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_deletes_per_ks{ks="system"}[60s])) by (cluster, dc, instance, shard) + - record: cql:local_shardrate1m + expr: sum(rate(scylla_storage_proxy_coordinator_reads_local_node[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_storage_proxy_coordinator_total_write_attempts_local_node[60s]) ) by (cluster, dc, instance, shard) + - record: cql:all_rate1m + expr: sum(cql:all_shardrate1m) by (cluster, dc, instance) + - record: cql:non_token_aware + expr: (sum(cql:all_rate1m) by (cluster) >bool 100) * clamp_min(1-(sum(cql:local_shardrate1m) by (cluster) / sum(cql:all_rate1m) by (cluster)), 0) + - record: cql:non_system_prepared1m + expr: clamp_min(sum(rate(scylla_query_processor_statements_prepared[1m])) by (cluster, dc, instance, shard) - cql:all_system_shardrate1m, 0) + - record: cql:non_prepared + expr: (sum(cql:non_system_prepared1m) by (cluster) >bool 100) * (sum(cql:non_system_prepared1m) by (cluster) / 
clamp_min(sum(cql:all_rate1m) by (cluster)- sum(cql:all_system_shardrate1m) by (cluster), 0.001)) + - record: cql:non_paged_no_system1m + expr: clamp_min(sum(rate(scylla_cql_unpaged_select_queries[60s])) by (cluster, dc, instance) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster, dc, instance), 0) + - record: cql:non_paged_no_system + expr: (sum(cql:non_paged_no_system1m) by (cluster, dc, instance) >bool 100) * sum(cql:non_paged_no_system) by (cluster, dc, instance)/clamp_min(sum(rate(scylla_cql_reads[60s]))by (cluster, dc, instance) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster, dc, instance), 0.01) + - record: cql:non_paged + expr: (sum(cql:non_paged_no_system1m) by (cluster) >bool 100) * sum(cql:non_paged_no_system1m) by (cluster)/clamp_min(sum(rate(scylla_cql_reads[60s]))by (cluster) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster), 0.01) + - record: cql:reverse_queries + expr: sum(rate(scylla_cql_reverse_queries[60s])) by (cluster)/ sum(rate(scylla_cql_reads[60s])) by (cluster) + - record: cql:allow_filtering + expr: sum(rate(scylla_cql_filtered_read_requests[60s])) by (cluster)/ sum(rate(scylla_cql_reads[60s])) by (cluster) + - record: cql:any_queries + expr: sum(rate(scylla_query_processor_queries{consistency_level="ANY"}[60s])) by (cluster) >bool 0 + - record: cql:all_queries + expr: sum(rate(scylla_query_processor_queries{consistency_level="ALL"}[60s])) by (cluster) >bool 0 + - record: errors:operation_unavailable + expr: sum(rate(scylla_storage_proxy_coordinator_read_unavailable[60s])) by (cluster, dc, instance) + sum(rate(scylla_storage_proxy_coordinator_write_unavailable[60s])) by (cluster, dc, instance) + sum(rate(scylla_storage_proxy_coordinator_range_unavailable[60s])) by (cluster, dc, instance) + - record: errors:local_failed + expr: sum(rate(scylla_storage_proxy_coordinator_read_errors_local_node[60s])) by (cluster, dc, instance) + 
sum(rate(scylla_storage_proxy_coordinator_write_errors_local_node[60s])) by (cluster, dc, instance) + - record: errors:nodes_total + expr: errors:local_failed + errors:operation_unavailable + - record: manager:repair_done_ts + expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="repair"}[60s])) by (cluster) > 0) or on(cluster) manager:repair_done_ts + - record: manager:backup_done_ts + expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="backup"}[60s])) by (cluster) > 0) or on(cluster) manager:backup_done_ts + - record: manager:repair_fail_ts + expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="repair"}[60s])) by (cluster) > 0) or on(cluster) manager:repair_fail_ts + - record: manager:backup_fail_ts + expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="backup"}[60s])) by (cluster) > 0) or on(cluster) manager:backup_fail_ts + - record: manager:repair_progress + expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster)) + - record: scylla_manager_repair_progress + expr: sum(manager:repair_progress) by (cluster) + labels: + level: "1" + by: "cluster" + - record: manager:backup_progress + expr: (max(scylla_manager_scheduler_run_indicator{type="backup"}) by (cluster) >bool 0)*((max(scylla_manager_backup_files_size_bytes) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_backup_files_uploaded_bytes) by (cluster) + sum(scylla_manager_backup_files_skipped_bytes) by (cluster) + sum(scylla_manager_backup_files_failed_bytes)by(cluster))/sum(scylla_manager_backup_files_size_bytes>=0) by (cluster)) + - record: scylla_manager_backup_progress + expr: 
sum(manager:backup_progress) by (cluster) + labels: + level: "1" + by: "cluster" + - record: wlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) + labels: + by: "instance,shard" + level: "1" + - record: wlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) + labels: + by: "instance" + level: "1" + - record: wlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le)) + labels: + by: "dc" + level: "1" + - record: wlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le)) + labels: + by: "cluster" + level: "1" + - record: rlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) + labels: + by: "instance,shard" + level: "1" + - record: rlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) + labels: + by: "instance" + level: "1" + - record: rlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le)) + labels: + by: "dc" + level: "1" + - record: rlatencyp99 + expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le)) + labels: + by: "cluster" + level: "1" + - record: wlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, 
scheduling_group_name, le)) + labels: + by: "instance,shard" + level: "1" + - record: wlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) + labels: + by: "instance" + level: "1" + - record: wlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le)) + labels: + by: "dc" + level: "1" + - record: wlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le)) + labels: + by: "cluster" + level: "1" + - record: rlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le)) + labels: + by: "instance,shard" + level: "1" + - record: rlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le)) + labels: + by: "instance" + level: "1" + - record: rlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le)) + labels: + by: "dc" + level: "1" + - record: rlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le)) + labels: + by: "cluster" + level: "1" + - record: wlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name, shard)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, dc, instance, scheduling_group_name, shard) + labels: + by: "instance,shard" + level: "1" + - record: wlatencya + expr: 
sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, dc, scheduling_group_name, instance) + labels: + by: "instance" + level: "1" + - record: wlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, scheduling_group_name, dc) + labels: + by: "dc" + level: "1" + - record: wlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, scheduling_group_name) + labels: + by: "cluster" + level: "1" + - record: rlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc, instance, shard,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name) + labels: + by: "instance,shard" + level: "1" + - record: rlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, instance, scheduling_group_name) + labels: + by: "instance" + level: "1" + - record: rlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, scheduling_group_name) + labels: + by: "dc" + level: "1" + - record: rlatencya + expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, scheduling_group_name) + 
labels: + by: "cluster" + level: "1" + - record: casrlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name)) + labels: + by: "instance,shard" + level: "1" + - record: casrlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) + labels: + by: "instance" + level: "1" + - record: casrlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, le, scheduling_group_name)) + labels: + by: "dc" + level: "1" + - record: casrlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, le, scheduling_group_name)) + labels: + by: "cluster" + level: "1" + - record: caswlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name)) + labels: + by: "instance,shard" + level: "1" + - record: caswlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, instance, le, scheduling_group_name)) + labels: + by: "instance" + level: "1" + - record: caswlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, le, scheduling_group_name)) + labels: + by: "dc" + level: "1" + - record: caswlatencyp95 + expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, le, scheduling_group_name)) + labels: + by: "cluster" + level: "1" + - record: all_scheduling_group + expr: sum(scylla_storage_proxy_coordinator_write_latency_count>0) by (cluster, scheduling_group_name) + diff --git 
a/assets/monitoring/prometheus/v1/registry.go b/assets/monitoring/prometheus/v1/registry.go new file mode 100644 index 00000000000..f8f76077cfc --- /dev/null +++ b/assets/monitoring/prometheus/v1/registry.go @@ -0,0 +1,51 @@ +package v1 + +import ( + _ "embed" + + "github.com/scylladb/scylla-operator/pkg/assets" + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + "github.com/scylladb/scylla-operator/pkg/scheme" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] { + return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, scheme.Codecs.UniversalDeserializer()) +} + +var ( + //go:embed "prometheus.yaml" + prometheusTemplateString string + PrometheusTemplate = ParseObjectTemplateOrDie[*monitoringv1.Prometheus]("prometheus", prometheusTemplateString) + + //go:embed "serviceaccount.yaml" + prometheusSATemplateString string + PrometheusSATemplate = ParseObjectTemplateOrDie[*corev1.ServiceAccount]("prometheus-sa", prometheusSATemplateString) + + //go:embed "rolebinding.yaml" + prometheusRoleBindingTemplateString string + PrometheusRoleBindingTemplate = ParseObjectTemplateOrDie[*rbacv1.RoleBinding]("prometheus-rolebinding", prometheusRoleBindingTemplateString) + + //go:embed "service.yaml" + prometheusServiceTemplateString string + PrometheusServiceTemplate = ParseObjectTemplateOrDie[*corev1.Service]("prometheus-service", prometheusServiceTemplateString) + + //go:embed "scylladb.servicemonitor.yaml" + scyllaDBServiceMonitorTemplateString string + ScyllaDBServiceMonitorTemplate = ParseObjectTemplateOrDie[*monitoringv1.ServiceMonitor]("scylladb-servicemonitor", scyllaDBServiceMonitorTemplateString) + + //go:embed "recording.prometheusrule.yaml" + recordingPrometheusRuleTemplateString string + RecordingPrometheusRuleTemplate = 
ParseObjectTemplateOrDie[*monitoringv1.PrometheusRule]("recording-prometheus-rule", recordingPrometheusRuleTemplateString) + + //go:embed "alerts.prometheusrule.yaml" + alertsPrometheusRuleTemplateString string + AlertsPrometheusRuleTemplate = ParseObjectTemplateOrDie[*monitoringv1.PrometheusRule]("alerts-prometheus-rule", alertsPrometheusRuleTemplateString) + + //go:embed "ingress.yaml" + prometheusIngressTemplateString string + PrometheusIngressTemplate = ParseObjectTemplateOrDie[*networkingv1.Ingress]("prometheus-ingress", prometheusIngressTemplateString) +) diff --git a/assets/monitoring/prometheus/v1/rolebinding.yaml b/assets/monitoring/prometheus/v1/rolebinding.yaml new file mode 100644 index 00000000000..fdd6fb46119 --- /dev/null +++ b/assets/monitoring/prometheus/v1/rolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: "{{ .scyllaDBMonitoringName }}-prometheus" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: scylladb:monitoring:prometheus +subjects: +- kind: ServiceAccount + name: "{{ .scyllaDBMonitoringName }}-prometheus" + namespace: "{{ .namespace }}" diff --git a/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml b/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml new file mode 100644 index 00000000000..92daafa20e8 --- /dev/null +++ b/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml @@ -0,0 +1,74 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: "{{ .scyllaDBMonitoringName }}-scylladb" +spec: + selector: + {{- .endpointsSelector | toYAML | nindent 4 }} + jobLabel: scylla/cluster + endpoints: + - port: node-exporter + honorLabels: false + relabelings: + - sourceLabels: [__address__] + regex: '(.*):\d+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__address__] + regex: '([^:]+)' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [instance] + regex: '(.*)' + targetLabel: 
__address__ + replacement: '${1}:9100' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' + - port: prometheus + honorLabels: false + metricRelabelings: + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CPU + replacement: 'cpu' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CQL + replacement: 'cql' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: OS + replacement: 'os' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: IO + replacement: 'io' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: Errors + replacement: 'errors' + - regex: 'help|exported_instance' + action: labeldrop + - sourceLabels: [version] + regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*' + replacement: '$1$2' + targetLabel: svr + relabelings: + - sourceLabels: [__address__] + regex: '(.*):.+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' diff --git a/assets/monitoring/prometheus/v1/service.yaml b/assets/monitoring/prometheus/v1/service.yaml new file mode 100644 index 00000000000..d6ddf3f15f6 --- /dev/null +++ b/assets/monitoring/prometheus/v1/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .scyllaDBMonitoringName }}-prometheus" +spec: + type: ClusterIP + ports: + - name: web + port: 9090 + protocol: TCP + targetPort: web + selector: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: "{{ .scyllaDBMonitoringName }}" diff --git a/assets/monitoring/prometheus/v1/serviceaccount.yaml b/assets/monitoring/prometheus/v1/serviceaccount.yaml new file mode 100644 index 
00000000000..952477d15c5 --- /dev/null +++ b/assets/monitoring/prometheus/v1/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .scyllaDBMonitoringName }}-prometheus" diff --git a/examples/monitoring/v1alpha1/scylladbmonitoring.yaml b/examples/monitoring/v1alpha1/scylladbmonitoring.yaml new file mode 100644 index 00000000000..86d8b699f5b --- /dev/null +++ b/examples/monitoring/v1alpha1/scylladbmonitoring.yaml @@ -0,0 +1,28 @@ +apiVersion: scylla.scylladb.com/v1alpha1 +kind: ScyllaDBMonitoring +metadata: + name: example +spec: + endpointsSelector: + matchLabels: + app.kubernetes.io/name: scylla + scylla-operator.scylladb.com/scylla-service-type: identity + scylla/cluster: replace-with-your-scyllacluster-name + components: + prometheus: + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 1Gi + grafana: + exposeOptions: + webInterface: + ingress: + ingressClassName: haproxy + dnsDomains: + - test-grafana.test.svc.cluster.local + annotations: + haproxy-ingress.github.io/ssl-passthrough: "true" + diff --git a/hack/ci-deploy.sh b/hack/ci-deploy.sh index c0f9cace888..7afbbbbbca9 100755 --- a/hack/ci-deploy.sh +++ b/hack/ci-deploy.sh @@ -80,4 +80,5 @@ kubectl -n haproxy-ingress rollout status --timeout=5m deployment.apps/haproxy-i kubectl wait --for condition=established crd/nodeconfigs.scylla.scylladb.com kubectl wait --for condition=established crd/scyllaoperatorconfigs.scylla.scylladb.com +kubectl wait --for condition=established crd/scylladbmonitorings.scylla.scylladb.com kubectl wait --for condition=established $( find "${deploy_dir}/prometheus-operator/" -name '*.crd.yaml' -printf '-f=%p\n' ) diff --git a/helm/scylla-operator/templates/clusterrole_def.yaml b/helm/scylla-operator/templates/clusterrole_def.yaml index e766be874d2..d4a0c2c0ed9 100644 --- a/helm/scylla-operator/templates/clusterrole_def.yaml +++ b/helm/scylla-operator/templates/clusterrole_def.yaml @@ -17,6 +17,7 @@ rules: - "" 
resources: - nodes + - endpoints verbs: - get - list @@ -65,6 +66,7 @@ rules: - list - watch - create + - delete - update - patch - apiGroups: @@ -83,14 +85,16 @@ rules: - apps resources: - statefulsets + - daemonsets + - deployments verbs: - create - - delete - get - list - - patch - - update - watch + - update + - patch + - delete - apiGroups: - apps resources: @@ -101,6 +105,7 @@ rules: - scylla.scylladb.com resources: - scyllaclusters + - scylladbmonitorings verbs: - create - delete @@ -113,6 +118,7 @@ rules: - scylla.scylladb.com resources: - scyllaclusters/status + - scylladbmonitorings/status verbs: - get - list @@ -155,18 +161,6 @@ rules: - patch - update - watch -- apiGroups: - - apps - resources: - - daemonsets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - apiGroups: - scylla.scylladb.com resources: @@ -265,3 +259,17 @@ rules: - patch - update - watch +- apiGroups: + - monitoring.coreos.com + resources: + - prometheuses + - prometheusrules + - servicemonitors + verbs: + - get + - list + - watch + - create + - patch + - update + - delete diff --git a/helm/scylla-operator/templates/edit_clusterrole.yaml b/helm/scylla-operator/templates/edit_clusterrole.yaml index b2875778341..8e34147109e 100644 --- a/helm/scylla-operator/templates/edit_clusterrole.yaml +++ b/helm/scylla-operator/templates/edit_clusterrole.yaml @@ -10,6 +10,7 @@ rules: - scylla.scylladb.com resources: - scyllaclusters + - scylladbmonitorings verbs: - create - patch diff --git a/helm/scylla-operator/templates/view_clusterrole.yaml b/helm/scylla-operator/templates/view_clusterrole.yaml index b5cc095aaef..44134b6dabe 100644 --- a/helm/scylla-operator/templates/view_clusterrole.yaml +++ b/helm/scylla-operator/templates/view_clusterrole.yaml @@ -11,6 +11,7 @@ rules: - scylla.scylladb.com resources: - scyllaclusters + - scylladbmonitorings verbs: - get - list diff --git a/pkg/assets/decode.go b/pkg/assets/decode.go new file mode 100644 index 
00000000000..8507aa64a72 --- /dev/null +++ b/pkg/assets/decode.go @@ -0,0 +1,49 @@ +// Copyright (C) 2023 ScyllaDB + +package assets + +import ( + "fmt" + "text/template" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/klog/v2" +) + +func Decode[T any](data []byte, decoder runtime.Decoder) (T, error) { + obj, _, err := decoder.Decode(data, nil, nil) + if err != nil { + return *new(T), fmt.Errorf("can't decode object: %w", err) + } + + typedObj, ok := obj.(T) + if !ok { + return *new(T), fmt.Errorf("can't cast decoded object of type %T", obj) // err is nil on this path, so there is nothing to wrap + } + + return typedObj, nil +} + +func RenderAndDecode[T runtime.Object](tmpl *template.Template, inputs any, decoder runtime.Decoder) (T, string, error) { + renderedBytes, err := RenderTemplate(tmpl, inputs) + if err != nil { + return *new(T), "", fmt.Errorf("can't render template: %w", err) + } + + obj, err := Decode[T](renderedBytes, decoder) + if err != nil { + // Rendered templates can contain secret data that we can't log in the regular flow. + var redactedString string + switch runtime.Object(*new(T)).(type) { + case *corev1.Secret: + redactedString = "" + default: + redactedString = string(renderedBytes) + } + klog.Errorf("Can't decode rendered template %q: %v. 
Template:\n%s", tmpl.Name(), err, redactedString) + return *new(T), string(renderedBytes), fmt.Errorf("can't decode rendered template %q: %w", tmpl.Name(), err) + } + + return obj, string(renderedBytes), nil +} diff --git a/pkg/assets/object_template.go b/pkg/assets/object_template.go new file mode 100644 index 00000000000..0232c0917ed --- /dev/null +++ b/pkg/assets/object_template.go @@ -0,0 +1,34 @@ +package assets + +import ( + "fmt" + "text/template" + + "github.com/scylladb/scylla-operator/pkg/helpers" + "k8s.io/apimachinery/pkg/runtime" +) + +type ObjectTemplate[T runtime.Object] struct { + tmpl *template.Template + decoder runtime.Decoder +} + +func ParseObjectTemplate[T runtime.Object](name, tmplString string, funcMap template.FuncMap, decoder runtime.Decoder) (ObjectTemplate[T], error) { + tmpl, err := template.New(name).Funcs(funcMap).Parse(tmplString) + if err != nil { + return *new(ObjectTemplate[T]), fmt.Errorf("can't parse template %q: %w", name, err) + } + + return ObjectTemplate[T]{ + tmpl: tmpl, + decoder: decoder, + }, nil +} + +func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string, funcMap template.FuncMap, decoder runtime.Decoder) ObjectTemplate[T] { + return helpers.Must(ParseObjectTemplate[T](name, tmplString, funcMap, decoder)) +} + +func (t *ObjectTemplate[T]) RenderObject(inputs any) (T, string, error) { + return RenderAndDecode[T](t.tmpl, inputs, t.decoder) +} diff --git a/pkg/assets/template.go b/pkg/assets/template.go index c6784b2e7af..2c15acbb0ec 100644 --- a/pkg/assets/template.go +++ b/pkg/assets/template.go @@ -2,16 +2,50 @@ package assets import ( "bytes" + "encoding/base64" "fmt" + "strings" "text/template" + + "sigs.k8s.io/yaml" ) -func RenderTemplate(tmpl *template.Template, data any) ([]byte, error) { +var TemplateFuncs template.FuncMap = template.FuncMap{ + "toYAML": marshalYAML, + "indent": indent, + "nindent": nindent, + "toBytes": toBytes, + "toBase64": toBase64, +} + +func marshalYAML(v any) (string, 
error) { + bytes, err := yaml.Marshal(v) + return strings.TrimSpace(string(bytes)), err +} + +func indent(spaceCount int, s string) string { + spaces := strings.Repeat(" ", spaceCount) + return spaces + strings.Replace(s, "\n", "\n"+spaces, -1) +} + +func nindent(spaceCount int, s string) string { + return "\n" + indent(spaceCount, s) +} + +func toBytes(s string) []byte { + return []byte(s) +} + +func toBase64(data []byte) string { + return base64.StdEncoding.EncodeToString(data) +} + +func RenderTemplate(tmpl *template.Template, inputs any) ([]byte, error) { // We always want correctness. (Accidentally missing a key might have side effects.) tmpl.Option("missingkey=error") var buf bytes.Buffer - err := tmpl.Execute(&buf, data) + err := tmpl.Execute(&buf, inputs) if err != nil { return nil, fmt.Errorf("can't execute template %q: %w", tmpl.Name(), err) } diff --git a/pkg/cmd/operator/operator.go b/pkg/cmd/operator/operator.go index 2b9570a9b22..cd3fd391377 100644 --- a/pkg/cmd/operator/operator.go +++ b/pkg/cmd/operator/operator.go @@ -12,7 +12,10 @@ import ( "github.com/scylladb/scylla-operator/pkg/controller/nodeconfigpod" "github.com/scylladb/scylla-operator/pkg/controller/orphanedpv" "github.com/scylladb/scylla-operator/pkg/controller/scyllacluster" + "github.com/scylladb/scylla-operator/pkg/controller/scylladbmonitoring" "github.com/scylladb/scylla-operator/pkg/controller/scyllaoperatorconfig" + monitoringversionedclient "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned" + monitoringinformers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/informers/externalversions" "github.com/scylladb/scylla-operator/pkg/genericclioptions" "github.com/scylladb/scylla-operator/pkg/leaderelection" "github.com/scylladb/scylla-operator/pkg/naming" @@ -34,8 +37,9 @@ type OperatorOptions struct { genericclioptions.InClusterReflection genericclioptions.LeaderElection - kubeClient kubernetes.Interface - scyllaClient 
scyllaversionedclient.Interface + kubeClient kubernetes.Interface + scyllaClient scyllaversionedclient.Interface + monitoringClient monitoringversionedclient.Interface ConcurrentSyncs int OperatorImage string @@ -140,6 +144,11 @@ func (o *OperatorOptions) Complete() error { return fmt.Errorf("can't build scylla clientset: %w", err) } + o.monitoringClient, err = monitoringversionedclient.NewForConfig(o.RestConfig) + if err != nil { + return fmt.Errorf("can't build monitoring clientset: %w", err) + } + return nil } @@ -183,6 +192,8 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS }, )) + monitoringInformers := monitoringinformers.NewSharedInformerFactory(o.monitoringClient, resyncPeriod) + scc, err := scyllacluster.NewController( o.kubeClient, o.scyllaClient.ScyllaV1(), @@ -200,7 +211,7 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS o.CQLSIngressPort, ) if err != nil { - return err + return fmt.Errorf("can't create scyllacluster controller: %w", err) } opc, err := orphanedpv.NewController( @@ -211,7 +222,7 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS scyllaInformers.Scylla().V1().ScyllaClusters(), ) if err != nil { - return err + return fmt.Errorf("can't create orphanpv controller: %w", err) } ncc, err := nodeconfig.NewController( @@ -227,6 +238,9 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS kubeInformers.Core().V1().ServiceAccounts(), o.OperatorImage, ) + if err != nil { + return fmt.Errorf("can't create nodeconfig controller: %w", err) + } ncpc, err := nodeconfigpod.NewController( o.kubeClient, @@ -236,12 +250,39 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS kubeInformers.Core().V1().Nodes(), scyllaInformers.Scylla().V1alpha1().NodeConfigs(), ) + if err != nil { + return fmt.Errorf("can't create nodeconfigpod controller: %w", err) + } socc, err := 
scyllaoperatorconfig.NewController( o.kubeClient, o.scyllaClient.ScyllaV1alpha1(), scyllaOperatorConfigInformers.Scylla().V1alpha1().ScyllaOperatorConfigs(), ) + if err != nil { + return fmt.Errorf("can't create scyllaoperatorconfig controller: %w", err) + } + + mc, err := scylladbmonitoring.NewController( + o.kubeClient, + o.scyllaClient.ScyllaV1alpha1(), + o.monitoringClient.MonitoringV1(), + kubeInformers.Core().V1().ConfigMaps(), + kubeInformers.Core().V1().Secrets(), + kubeInformers.Core().V1().Services(), + kubeInformers.Core().V1().ServiceAccounts(), + kubeInformers.Rbac().V1().RoleBindings(), + kubeInformers.Policy().V1().PodDisruptionBudgets(), + kubeInformers.Apps().V1().Deployments(), + kubeInformers.Networking().V1().Ingresses(), + scyllaInformers.Scylla().V1alpha1().ScyllaDBMonitorings(), + monitoringInformers.Monitoring().V1().Prometheuses(), + monitoringInformers.Monitoring().V1().PrometheusRules(), + monitoringInformers.Monitoring().V1().ServiceMonitors(), + ) + if err != nil { + return fmt.Errorf("can't create scylladbmonitoring controller: %w", err) + } var wg sync.WaitGroup defer wg.Wait() @@ -264,6 +305,12 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS scyllaOperatorConfigInformers.Start(ctx.Done()) }() + wg.Add(1) + go func() { + defer wg.Done() + monitoringInformers.Start(ctx.Done()) + }() + wg.Add(1) go func() { defer wg.Done() @@ -294,6 +341,12 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS socc.Run(ctx, o.ConcurrentSyncs) }() + wg.Add(1) + go func() { + defer wg.Done() + mc.Run(ctx, o.ConcurrentSyncs) + }() + <-ctx.Done() return nil diff --git a/pkg/controller/scylladbmonitoring/conditions.go b/pkg/controller/scylladbmonitoring/conditions.go new file mode 100644 index 00000000000..c7e6ffb189c --- /dev/null +++ b/pkg/controller/scylladbmonitoring/conditions.go @@ -0,0 +1,8 @@ +package scylladbmonitoring + +const ( + prometheusControllerProgressingCondition = 
"PrometheusControllerProgressing" + prometheusControllerDegradedCondition = "PrometheusControllerDegraded" + grafanaControllerProgressingCondition = "GrafanaControllerProgressing" + grafanaControllerDegradedCondition = "GrafanaControllerDegraded" +) diff --git a/pkg/controller/scylladbmonitoring/controller.go b/pkg/controller/scylladbmonitoring/controller.go new file mode 100644 index 00000000000..762061949ee --- /dev/null +++ b/pkg/controller/scylladbmonitoring/controller.go @@ -0,0 +1,560 @@ +package scylladbmonitoring + +import ( + "context" + "fmt" + "sync" + "time" + + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + scyllav1alpha1client "github.com/scylladb/scylla-operator/pkg/client/scylla/clientset/versioned/typed/scylla/v1alpha1" + scyllav1alpha1informers "github.com/scylladb/scylla-operator/pkg/client/scylla/informers/externalversions/scylla/v1alpha1" + scyllav1alpha1listers "github.com/scylladb/scylla-operator/pkg/client/scylla/listers/scylla/v1alpha1" + "github.com/scylladb/scylla-operator/pkg/controllerhelpers" + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + monitoringv1client "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned/typed/monitoring/v1" + monitoringv1informers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/informers/externalversions/monitoring/v1" + monitoringv1listers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/listers/monitoring/v1" + "github.com/scylladb/scylla-operator/pkg/kubeinterfaces" + "github.com/scylladb/scylla-operator/pkg/scheme" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + policyv1 "k8s.io/api/policy/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" 
+ appsv1informers "k8s.io/client-go/informers/apps/v1" + corev1informers "k8s.io/client-go/informers/core/v1" + networkingv1informers "k8s.io/client-go/informers/networking/v1" + policyv1informers "k8s.io/client-go/informers/policy/v1" + rbacv1informers "k8s.io/client-go/informers/rbac/v1" + "k8s.io/client-go/kubernetes" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + appsv1listers "k8s.io/client-go/listers/apps/v1" + corev1listers "k8s.io/client-go/listers/core/v1" + networkingv1listers "k8s.io/client-go/listers/networking/v1" + policyv1listers "k8s.io/client-go/listers/policy/v1" + rbacv1listers "k8s.io/client-go/listers/rbac/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + "k8s.io/component-base/metrics/prometheus/ratelimiter" + "k8s.io/klog/v2" +) + +const ( + ControllerName = "ScyllaDBMonitoringController" +) + +var ( + keyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc + scylladbMonitoringControllerGVK = scyllav1alpha1.GroupVersion.WithKind("ScyllaDBMonitoring") +) + +type Controller struct { + kubeClient kubernetes.Interface + scyllaV1alpha1Client scyllav1alpha1client.ScyllaV1alpha1Interface + monitoringClient monitoringv1client.MonitoringV1Interface + + configMapLister corev1listers.ConfigMapLister + secretLister corev1listers.SecretLister + serviceLister corev1listers.ServiceLister + serviceAccountLister corev1listers.ServiceAccountLister + roleBindingLister rbacv1listers.RoleBindingLister + pdbLister policyv1listers.PodDisruptionBudgetLister + deploymentLister appsv1listers.DeploymentLister + ingressLister networkingv1listers.IngressLister + + scylladbMonitoringLister scyllav1alpha1listers.ScyllaDBMonitoringLister + + prometheusLister monitoringv1listers.PrometheusLister + prometheusRuleLister monitoringv1listers.PrometheusRuleLister + serviceMonitorLister monitoringv1listers.ServiceMonitorLister + + cachesToSync []cache.InformerSynced + + eventRecorder record.EventRecorder + + 
queue workqueue.RateLimitingInterface + handlers *controllerhelpers.Handlers[*scyllav1alpha1.ScyllaDBMonitoring] +} + +func NewController( + kubeClient kubernetes.Interface, + scyllaV1alpha1Client scyllav1alpha1client.ScyllaV1alpha1Interface, + monitoringClient monitoringv1client.MonitoringV1Interface, + configMapInformer corev1informers.ConfigMapInformer, + secretInformer corev1informers.SecretInformer, + serviceInformer corev1informers.ServiceInformer, + serviceAccountInformer corev1informers.ServiceAccountInformer, + roleBindingInformer rbacv1informers.RoleBindingInformer, + pdbInformer policyv1informers.PodDisruptionBudgetInformer, + deploymentInformer appsv1informers.DeploymentInformer, + ingressInformer networkingv1informers.IngressInformer, + scyllaDBMonitoringInformer scyllav1alpha1informers.ScyllaDBMonitoringInformer, + prometheusInformer monitoringv1informers.PrometheusInformer, + prometheusRuleInformer monitoringv1informers.PrometheusRuleInformer, + serviceMonitorInformer monitoringv1informers.ServiceMonitorInformer, +) (*Controller, error) { + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartStructuredLogging(0) + eventBroadcaster.StartRecordingToSink(&corev1client.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + + if kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil { + err := ratelimiter.RegisterMetricAndTrackRateLimiterUsage( + "scylladbmonitoring_controller", + kubeClient.CoreV1().RESTClient().GetRateLimiter(), + ) + if err != nil { + return nil, err + } + } + + smc := &Controller{ + kubeClient: kubeClient, + scyllaV1alpha1Client: scyllaV1alpha1Client, + monitoringClient: monitoringClient, + + secretLister: secretInformer.Lister(), + configMapLister: configMapInformer.Lister(), + serviceLister: serviceInformer.Lister(), + serviceAccountLister: serviceAccountInformer.Lister(), + roleBindingLister: roleBindingInformer.Lister(), + pdbLister: pdbInformer.Lister(), + deploymentLister: deploymentInformer.Lister(), + 
ingressLister: ingressInformer.Lister(), + + scylladbMonitoringLister: scyllaDBMonitoringInformer.Lister(), + + prometheusLister: prometheusInformer.Lister(), + prometheusRuleLister: prometheusRuleInformer.Lister(), + serviceMonitorLister: serviceMonitorInformer.Lister(), + + cachesToSync: []cache.InformerSynced{ + secretInformer.Informer().HasSynced, + configMapInformer.Informer().HasSynced, + serviceInformer.Informer().HasSynced, + serviceAccountInformer.Informer().HasSynced, + roleBindingInformer.Informer().HasSynced, + pdbInformer.Informer().HasSynced, + deploymentInformer.Informer().HasSynced, + ingressInformer.Informer().HasSynced, + + scyllaDBMonitoringInformer.Informer().HasSynced, + + prometheusInformer.Informer().HasSynced, + prometheusRuleInformer.Informer().HasSynced, + serviceMonitorInformer.Informer().HasSynced, + }, + + eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "scylladbmonitoring-controller"}), + + queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "scylladbmonitoring"), + } + + var err error + smc.handlers, err = controllerhelpers.NewHandlers[*scyllav1alpha1.ScyllaDBMonitoring]( + smc.queue, + keyFunc, + scheme.Scheme, + scylladbMonitoringControllerGVK, + kubeinterfaces.NamespacedGetList[*scyllav1alpha1.ScyllaDBMonitoring]{ + GetFunc: func(namespace, name string) (*scyllav1alpha1.ScyllaDBMonitoring, error) { + return smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).Get(name) + }, + ListFunc: func(namespace string, selector labels.Selector) (ret []*scyllav1alpha1.ScyllaDBMonitoring, err error) { + return smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).List(selector) + }, + }, + ) + if err != nil { + return nil, fmt.Errorf("can't create handlers: %w", err) + } + + scyllaDBMonitoringInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addScyllaDBMonitoring, + UpdateFunc: smc.updateScyllaDBMonitoring, + DeleteFunc: 
smc.deleteScyllaDBMonitoring, + }) + + configMapInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addConfigMap, + UpdateFunc: smc.updateConfigMap, + DeleteFunc: smc.deleteConfigMap, + }) + + secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addSecret, + UpdateFunc: smc.updateSecret, + DeleteFunc: smc.deleteSecret, + }) + + serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addService, + UpdateFunc: smc.updateService, + DeleteFunc: smc.deleteService, + }) + + serviceAccountInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addServiceAccount, + UpdateFunc: smc.updateServiceAccount, + DeleteFunc: smc.deleteServiceAccount, + }) + + pdbInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addPodDisruptionBudget, + UpdateFunc: smc.updatePodDisruptionBudget, + DeleteFunc: smc.deletePodDisruptionBudget, + }) + + deploymentInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addDeployment, + UpdateFunc: smc.updateDeployment, + DeleteFunc: smc.deleteDeployment, + }) + + ingressInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addIngress, + UpdateFunc: smc.updateIngress, + DeleteFunc: smc.deleteIngress, + }) + + prometheusInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addPrometheus, + UpdateFunc: smc.updatePrometheus, + DeleteFunc: smc.deletePrometheus, + }) + + prometheusRuleInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addPrometheusRule, + UpdateFunc: smc.updatePrometheusRule, + DeleteFunc: smc.deletePrometheusRule, + }) + + serviceMonitorInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: smc.addServiceMonitor, + UpdateFunc: smc.updateServiceMonitor, + DeleteFunc: smc.deleteServiceMonitor, + }) + + return smc, nil +} + +func (smc 
*Controller) addScyllaDBMonitoring(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*scyllav1alpha1.ScyllaDBMonitoring), + smc.handlers.Enqueue, + ) +} + +func (smc *Controller) updateScyllaDBMonitoring(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*scyllav1alpha1.ScyllaDBMonitoring), + cur.(*scyllav1alpha1.ScyllaDBMonitoring), + smc.handlers.Enqueue, + smc.deleteScyllaDBMonitoring, + ) +} + +func (smc *Controller) deleteScyllaDBMonitoring(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.Enqueue, + ) +} + +func (smc *Controller) addConfigMap(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*corev1.ConfigMap), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateConfigMap(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*corev1.ConfigMap), + cur.(*corev1.ConfigMap), + smc.handlers.EnqueueOwner, + smc.deleteConfigMap, + ) +} + +func (smc *Controller) deleteConfigMap(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addSecret(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*corev1.Secret), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateSecret(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*corev1.Secret), + cur.(*corev1.Secret), + smc.handlers.EnqueueOwner, + smc.deleteSecret, + ) +} + +func (smc *Controller) deleteSecret(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addService(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*corev1.Service), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateService(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*corev1.Service), + cur.(*corev1.Service), + smc.handlers.EnqueueOwner, + smc.deleteService, + ) +} + +func (smc *Controller) deleteService(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc 
*Controller) addServiceAccount(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*corev1.ServiceAccount), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateServiceAccount(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*corev1.ServiceAccount), + cur.(*corev1.ServiceAccount), + smc.handlers.EnqueueOwner, + smc.deleteServiceAccount, + ) +} + +func (smc *Controller) deleteServiceAccount(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addPodDisruptionBudget(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*policyv1.PodDisruptionBudget), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updatePodDisruptionBudget(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*policyv1.PodDisruptionBudget), + cur.(*policyv1.PodDisruptionBudget), + smc.handlers.EnqueueOwner, + smc.deletePodDisruptionBudget, + ) +} + +func (smc *Controller) deletePodDisruptionBudget(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addDeployment(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*appsv1.Deployment), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateDeployment(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*appsv1.Deployment), + cur.(*appsv1.Deployment), + smc.handlers.EnqueueOwner, + smc.deleteDeployment, + ) +} + +func (smc *Controller) deleteDeployment(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addIngress(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*networkingv1.Ingress), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateIngress(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*networkingv1.Ingress), + cur.(*networkingv1.Ingress), + smc.handlers.EnqueueOwner, + smc.deleteIngress, + ) +} + +func (smc *Controller) deleteIngress(obj interface{}) { + 
smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addPrometheus(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*monitoringv1.Prometheus), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updatePrometheus(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*monitoringv1.Prometheus), + cur.(*monitoringv1.Prometheus), + smc.handlers.EnqueueOwner, + smc.deletePrometheus, + ) +} + +func (smc *Controller) deletePrometheus(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addPrometheusRule(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*monitoringv1.PrometheusRule), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updatePrometheusRule(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*monitoringv1.PrometheusRule), + cur.(*monitoringv1.PrometheusRule), + smc.handlers.EnqueueOwner, + smc.deletePrometheusRule, + ) +} + +func (smc *Controller) deletePrometheusRule(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) addServiceMonitor(obj interface{}) { + smc.handlers.HandleAdd( + obj.(*monitoringv1.ServiceMonitor), + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) updateServiceMonitor(old, cur interface{}) { + smc.handlers.HandleUpdate( + old.(*monitoringv1.ServiceMonitor), + cur.(*monitoringv1.ServiceMonitor), + smc.handlers.EnqueueOwner, + smc.deleteServiceMonitor, + ) +} + +func (smc *Controller) deleteServiceMonitor(obj interface{}) { + smc.handlers.HandleDelete( + obj, + smc.handlers.EnqueueOwner, + ) +} + +func (smc *Controller) processNextItem(ctx context.Context) bool { + key, quit := smc.queue.Get() + if quit { + return false + } + defer smc.queue.Done(key) + + err := smc.sync(ctx, key.(string)) + // TODO: Do smarter filtering then just Reduce to handle cases like 2 conflict errors. 
+ err = utilerrors.Reduce(err) + switch { + case err == nil: + smc.queue.Forget(key) + return true + + case apierrors.IsConflict(err): + klog.V(2).InfoS("Hit conflict, will retry in a bit", "Key", key, "Error", err) + + case apierrors.IsAlreadyExists(err): + klog.V(2).InfoS("Hit already exists, will retry in a bit", "Key", key, "Error", err) + + default: + utilruntime.HandleError(fmt.Errorf("syncing key '%v' failed: %v", key, err)) + } + + smc.queue.AddRateLimited(key) + + return true +} + +func (smc *Controller) runWorker(ctx context.Context) { + for smc.processNextItem(ctx) { + } +} + +func (smc *Controller) Run(ctx context.Context, workers int) { + defer utilruntime.HandleCrash() + + klog.InfoS("Starting controller", "controller", "ScyllaDBMonitoring") + + var wg sync.WaitGroup + defer func() { + klog.InfoS("Shutting down controller", "controller", "ScyllaDBMonitoring") + smc.queue.ShutDown() + wg.Wait() + klog.InfoS("Shut down controller", "controller", "ScyllaDBMonitoring") + }() + + if !cache.WaitForNamedCacheSync(ControllerName, ctx.Done(), smc.cachesToSync...) { + return + } + + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + wait.UntilWithContext(ctx, smc.runWorker, time.Second) + }() + } + + <-ctx.Done() +} diff --git a/pkg/controller/scylladbmonitoring/status.go b/pkg/controller/scylladbmonitoring/status.go new file mode 100644 index 00000000000..565799c2152 --- /dev/null +++ b/pkg/controller/scylladbmonitoring/status.go @@ -0,0 +1,41 @@ +package scylladbmonitoring + +import ( + "context" + + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/utils/pointer" +) + +// calculateStatus calculates the ScyllaDBMonitoring status. +// This function should always succeed. Do not return an error. 
+// If a particular object can be missing, it should be reflected in the value itself, like "Unknown" or "". +func (smc *Controller) calculateStatus(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.ScyllaDBMonitoringStatus { + status := sm.Status.DeepCopy() + status.ObservedGeneration = pointer.Int64(sm.Generation) + + return status +} + +func (smc *Controller) updateStatus(ctx context.Context, currentSM *scyllav1alpha1.ScyllaDBMonitoring, status *scyllav1alpha1.ScyllaDBMonitoringStatus) error { + if apiequality.Semantic.DeepEqual(&currentSM.Status, status) { + return nil + } + + sm := currentSM.DeepCopy() + sm.Status = *status + + klog.V(2).InfoS("Updating status", "ScyllaDBMonitoring", klog.KObj(sm)) + + _, err := smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).UpdateStatus(ctx, sm, metav1.UpdateOptions{}) + if err != nil { + return err + } + + klog.V(2).InfoS("Status updated", "ScyllaDBMonitoring", klog.KObj(sm)) + + return nil +} diff --git a/pkg/controller/scylladbmonitoring/sync.go b/pkg/controller/scylladbmonitoring/sync.go new file mode 100644 index 00000000000..03b5c1ca9bc --- /dev/null +++ b/pkg/controller/scylladbmonitoring/sync.go @@ -0,0 +1,283 @@ +package scylladbmonitoring + +import ( + "context" + "fmt" + "time" + + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + "github.com/scylladb/scylla-operator/pkg/controllerhelpers" + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + "github.com/scylladb/scylla-operator/pkg/naming" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" +) + +func getLabels(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Set { + return labels.Set{ + 
naming.ScyllaDBMonitoringNameLabel: sm.Name,
+	}
+}
+
+func getSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector {
+	return labels.SelectorFromSet(getLabels(sm))
+}
+
+// sync handles one queue key: it reads the ScyllaDBMonitoring from the lister, gathers the objects matching its selector, runs the Prometheus and Grafana syncs and updates the status (only the status when DeletionTimestamp is set).
+func (smc *Controller) sync(ctx context.Context, key string) error {
+	namespace, name, err := cache.SplitMetaNamespaceKey(key)
+	if err != nil {
+		return fmt.Errorf("can't split meta namespace cache key %q: %w", key, err)
+	}
+
+	startTime := time.Now()
+	klog.V(4).InfoS("Started syncing ScyllaDBMonitoring", "ScyllaDBMonitoring", klog.KRef(namespace, name), "startTime", startTime)
+	defer func() {
+		klog.V(4).InfoS("Finished syncing ScyllaDBMonitoring", "ScyllaDBMonitoring", klog.KRef(namespace, name), "duration", time.Since(startTime))
+	}()
+
+	sm, err := smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).Get(name)
+	if errors.IsNotFound(err) {
+		// sm is not valid when Get fails, so log the reference built from the key instead.
+		klog.V(2).InfoS("ScyllaDBMonitoring has been deleted", "ScyllaDBMonitoring", klog.KRef(namespace, name))
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("can't get object %q from cache: %w", naming.ManualRef(namespace, name), err)
+	}
+
+	smSelector := getSelector(sm)
+
+	type CT = *scyllav1alpha1.ScyllaDBMonitoring
+	var objectErrs []error
+
+	configMaps, err := controllerhelpers.GetObjects[CT, *corev1.ConfigMap](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.ConfigMap]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.configMapLister.ConfigMaps(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get config maps: %w", err))
+	}
+
+	secrets, err := controllerhelpers.GetObjects[CT, *corev1.Secret](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.Secret]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.secretLister.Secrets(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().Secrets(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get secrets: %w", err))
+	}
+
+	services, err := controllerhelpers.GetObjects[CT, *corev1.Service](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.Service]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceLister.Services(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().Services(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get services: %w", err))
+	}
+
+	serviceAccounts, err := controllerhelpers.GetObjects[CT, *corev1.ServiceAccount](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.ServiceAccount]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceAccountLister.ServiceAccounts(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get service accounts: %w", err))
+	}
+
+	roleBindings, err := controllerhelpers.GetObjects[CT, *rbacv1.RoleBinding](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *rbacv1.RoleBinding]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.roleBindingLister.RoleBindings(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get role bindings: %w", err))
+	}
+
+	deployments, err := controllerhelpers.GetObjects[CT, *appsv1.Deployment](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *appsv1.Deployment]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.deploymentLister.Deployments(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.AppsV1().Deployments(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get deployments: %w", err))
+	}
+
+	ingresses, err := controllerhelpers.GetObjects[CT, *networkingv1.Ingress](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *networkingv1.Ingress]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.ingressLister.Ingresses(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get ingresses: %w", err))
+	}
+
+	prometheuses, err := controllerhelpers.GetObjects[CT, *monitoringv1.Prometheus](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.Prometheus]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.prometheusLister.Prometheuses(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.Prometheuses(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get prometheuses: %w", err))
+	}
+
+	prometheusRules, err := controllerhelpers.GetObjects[CT, *monitoringv1.PrometheusRule](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.PrometheusRule]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.prometheusRuleLister.PrometheusRules(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.PrometheusRules(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get prometheus rules: %w", err))
+	}
+
+	serviceMonitors, err := controllerhelpers.GetObjects[CT, *monitoringv1.ServiceMonitor](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.ServiceMonitor]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceMonitorLister.ServiceMonitors(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.ServiceMonitors(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get service monitors: %w", err))
+	}
+
+	if objectErr := utilerrors.NewAggregate(objectErrs); objectErr != nil {
+		return objectErr
+	}
+
+	prometheusSelector := getPrometheusSelector(sm)
+	grafanaSelector := getGrafanaSelector(sm)
+
+	status := smc.calculateStatus(sm)
+
+	if sm.DeletionTimestamp != nil {
+		return smc.updateStatus(ctx, sm, status)
+	}
+
+	var errs []error
+
+	err = controllerhelpers.RunSync(
+		&status.Conditions,
+		prometheusControllerProgressingCondition,
+		prometheusControllerDegradedCondition,
+		sm.Generation,
+		func() ([]metav1.Condition, error) {
+			return smc.syncPrometheus(
+				ctx,
+				sm,
+				controllerhelpers.FilterObjectMapByLabel(configMaps, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(secrets, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(services, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceAccounts, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(roleBindings, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(ingresses, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(prometheuses, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(prometheusRules, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceMonitors, prometheusSelector),
+			)
+		},
+	)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't sync prometheus: %w", err))
+	}
+
+	err = controllerhelpers.RunSync(
+		&status.Conditions,
+		grafanaControllerProgressingCondition,
+		grafanaControllerDegradedCondition,
+		sm.Generation,
+		func() ([]metav1.Condition, error) {
+			return smc.syncGrafana(
+				ctx,
+				sm,
+				controllerhelpers.FilterObjectMapByLabel(configMaps, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(secrets, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(services, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceAccounts, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(deployments, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(ingresses, grafanaSelector),
+			)
+		},
+	)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't sync grafana: %w", err))
+	}
+
+	// Aggregate conditions.
+	err = controllerhelpers.SetAggregatedWorkloadConditions(&status.Conditions, sm.Generation)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't aggregate workload conditions: %w", err))
+	} else {
+		errs = append(errs, smc.updateStatus(ctx, sm, status))
+	}
+
+	return utilerrors.NewAggregate(errs)
+}
diff --git a/pkg/controller/scylladbmonitoring/sync_grafana.go b/pkg/controller/scylladbmonitoring/sync_grafana.go
new file mode 100644
index 00000000000..828b577259c
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync_grafana.go
@@ -0,0 +1,509 @@
+package scylladbmonitoring
+
+import (
+	"context"
+	"crypto/x509/pkix"
+	"fmt"
+	"time"
+
+	grafanav1alpha1assets "github.com/scylladb/scylla-operator/assets/monitoring/grafana/v1alpha1"
+	scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+	"github.com/scylladb/scylla-operator/pkg/controllerhelpers"
+	ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
+	"github.com/scylladb/scylla-operator/pkg/helpers"
+	okubecrypto "github.com/scylladb/scylla-operator/pkg/kubecrypto"
+	"github.com/scylladb/scylla-operator/pkg/naming"
+	"github.com/scylladb/scylla-operator/pkg/resource"
+	"github.com/scylladb/scylla-operator/pkg/resourceapply"
+	"github.com/scylladb/scylla-operator/pkg/resourcemerge"
+	"github.com/scylladb/scylla-operator/pkg/util/hash"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	networkingv1 "k8s.io/api/networking/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	kutilerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/apimachinery/pkg/util/rand"
+	"k8s.io/utils/pointer"
+)
+
+const (
+	grafanaPasswordLength = 20
+)
+
+func getGrafanaLabels(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Set {
+	return helpers.MergeMaps(
+		getLabels(sm),
+		labels.Set{
+			naming.ControllerNameLabel: "grafana",
+		},
+	)
+}
+
+func getGrafanaSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector {
+	return 
labels.SelectorFromSet(getGrafanaLabels(sm)) +} + +func getGrafanaSpec(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.GrafanaSpec { + if sm.Spec.Components != nil { + return sm.Spec.Components.Grafana + } + + return nil +} + +func getGrafanaIngressOptions(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.IngressOptions { + spec := getGrafanaSpec(sm) + if spec != nil && + spec.ExposeOptions != nil && + spec.ExposeOptions.WebInterface != nil { + return spec.ExposeOptions.WebInterface.Ingress + } + + return nil +} + +func getGrafanaIngressDomains(sm *scyllav1alpha1.ScyllaDBMonitoring) []string { + ingressOptions := getGrafanaIngressOptions(sm) + if ingressOptions != nil { + return ingressOptions.DNSDomains + } + + return nil +} + +func makeGrafanaDeployment(sm *scyllav1alpha1.ScyllaDBMonitoring, grafanaServingCertSecretName string, restartTriggerHash string) (*appsv1.Deployment, string, error) { + spec := getGrafanaSpec(sm) + + var affinity corev1.Affinity + var tolerations []corev1.Toleration + if spec != nil && spec.Placement != nil { + affinity.NodeAffinity = spec.Placement.NodeAffinity + affinity.PodAffinity = spec.Placement.PodAffinity + affinity.PodAntiAffinity = spec.Placement.PodAntiAffinity + + tolerations = spec.Placement.Tolerations + } + + var resources corev1.ResourceRequirements + if spec != nil { + resources = spec.Resources + } + + return grafanav1alpha1assets.GrafanaDeploymentTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + "servingCertSecretName": grafanaServingCertSecretName, + "affinity": affinity, + "tolerations": tolerations, + "resources": resources, + "restartTriggerHash": restartTriggerHash, + }) +} + +func makeGrafanaAdminCredentials(sm *scyllav1alpha1.ScyllaDBMonitoring, secrets map[string]*corev1.Secret) (*corev1.Secret, string, error) { + var existingPassword []byte + + secretName := sm.Name + "-grafana-admin-credentials" + existingSecret, found := secrets[secretName] + if found { + existingPassword 
= existingSecret.Data["password"] + } + + if len(existingPassword) == 0 { + existingPassword = []byte(rand.String(grafanaPasswordLength)) + } + + return grafanav1alpha1assets.GrafanaAdminCredentialsSecretTemplate.RenderObject(map[string]any{ + "name": secretName, + "password": existingPassword, + }) +} + +func makeGrafanaSA(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ServiceAccount, string, error) { + return grafanav1alpha1assets.GrafanaSATemplate.RenderObject(map[string]any{ + "namespace": sm.Namespace, + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeGrafanaConfigs(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) { + enableAnonymousAccess := false + spec := getGrafanaSpec(sm) + if spec != nil { + enableAnonymousAccess = spec.Authentication.InsecureEnableAnonymousAccess + } + + return grafanav1alpha1assets.GrafanaConfigsTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + "enableAnonymousAccess": enableAnonymousAccess, + }) +} + +func makeGrafanaDashboards(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) { + return grafanav1alpha1assets.GrafanaDashboardsConfigMapTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeGrafanaProvisionings(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) { + return grafanav1alpha1assets.GrafanaProvisioningConfigMapTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeGrafanaService(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.Service, string, error) { + return grafanav1alpha1assets.GrafanaServiceTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeGrafanaIngress(sm *scyllav1alpha1.ScyllaDBMonitoring) (*networkingv1.Ingress, string, error) { + ingressOptions := getGrafanaIngressOptions(sm) + if ingressOptions == nil { + return nil, "", nil + } + + if ingressOptions.Disabled != nil && 
*ingressOptions.Disabled == true { + return nil, "", nil + } + + if len(ingressOptions.DNSDomains) == 0 { + return nil, "", nil + } + + return grafanav1alpha1assets.GrafanaIngressTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + "dnsDomains": ingressOptions.DNSDomains, + "ingressAnnotations": ingressOptions.Annotations, + "ingressClassName": ingressOptions.IngressClassName, + }) +} + +func (smc *Controller) syncGrafana( + ctx context.Context, + sm *scyllav1alpha1.ScyllaDBMonitoring, + configMaps map[string]*corev1.ConfigMap, + secrets map[string]*corev1.Secret, + services map[string]*corev1.Service, + serviceAccounts map[string]*corev1.ServiceAccount, + deployments map[string]*appsv1.Deployment, + ingresses map[string]*networkingv1.Ingress, +) ([]metav1.Condition, error) { + var progressingConditions []metav1.Condition + + grafanaServingCertChainConfig := &okubecrypto.CertChainConfig{ + CAConfig: &okubecrypto.CAConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-grafana-serving-ca", sm.Name), + Labels: getGrafanaLabels(sm), + }, + Validity: 10 * 365 * 24 * time.Hour, + Refresh: 8 * 365 * 24 * time.Hour, + }, + CABundleConfig: &okubecrypto.CABundleConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-grafana-serving-ca", sm.Name), + Labels: getGrafanaLabels(sm), + }, + }, + CertConfigs: []*okubecrypto.CertificateConfig{ + { + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-grafana-serving-certs", sm.Name), + Labels: getGrafanaLabels(sm), + }, + Validity: 30 * 24 * time.Hour, + Refresh: 20 * 24 * time.Hour, + CertCreator: (&ocrypto.ServingCertCreatorConfig{ + Subject: pkix.Name{ + CommonName: "", + }, + IPAddresses: nil, + DNSNames: append( + []string{ + sm.Name + "-grafana", + }, + getGrafanaIngressDomains(sm)..., + ), + }).ToCreator(), + }, + }, + } + + var certChainConfigs okubecrypto.CertChainConfigs + + spec := getGrafanaSpec(sm) + + var grafanaServingCertSecretName string + if spec != nil 
{ + grafanaServingCertSecretName = spec.ServingCertSecretName + } + + if len(grafanaServingCertSecretName) == 0 { + grafanaServingCertSecretName = grafanaServingCertChainConfig.CertConfigs[0].Name + certChainConfigs = append(certChainConfigs, grafanaServingCertChainConfig) + } + + // Render manifests. + var renderErrors []error + + requiredGrafanaSA, _, err := makeGrafanaSA(sm) + renderErrors = append(renderErrors, err) + + requiredConfigsCM, _, err := makeGrafanaConfigs(sm) + renderErrors = append(renderErrors, err) + + requiredDahsboardsCM, _, err := makeGrafanaDashboards(sm) + renderErrors = append(renderErrors, err) + + requiredProvisioningsCM, _, err := makeGrafanaProvisionings(sm) + renderErrors = append(renderErrors, err) + + requiredAdminCredentialsSecret, _, err := makeGrafanaAdminCredentials(sm, secrets) + renderErrors = append(renderErrors, err) + + var requiredDeployment *appsv1.Deployment + // Trigger restart for inputs that are not live reloaded. + grafanaRestartHash, hashErr := hash.HashObjects(requiredConfigsCM, requiredProvisioningsCM) + if hashErr != nil { + renderErrors = append(renderErrors, hashErr) + } else { + requiredDeployment, _, err = makeGrafanaDeployment(sm, grafanaServingCertSecretName, grafanaRestartHash) + renderErrors = append(renderErrors, err) + } + + requiredService, _, err := makeGrafanaService(sm) + renderErrors = append(renderErrors, err) + + requiredIngress, _, err := makeGrafanaIngress(sm) + renderErrors = append(renderErrors, err) + + renderError := kutilerrors.NewAggregate(renderErrors) + if renderError != nil { + return progressingConditions, renderError + } + + // Prune objects. 
+ var pruneErrors []error + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredGrafanaSA), + serviceAccounts, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + append( + []*corev1.ConfigMap{ + requiredConfigsCM, + requiredDahsboardsCM, + requiredProvisioningsCM, + }, + certChainConfigs.GetMetaConfigMaps()..., + ), + configMaps, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + append([]*corev1.Secret{requiredAdminCredentialsSecret}, certChainConfigs.GetMetaSecrets()...), + secrets, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredService), + services, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredDeployment), + deployments, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.FilterOutNil(helpers.ToArray(requiredIngress)), + ingresses, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + pruneError := kutilerrors.NewAggregate(pruneErrors) + if pruneError != nil { + return progressingConditions, 
pruneError + } + + // Apply required objects. + var applyErrors []error + applyConfigurations := []resourceapply.ApplyConfigUntyped{ + resourceapply.ApplyConfig[*corev1.ServiceAccount]{ + Required: requiredGrafanaSA, + Control: resourceapply.ApplyControlFuncs[*corev1.ServiceAccount]{ + GetCachedFunc: smc.serviceAccountLister.ServiceAccounts(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.ConfigMap]{ + Required: requiredConfigsCM, + Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{ + GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.ConfigMap]{ + Required: requiredDahsboardsCM, + Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{ + GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.ConfigMap]{ + Required: requiredProvisioningsCM, + Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{ + GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.Secret]{ + 
Required: requiredAdminCredentialsSecret, + Control: resourceapply.ApplyControlFuncs[*corev1.Secret]{ + GetCachedFunc: smc.secretLister.Secrets(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*appsv1.Deployment]{ + Required: requiredDeployment, + Control: resourceapply.ApplyControlFuncs[*appsv1.Deployment]{ + GetCachedFunc: smc.deploymentLister.Deployments(sm.Namespace).Get, + CreateFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.Service]{ + Required: requiredService, + Control: resourceapply.ApplyControlFuncs[*corev1.Service]{ + GetCachedFunc: smc.serviceLister.Services(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete, + }, + }.ToUntyped(), + } + + if requiredIngress != nil { + applyConfigurations = append(applyConfigurations, resourceapply.ApplyConfig[*networkingv1.Ingress]{ + Required: requiredIngress, + Control: resourceapply.ApplyControlFuncs[*networkingv1.Ingress]{ + GetCachedFunc: smc.ingressLister.Ingresses(sm.Namespace).Get, + CreateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete, + }, + }.ToUntyped()) + } + + for _, cfg := range applyConfigurations { + // Enforce namespace. + cfg.Required.SetNamespace(sm.Namespace) + + // Enforce labels for selection. 
+ if cfg.Required.GetLabels() == nil { + cfg.Required.SetLabels(getGrafanaLabels(sm)) + } else { + resourcemerge.MergeMapInPlaceWithoutRemovalKeys(cfg.Required.GetLabels(), getGrafanaLabels(sm)) + } + + // Set ControllerRef. + cfg.Required.SetOwnerReferences([]metav1.OwnerReference{ + { + APIVersion: scylladbMonitoringControllerGVK.GroupVersion().String(), + Kind: scylladbMonitoringControllerGVK.Kind, + Name: sm.Name, + UID: sm.UID, + Controller: pointer.Bool(true), + BlockOwnerDeletion: pointer.Bool(true), + }, + }) + + // Apply required object. + _, changed, err := resourceapply.ApplyFromConfig(ctx, cfg, smc.eventRecorder) + if changed { + controllerhelpers.AddGenericProgressingStatusCondition(&progressingConditions, grafanaControllerProgressingCondition, cfg.Required, "apply", sm.Generation) + } + if err != nil { + gvk := resource.GetObjectGVKOrUnknown(cfg.Required) + applyErrors = append(applyErrors, fmt.Errorf("can't apply %s: %w", gvk, err)) + } + } + + cm := okubecrypto.NewCertificateManager( + smc.kubeClient.CoreV1(), + smc.secretLister, + smc.kubeClient.CoreV1(), + smc.configMapLister, + smc.eventRecorder, + ) + for _, ccc := range certChainConfigs { + err := cm.ManageCertificateChain( + ctx, + time.Now, + &sm.ObjectMeta, + scylladbMonitoringControllerGVK, + ccc, + secrets, + configMaps, + ) + if err != nil { + applyErrors = append(applyErrors, err) + } + } + + applyError := kutilerrors.NewAggregate(applyErrors) + if applyError != nil { + return progressingConditions, applyError + } + + return progressingConditions, nil +} diff --git a/pkg/controller/scylladbmonitoring/sync_grafana_test.go b/pkg/controller/scylladbmonitoring/sync_grafana_test.go new file mode 100644 index 00000000000..d52160bca15 --- /dev/null +++ b/pkg/controller/scylladbmonitoring/sync_grafana_test.go @@ -0,0 +1,127 @@ +package scylladbmonitoring + +import ( + "reflect" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + scyllav1alpha1 
"github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func Test_makeGrafanaIngress(t *testing.T) { + tt := []struct { + name string + sm *scyllav1alpha1.ScyllaDBMonitoring + expectedString string + expectedErr error + }{ + { + name: "empty annotations", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{ + Components: &scyllav1alpha1.Components{ + Grafana: &scyllav1alpha1.GrafanaSpec{ + ExposeOptions: &scyllav1alpha1.GrafanaExposeOptions{ + WebInterface: &scyllav1alpha1.HTTPSExposeOptions{ + Ingress: &scyllav1alpha1.IngressOptions{ + DNSDomains: []string{"grafana.localhost"}, + }, + }, + }, + }, + }, + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "sm-name-grafana" + annotations: + null +spec: + ingressClassName: null + rules: + - host: "grafana.localhost" + http: + paths: + - backend: + service: + name: "sm-name-grafana" + port: + number: 3000 + path: / + pathType: Prefix +`, "\n"), + expectedErr: nil, + }, + { + name: "supplied annotations", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{ + Components: &scyllav1alpha1.Components{ + Grafana: &scyllav1alpha1.GrafanaSpec{ + ExposeOptions: &scyllav1alpha1.GrafanaExposeOptions{ + WebInterface: &scyllav1alpha1.HTTPSExposeOptions{ + Ingress: &scyllav1alpha1.IngressOptions{ + Annotations: map[string]string{ + "ann1": "ann1val", + "ann2": "ann2val", + }, + DNSDomains: []string{"grafana.localhost"}, + }, + }, + }, + }, + }, + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "sm-name-grafana" + annotations: + ann1: ann1val + ann2: ann2val +spec: + ingressClassName: null + rules: + - host: "grafana.localhost" + http: + paths: + - backend: + 
service: + name: "sm-name-grafana" + port: + number: 3000 + path: / + pathType: Prefix +`, "\n"), + expectedErr: nil, + }, + } + for _, tc := range tt { + t.Run(tc.name, func(t *testing.T) { + _, objString, err := makeGrafanaIngress(tc.sm) + if !reflect.DeepEqual(err, tc.expectedErr) { + t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString) + } + + if objString != tc.expectedString { + t.Errorf("expected and got strings differ:\n%s", cmp.Diff( + strings.Split(tc.expectedString, "\n"), + strings.Split(objString, "\n"), + )) + } + }) + } +} diff --git a/pkg/controller/scylladbmonitoring/sync_prometheus.go b/pkg/controller/scylladbmonitoring/sync_prometheus.go new file mode 100644 index 00000000000..6a6f2921da4 --- /dev/null +++ b/pkg/controller/scylladbmonitoring/sync_prometheus.go @@ -0,0 +1,543 @@ +package scylladbmonitoring + +import ( + "context" + "crypto/x509/pkix" + "fmt" + "time" + + prometheusv1assets "github.com/scylladb/scylla-operator/assets/monitoring/prometheus/v1" + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + "github.com/scylladb/scylla-operator/pkg/controllerhelpers" + ocrypto "github.com/scylladb/scylla-operator/pkg/crypto" + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + "github.com/scylladb/scylla-operator/pkg/helpers" + okubecrypto "github.com/scylladb/scylla-operator/pkg/kubecrypto" + "github.com/scylladb/scylla-operator/pkg/naming" + "github.com/scylladb/scylla-operator/pkg/resource" + "github.com/scylladb/scylla-operator/pkg/resourceapply" + "github.com/scylladb/scylla-operator/pkg/resourcemerge" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + kutilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/utils/pointer" +) + +func getPrometheusLabels(sm 
*scyllav1alpha1.ScyllaDBMonitoring) labels.Set { + return helpers.MergeMaps( + getLabels(sm), + labels.Set{ + naming.ControllerNameLabel: "prometheus", + }, + ) +} + +func getPrometheusSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector { + return labels.SelectorFromSet(getPrometheusLabels(sm)) +} + +func getPrometheusSpec(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.PrometheusSpec { + if sm.Spec.Components != nil { + return sm.Spec.Components.Prometheus + } + + return nil +} + +func getPrometheusIngressOptions(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.IngressOptions { + spec := getPrometheusSpec(sm) + if spec != nil && + spec.ExposeOptions != nil && + spec.ExposeOptions.WebInterface != nil { + return spec.ExposeOptions.WebInterface.Ingress + } + + return nil +} + +func getPrometheusIngressDomains(sm *scyllav1alpha1.ScyllaDBMonitoring) []string { + ingressOptions := getPrometheusIngressOptions(sm) + if ingressOptions != nil { + return ingressOptions.DNSDomains + } + + return nil +} + +func makePrometheusSA(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ServiceAccount, string, error) { + return prometheusv1assets.PrometheusSATemplate.RenderObject(map[string]any{ + "namespace": sm.Namespace, + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makePrometheusRoleBinding(sm *scyllav1alpha1.ScyllaDBMonitoring) (*rbacv1.RoleBinding, string, error) { + return prometheusv1assets.PrometheusRoleBindingTemplate.RenderObject(map[string]any{ + "namespace": sm.Namespace, + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makePrometheusService(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.Service, string, error) { + return prometheusv1assets.PrometheusServiceTemplate.RenderObject(map[string]any{ + "namespace": sm.Namespace, + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeScyllaDBServiceMonitor(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.ServiceMonitor, string, error) { + return 
prometheusv1assets.ScyllaDBServiceMonitorTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + "endpointsSelector": sm.Spec.EndpointsSelector, + }) +} + +func makeRecodingPrometheusRule(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.PrometheusRule, string, error) { + return prometheusv1assets.RecordingPrometheusRuleTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makeAlertsPrometheusRule(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.PrometheusRule, string, error) { + return prometheusv1assets.AlertsPrometheusRuleTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + }) +} + +func makePrometheus(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.Prometheus, string, error) { + spec := getPrometheusSpec(sm) + + var volumeClaimTemplate *monitoringv1.EmbeddedPersistentVolumeClaim + if spec != nil && spec.Storage != nil { + volumeClaimTemplate = &monitoringv1.EmbeddedPersistentVolumeClaim{ + EmbeddedObjectMetadata: monitoringv1.EmbeddedObjectMetadata{ + Name: fmt.Sprintf("%s-prometheus", sm.Name), + Labels: spec.Storage.VolumeClaimTemplate.Labels, + Annotations: spec.Storage.VolumeClaimTemplate.Annotations, + }, + Spec: spec.Storage.VolumeClaimTemplate.Spec, + } + + } + + affinity := corev1.Affinity{} + var tolerations []corev1.Toleration + if spec != nil && spec.Placement != nil { + affinity.NodeAffinity = spec.Placement.NodeAffinity + affinity.PodAffinity = spec.Placement.PodAffinity + affinity.PodAntiAffinity = spec.Placement.PodAntiAffinity + + tolerations = spec.Placement.Tolerations + } + + var resources corev1.ResourceRequirements + if spec != nil { + resources = spec.Resources + } + + return prometheusv1assets.PrometheusTemplate.RenderObject(map[string]any{ + "namespace": sm.Namespace, + "scyllaDBMonitoringName": sm.Name, + "volumeClaimTemplate": volumeClaimTemplate, + "affinity": affinity, + "tolerations": tolerations, + "resources": resources, + }) +} + 
+func makePrometheusIngress(sm *scyllav1alpha1.ScyllaDBMonitoring) (*networkingv1.Ingress, string, error) { + ingressOptions := getPrometheusIngressOptions(sm) + if ingressOptions == nil { + return nil, "", nil + } + + if ingressOptions.Disabled != nil && *ingressOptions.Disabled == true { + return nil, "", nil + } + + if len(ingressOptions.DNSDomains) == 0 { + return nil, "", nil + } + + return prometheusv1assets.PrometheusIngressTemplate.RenderObject(map[string]any{ + "scyllaDBMonitoringName": sm.Name, + "dnsDomains": ingressOptions.DNSDomains, + "ingressAnnotations": ingressOptions.Annotations, + "ingressClassName": ingressOptions.IngressClassName, + }) +} + +func (smc *Controller) syncPrometheus( + ctx context.Context, + sm *scyllav1alpha1.ScyllaDBMonitoring, + configMaps map[string]*corev1.ConfigMap, + secrets map[string]*corev1.Secret, + services map[string]*corev1.Service, + serviceAccounts map[string]*corev1.ServiceAccount, + roleBindings map[string]*rbacv1.RoleBinding, + ingresses map[string]*networkingv1.Ingress, + prometheuses map[string]*monitoringv1.Prometheus, + prometheusRules map[string]*monitoringv1.PrometheusRule, + serviceMonitors map[string]*monitoringv1.ServiceMonitor, +) ([]metav1.Condition, error) { + var progressingConditions []metav1.Condition + + prometheusServingCertChainConfig := &okubecrypto.CertChainConfig{ + CAConfig: &okubecrypto.CAConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-serving-ca", sm.Name), + Labels: getPrometheusLabels(sm), + }, + Validity: 10 * 365 * 24 * time.Hour, + Refresh: 8 * 365 * 24 * time.Hour, + }, + CABundleConfig: &okubecrypto.CABundleConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-serving-ca", sm.Name), + Labels: getPrometheusLabels(sm), + }, + }, + CertConfigs: []*okubecrypto.CertificateConfig{ + { + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-serving-certs", sm.Name), + Labels: getPrometheusLabels(sm), + }, + 
Validity: 30 * 24 * time.Hour, + Refresh: 20 * 24 * time.Hour, + CertCreator: (&ocrypto.ServingCertCreatorConfig{ + Subject: pkix.Name{ + CommonName: "", + }, + IPAddresses: nil, + DNSNames: append( + []string{ + fmt.Sprintf("%s-prometheus", sm.Name), + fmt.Sprintf("%s-prometheus.%s.svc", sm.Name, sm.Namespace), + }, + getPrometheusIngressDomains(sm)..., + ), + }).ToCreator(), + }, + }, + } + + prometheusClientCertChainConfig := &okubecrypto.CertChainConfig{ + CAConfig: &okubecrypto.CAConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-client-ca", sm.Name), + Labels: getPrometheusLabels(sm), + }, + Validity: 10 * 365 * 24 * time.Hour, + Refresh: 8 * 365 * 24 * time.Hour, + }, + CABundleConfig: &okubecrypto.CABundleConfig{ + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-client-ca", sm.Name), + Labels: getPrometheusLabels(sm), + }, + }, + CertConfigs: []*okubecrypto.CertificateConfig{ + { + MetaConfig: okubecrypto.MetaConfig{ + Name: fmt.Sprintf("%s-prometheus-client-grafana", sm.Name), + Labels: getPrometheusLabels(sm), + }, + Validity: 10 * 365 * 24 * time.Hour, + Refresh: 8 * 365 * 24 * time.Hour, + CertCreator: (&ocrypto.ClientCertCreatorConfig{ + Subject: pkix.Name{ + CommonName: "", + }, + DNSNames: []string{"grafana"}, + }).ToCreator(), + }, + }, + } + + certChainConfigs := okubecrypto.CertChainConfigs{ + prometheusServingCertChainConfig, + prometheusClientCertChainConfig, + } + + // Render manifests. 
+ var renderErrors []error + + requiredPrometheusSA, _, err := makePrometheusSA(sm) + renderErrors = append(renderErrors, err) + + requiredPrometheusRoleBinding, _, err := makePrometheusRoleBinding(sm) + renderErrors = append(renderErrors, err) + + requiredPrometheusService, _, err := makePrometheusService(sm) + renderErrors = append(renderErrors, err) + + requiredIngress, _, err := makePrometheusIngress(sm) + renderErrors = append(renderErrors, err) + + requiredPrometheus, _, err := makePrometheus(sm) + renderErrors = append(renderErrors, err) + + requiredRecodingPrometheusRule, _, err := makeRecodingPrometheusRule(sm) + renderErrors = append(renderErrors, err) + + requiredAlertsPrometheusRule, _, err := makeAlertsPrometheusRule(sm) + renderErrors = append(renderErrors, err) + + requiredScyllaDBServiceMonitor, _, err := makeScyllaDBServiceMonitor(sm) + renderErrors = append(renderErrors, err) + + renderError := kutilerrors.NewAggregate(renderErrors) + if renderError != nil { + return progressingConditions, renderError + } + + // Prune objects. 
+ var pruneErrors []error + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredPrometheusSA), + serviceAccounts, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredPrometheusService), + services, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredPrometheusRoleBinding), + roleBindings, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredPrometheus), + prometheuses, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.FilterOutNil(helpers.ToArray(requiredIngress)), + ingresses, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredRecodingPrometheusRule, requiredAlertsPrometheusRule), + prometheusRules, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + helpers.ToArray(requiredScyllaDBServiceMonitor), + serviceMonitors, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: 
smc.monitoringClient.ServiceMonitors(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + certChainConfigs.GetMetaSecrets(), + secrets, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + err = controllerhelpers.Prune( + ctx, + certChainConfigs.GetMetaConfigMaps(), + configMaps, + &controllerhelpers.PruneControlFuncs{ + DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete, + }, + smc.eventRecorder, + ) + pruneErrors = append(pruneErrors, err) + + pruneError := kutilerrors.NewAggregate(pruneErrors) + if pruneError != nil { + return progressingConditions, pruneError + } + + // Apply required objects. + var applyErrors []error + applyConfigurations := []resourceapply.ApplyConfigUntyped{ + resourceapply.ApplyConfig[*corev1.ServiceAccount]{ + Required: requiredPrometheusSA, + Control: resourceapply.ApplyControlFuncs[*corev1.ServiceAccount]{ + GetCachedFunc: smc.serviceAccountLister.ServiceAccounts(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*corev1.Service]{ + Required: requiredPrometheusService, + Control: resourceapply.ApplyControlFuncs[*corev1.Service]{ + GetCachedFunc: smc.serviceLister.Services(sm.Namespace).Get, + CreateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Update, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*rbacv1.RoleBinding]{ + Required: requiredPrometheusRoleBinding, + Control: resourceapply.ApplyControlFuncs[*rbacv1.RoleBinding]{ + GetCachedFunc: 
smc.roleBindingLister.RoleBindings(sm.Namespace).Get, + CreateFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*monitoringv1.Prometheus]{ + Required: requiredPrometheus, + Control: resourceapply.ApplyControlFuncs[*monitoringv1.Prometheus]{ + GetCachedFunc: smc.prometheusLister.Prometheuses(sm.Namespace).Get, + CreateFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Create, + UpdateFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Update, + DeleteFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*monitoringv1.ServiceMonitor]{ + Required: requiredScyllaDBServiceMonitor, + Control: resourceapply.ApplyControlFuncs[*monitoringv1.ServiceMonitor]{ + GetCachedFunc: smc.serviceMonitorLister.ServiceMonitors(sm.Namespace).Get, + CreateFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Create, + UpdateFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Update, + DeleteFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*monitoringv1.PrometheusRule]{ + Required: requiredRecodingPrometheusRule, + Control: resourceapply.ApplyControlFuncs[*monitoringv1.PrometheusRule]{ + GetCachedFunc: smc.prometheusRuleLister.PrometheusRules(sm.Namespace).Get, + CreateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Create, + UpdateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Update, + DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete, + }, + }.ToUntyped(), + resourceapply.ApplyConfig[*monitoringv1.PrometheusRule]{ + Required: requiredAlertsPrometheusRule, + Control: resourceapply.ApplyControlFuncs[*monitoringv1.PrometheusRule]{ + GetCachedFunc: smc.prometheusRuleLister.PrometheusRules(sm.Namespace).Get, + 
CreateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Create, + UpdateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Update, + DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete, + }, + }.ToUntyped(), + } + + if requiredIngress != nil { + applyConfigurations = append(applyConfigurations, resourceapply.ApplyConfig[*networkingv1.Ingress]{ + Required: requiredIngress, + Control: resourceapply.ApplyControlFuncs[*networkingv1.Ingress]{ + GetCachedFunc: smc.ingressLister.Ingresses(sm.Namespace).Get, + CreateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Create, + UpdateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Update, + DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete, + }, + }.ToUntyped()) + } + + for _, cfg := range applyConfigurations { + // Enforce namespace. + cfg.Required.SetNamespace(sm.Namespace) + + // Enforce labels for selection. + if cfg.Required.GetLabels() == nil { + cfg.Required.SetLabels(getPrometheusLabels(sm)) + } else { + resourcemerge.MergeMapInPlaceWithoutRemovalKeys(cfg.Required.GetLabels(), getPrometheusLabels(sm)) + } + + // Set ControllerRef. + cfg.Required.SetOwnerReferences([]metav1.OwnerReference{ + { + APIVersion: scylladbMonitoringControllerGVK.GroupVersion().String(), + Kind: scylladbMonitoringControllerGVK.Kind, + Name: sm.Name, + UID: sm.UID, + Controller: pointer.Bool(true), + BlockOwnerDeletion: pointer.Bool(true), + }, + }) + + // Apply required object. 
+ _, changed, err := resourceapply.ApplyFromConfig(ctx, cfg, smc.eventRecorder) + if changed { + controllerhelpers.AddGenericProgressingStatusCondition(&progressingConditions, prometheusControllerProgressingCondition, cfg.Required, "apply", sm.Generation) + } + if err != nil { + gvk := resource.GetObjectGVKOrUnknown(cfg.Required) + applyErrors = append(applyErrors, fmt.Errorf("can't apply %s: %w", gvk, err)) + } + } + + cm := okubecrypto.NewCertificateManager( + smc.kubeClient.CoreV1(), + smc.secretLister, + smc.kubeClient.CoreV1(), + smc.configMapLister, + smc.eventRecorder, + ) + for _, ccc := range certChainConfigs { + applyErrors = append(applyErrors, cm.ManageCertificateChain( + ctx, + time.Now, + &sm.ObjectMeta, + scylladbMonitoringControllerGVK, + ccc, + secrets, + configMaps, + )) + } + + applyError := kutilerrors.NewAggregate(applyErrors) + if applyError != nil { + return progressingConditions, applyError + } + + return progressingConditions, nil +} diff --git a/pkg/controller/scylladbmonitoring/sync_prometheus_test.go b/pkg/controller/scylladbmonitoring/sync_prometheus_test.go new file mode 100644 index 00000000000..a921d1dfe9d --- /dev/null +++ b/pkg/controller/scylladbmonitoring/sync_prometheus_test.go @@ -0,0 +1,395 @@ +package scylladbmonitoring + +import ( + "reflect" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +func Test_makeScyllaDBServiceMonitor(t *testing.T) { + tt := []struct { + name string + sm *scyllav1alpha1.ScyllaDBMonitoring + expectedString string + expectedErr error + }{ + { + name: "empty selector", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor 
+metadata: + name: "sm-name-scylladb" +spec: + selector: + {} + jobLabel: scylla/cluster + endpoints: + - port: node-exporter + honorLabels: false + relabelings: + - sourceLabels: [__address__] + regex: '(.*):\d+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__address__] + regex: '([^:]+)' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [instance] + regex: '(.*)' + targetLabel: __address__ + replacement: '${1}:9100' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' + - port: prometheus + honorLabels: false + metricRelabelings: + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CPU + replacement: 'cpu' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CQL + replacement: 'cql' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: OS + replacement: 'os' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: IO + replacement: 'io' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: Errors + replacement: 'errors' + - regex: 'help|exported_instance' + action: labeldrop + - sourceLabels: [version] + regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*' + replacement: '$1$2' + targetLabel: svr + relabelings: + - sourceLabels: [__address__] + regex: '(.*):.+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' +`, "\n"), + expectedErr: nil, + }, + { + name: "specific selector", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{ + EndpointsSelector: metav1.LabelSelector{ + MatchLabels: 
map[string]string{ + "foo": "bar", + }, + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "alpha", + Operator: metav1.LabelSelectorOpExists, + Values: []string{"beta"}, + }, + }, + }, + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: "sm-name-scylladb" +spec: + selector: + matchExpressions: + - key: alpha + operator: Exists + values: + - beta + matchLabels: + foo: bar + jobLabel: scylla/cluster + endpoints: + - port: node-exporter + honorLabels: false + relabelings: + - sourceLabels: [__address__] + regex: '(.*):\d+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__address__] + regex: '([^:]+)' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [instance] + regex: '(.*)' + targetLabel: __address__ + replacement: '${1}:9100' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' + - port: prometheus + honorLabels: false + metricRelabelings: + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CPU + replacement: 'cpu' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: CQL + replacement: 'cql' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: OS + replacement: 'os' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: IO + replacement: 'io' + - sourceLabels: [version] + regex: '(.+)' + targetLabel: Errors + replacement: 'errors' + - regex: 'help|exported_instance' + action: labeldrop + - sourceLabels: [version] + regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*' + replacement: '$1$2' + targetLabel: svr + relabelings: + - sourceLabels: [__address__] + regex: '(.*):.+' + targetLabel: instance + replacement: '${1}' + - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster] + regex: '(.+)' + targetLabel: cluster + replacement: '${1}' + 
- sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter] + regex: '(.+)' + targetLabel: dc + replacement: '${1}' +`, "\n"), + expectedErr: nil, + }, + } + for _, tc := range tt { + t.Run(tc.name, func(t *testing.T) { + _, objString, err := makeScyllaDBServiceMonitor(tc.sm) + if !reflect.DeepEqual(err, tc.expectedErr) { + t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString) + } + + if objString != tc.expectedString { + t.Errorf("expected and got strings differ:\n%s", cmp.Diff( + strings.Split(tc.expectedString, "\n"), + strings.Split(objString, "\n"), + )) + } + }) + } +} + +func Test_makePrometheus(t *testing.T) { + tt := []struct { + name string + sm *scyllav1alpha1.ScyllaDBMonitoring + expectedString string + expectedErr error + }{ + { + name: "no storage", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: "sm-name" +spec: + serviceAccountName: "sm-name-prometheus" + securityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + web: + pageTitle: "ScyllaDB Prometheus" + tlsConfig: + cert: + secret: + name: "sm-name-prometheus-serving-certs" + key: "tls.crt" + keySecret: + name: "sm-name-prometheus-serving-certs" + key: "tls.key" +# clientAuthType: "RequireAndVerifyClientCert" +# TODO: we need the prometheus-operator not to require certs only for /-/readyz or to do exec probes that can read certs + clientAuthType: "RequestClientCert" + client_ca: + configMap: + name: "sm-name-prometheus-client-ca" + key: "ca-bundle.crt" + httpConfig: + http2: true + serviceMonitorSelector: + matchLabels: {} + affinity: + {} + tolerations: + null + resources: + {} + alerting: + alertmanagers: + - namespace: "" + name: "sm-name" + port: web + ruleSelector: + matchLabels: + scylla-operator.scylladb.com/scylladbmonitoring-name: "sm-name" +`, 
"\n"), + expectedErr: nil, + }, + { + name: "with prometheus pvc template", + sm: &scyllav1alpha1.ScyllaDBMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sm-name", + }, + Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{ + Components: &scyllav1alpha1.Components{ + Prometheus: &scyllav1alpha1.PrometheusSpec{ + Storage: &scyllav1alpha1.Storage{ + VolumeClaimTemplate: corev1.PersistentVolumeClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{}, + Spec: corev1.PersistentVolumeClaimSpec{ + StorageClassName: pointer.String("pv-class"), + Resources: corev1.ResourceRequirements{ + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceStorage: resource.MustParse("5Gi"), + }, + }, + }, + }, + }, + }, + }, + }, + }, + expectedString: strings.TrimLeft(` +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: "sm-name" +spec: + serviceAccountName: "sm-name-prometheus" + securityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + web: + pageTitle: "ScyllaDB Prometheus" + tlsConfig: + cert: + secret: + name: "sm-name-prometheus-serving-certs" + key: "tls.crt" + keySecret: + name: "sm-name-prometheus-serving-certs" + key: "tls.key" +# clientAuthType: "RequireAndVerifyClientCert" +# TODO: we need the prometheus-operator not to require certs only for /-/readyz or to do exec probes that can read certs + clientAuthType: "RequestClientCert" + client_ca: + configMap: + name: "sm-name-prometheus-client-ca" + key: "ca-bundle.crt" + httpConfig: + http2: true + serviceMonitorSelector: + matchLabels: {} + affinity: + {} + tolerations: + null + resources: + {} + alerting: + alertmanagers: + - namespace: "" + name: "sm-name" + port: web + ruleSelector: + matchLabels: + scylla-operator.scylladb.com/scylladbmonitoring-name: "sm-name" + storage: + volumeClaimTemplate: + metadata: + name: sm-name-prometheus + spec: + resources: + requests: + storage: 5Gi + storageClassName: pv-class + status: {} +`, "\n"), + expectedErr: nil, + }, + } + for _, tc := 
range tt { + t.Run(tc.name, func(t *testing.T) { + _, objString, err := makePrometheus(tc.sm) + if !reflect.DeepEqual(err, tc.expectedErr) { + t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString) + } + + if objString != tc.expectedString { + t.Errorf("expected and got strings differ:\n%s", cmp.Diff( + strings.Split(tc.expectedString, "\n"), + strings.Split(objString, "\n"), + )) + } + }) + } +} diff --git a/pkg/controllerhelpers/prune.go b/pkg/controllerhelpers/prune.go new file mode 100644 index 00000000000..8b1de3e041e --- /dev/null +++ b/pkg/controllerhelpers/prune.go @@ -0,0 +1,65 @@ +package controllerhelpers + +import ( + "context" + + "github.com/scylladb/scylla-operator/pkg/kubeinterfaces" + "github.com/scylladb/scylla-operator/pkg/resource" + "github.com/scylladb/scylla-operator/pkg/resourceapply" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" +) + +type PruneControlInterface interface { + Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error +} + +type PruneControlFuncs struct { + DeleteFunc func(ctx context.Context, name string, opts metav1.DeleteOptions) error +} + +func (pcf *PruneControlFuncs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + return pcf.DeleteFunc(ctx, name, opts) +} + +var _ PruneControlInterface = &PruneControlFuncs{} + +func Prune[T kubeinterfaces.ObjectInterface](ctx context.Context, requiredObjects []T, existingObjects map[string]T, control PruneControlInterface, eventRecorder record.EventRecorder) error { + var errs []error + + for _, existing := range existingObjects { + if existing.GetDeletionTimestamp() != nil { + continue + } + + isRequired := false + for _, required := range requiredObjects { + if existing.GetName() == required.GetName() { + isRequired = true + break + } + } + if isRequired { + continue + } + + uid 
:= existing.GetUID() + propagationPolicy := metav1.DeletePropagationBackground + klog.V(2).InfoS("Pruning resource", "GVK", resource.GetObjectGVKOrUnknown(existing), "Ref", klog.KObj(existing)) + err := control.Delete(ctx, existing.GetName(), metav1.DeleteOptions{ + Preconditions: &metav1.Preconditions{ + UID: &uid, + }, + PropagationPolicy: &propagationPolicy, + }) + resourceapply.ReportDeleteEvent(eventRecorder, existing, err) + if err != nil { + errs = append(errs, err) + continue + } + } + + return utilerrors.NewAggregate(errs) +} diff --git a/pkg/controllerhelpers/selectors.go b/pkg/controllerhelpers/selectors.go new file mode 100644 index 00000000000..e32e72ff047 --- /dev/null +++ b/pkg/controllerhelpers/selectors.go @@ -0,0 +1,18 @@ +package controllerhelpers + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" +) + +func FilterObjectMapByLabel[T metav1.Object](objects map[string]T, selector labels.Selector) map[string]T { + res := map[string]T{} + + for name, obj := range objects { + if selector.Matches(labels.Set(obj.GetLabels())) { + res[name] = obj + } + } + + return res +} diff --git a/pkg/helpers/array.go b/pkg/helpers/array.go new file mode 100644 index 00000000000..749070e821c --- /dev/null +++ b/pkg/helpers/array.go @@ -0,0 +1,40 @@ +package helpers + +func ToArray[T any](objs ...T) []T { + res := make([]T, 0, len(objs)) + return append(res, objs...) 
+} + +func ConvertToArray[To, From any](convert func(From) To, objs ...From) []To { + res := make([]To, 0, len(objs)) + + for i := range objs { + res = append(res, convert(objs[i])) + } + + return res +} + +func Filter[T any](array []T, filterFunc func(T) bool) []T { + res := make([]T, 0, len(array)) + + for i := range array { + if filterFunc(array[i]) { + res = append(res, array[i]) + } + } + + return res +} + +func FilterOut[T any](array []T, filterOutFunc func(T) bool) []T { + return Filter(array, func(t T) bool { + return !filterOutFunc(t) + }) +} + +func FilterOutNil[T any](array []*T) []*T { + return FilterOut[*T](array, func(item *T) bool { + return item == nil + }) +} diff --git a/pkg/kubecrypto/certmanager.go b/pkg/kubecrypto/certmanager.go index 86c21f8492c..e49772a281f 100644 --- a/pkg/kubecrypto/certmanager.go +++ b/pkg/kubecrypto/certmanager.go @@ -24,16 +24,36 @@ type MetaConfig struct { Annotations map[string]string } +func (c *MetaConfig) GetObjectMeta() *metav1.ObjectMeta { + return (&metav1.ObjectMeta{ + Name: c.Name, + Labels: c.Labels, + Annotations: c.Annotations, + }).DeepCopy() +} + type CAConfig struct { MetaConfig Validity time.Duration Refresh time.Duration } +func (c *CAConfig) GetMetaSecret() *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: *c.GetObjectMeta(), + } +} + type CABundleConfig struct { MetaConfig } +func (c *CABundleConfig) GetMetaConfigMap() *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: *c.GetObjectMeta(), + } +} + type CertificateConfig struct { MetaConfig Validity time.Duration @@ -41,6 +61,57 @@ type CertificateConfig struct { CertCreator ocrypto.CertCreator } +func (c *CertificateConfig) GetMetaSecret() *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: *c.GetObjectMeta(), + } +} + +type CertChainConfig struct { + CAConfig *CAConfig + CABundleConfig *CABundleConfig + CertConfigs []*CertificateConfig +} + +func (c *CertChainConfig) GetMetaSecrets() []*corev1.Secret { + secrets := 
make([]*corev1.Secret, 0, len(c.CertConfigs)+1) + secrets = append(secrets, c.CAConfig.GetMetaSecret()) + + for _, cc := range c.CertConfigs { + secrets = append(secrets, cc.GetMetaSecret()) + } + + return secrets +} + +func (c *CertChainConfig) GetMetaConfigMaps() []*corev1.ConfigMap { + return []*corev1.ConfigMap{ + c.CABundleConfig.GetMetaConfigMap(), + } +} + +type CertChainConfigs []*CertChainConfig + +func (configs CertChainConfigs) GetMetaSecrets() []*corev1.Secret { + secrets := make([]*corev1.Secret, 0, len(configs)*2) + + for _, c := range configs { + secrets = append(secrets, c.GetMetaSecrets()...) + } + + return secrets +} + +func (configs CertChainConfigs) GetMetaConfigMaps() []*corev1.ConfigMap { + configMaps := make([]*corev1.ConfigMap, 0, len(configs)*2) + + for _, c := range configs { + configMaps = append(configMaps, c.GetMetaConfigMaps()...) + } + + return configMaps +} + type CertificateManager struct { secretsClient corev1client.SecretsGetter secretLister corev1listers.SecretLister @@ -121,3 +192,7 @@ func (cm *CertificateManager) ManageCertificates(ctx context.Context, nowFunc fu return nil } + +func (cm *CertificateManager) ManageCertificateChain(ctx context.Context, nowFunc func() time.Time, controller *metav1.ObjectMeta, controllerGVK schema.GroupVersionKind, certChainConfig *CertChainConfig, existingSecrets map[string]*corev1.Secret, existingConfigMaps map[string]*corev1.ConfigMap) error { + return cm.ManageCertificates(ctx, nowFunc, controller, controllerGVK, certChainConfig.CAConfig, certChainConfig.CABundleConfig, certChainConfig.CertConfigs, existingSecrets, existingConfigMaps) +} diff --git a/pkg/naming/constants.go b/pkg/naming/constants.go index b7e04352aba..3c6a216e5af 100644 --- a/pkg/naming/constants.go +++ b/pkg/naming/constants.go @@ -62,6 +62,8 @@ const ( NodeConfigNameLabel = "scylla-operator.scylladb.com/node-config-name" ConfigMapTypeLabel = "scylla-operator.scylladb.com/config-map-type" OwnerUIDLabel = 
"scylla-operator.scylladb.com/owner-uid" + ScyllaDBMonitoringNameLabel = "scylla-operator.scylladb.com/scylladbmonitoring-name" + ControllerNameLabel = "scylla-operator.scylladb.com/controller-name" AppName = "scylla" OperatorAppName = "scylla-operator" diff --git a/pkg/resourceapply/apps.go b/pkg/resourceapply/apps.go index 430ac03748a..c7f0f553999 100644 --- a/pkg/resourceapply/apps.go +++ b/pkg/resourceapply/apps.go @@ -72,3 +72,35 @@ func ApplyDaemonSet( options, ) } + +func ApplyDeploymentWithControl( + ctx context.Context, + control ApplyControlInterface[*appsv1.Deployment], + recorder record.EventRecorder, + required *appsv1.Deployment, + options ApplyOptions, +) (*appsv1.Deployment, bool, error) { + return ApplyGeneric[*appsv1.Deployment](ctx, control, recorder, required, options) +} + +func ApplyDeployment( + ctx context.Context, + client appsv1client.DeploymentsGetter, + lister appsv1listers.DeploymentLister, + recorder record.EventRecorder, + required *appsv1.Deployment, + options ApplyOptions, +) (*appsv1.Deployment, bool, error) { + return ApplyDeploymentWithControl( + ctx, + ApplyControlFuncs[*appsv1.Deployment]{ + GetCachedFunc: lister.Deployments(required.Namespace).Get, + CreateFunc: client.Deployments(required.Namespace).Create, + UpdateFunc: client.Deployments(required.Namespace).Update, + DeleteFunc: client.Deployments(required.Namespace).Delete, + }, + recorder, + required, + options, + ) +} diff --git a/pkg/resourceapply/generic.go b/pkg/resourceapply/generic.go new file mode 100644 index 00000000000..74e5161cf70 --- /dev/null +++ b/pkg/resourceapply/generic.go @@ -0,0 +1,152 @@ +package resourceapply + +import ( + "context" + "fmt" + + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + "github.com/scylladb/scylla-operator/pkg/kubeinterfaces" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" +) + +type ApplyConfigUntyped struct { + Required kubeinterfaces.ObjectInterface + Options ApplyOptions + Control ApplyControlUntypedInterface +} + +type ApplyConfig[T kubeinterfaces.ObjectInterface] struct { + Required T + Options ApplyOptions + Control ApplyControlFuncs[T] +} + +func (ac ApplyConfig[T]) ToUntyped() ApplyConfigUntyped { + return ApplyConfigUntyped{ + Required: ac.Required, + Options: ac.Options, + Control: ac.Control.ToUntyped(), + } +} + +func ApplyFromConfig( + ctx context.Context, + cfg ApplyConfigUntyped, + recorder record.EventRecorder, +) (kubeinterfaces.ObjectInterface, bool, error) { + return Apply( + ctx, + cfg.Required, + cfg.Control, + cfg.Options, + recorder, + ) +} + +func Apply( + ctx context.Context, + required kubeinterfaces.ObjectInterface, + control ApplyControlUntypedInterface, + options ApplyOptions, + recorder record.EventRecorder, +) (kubeinterfaces.ObjectInterface, bool, error) { + switch metav1.Object(required).(type) { + case *corev1.Service: + return ApplyServiceWithControl( + ctx, + TypeApplyControlInterface[*corev1.Service](control), + recorder, + required.(*corev1.Service), + options, + ) + + case *corev1.ConfigMap: + return ApplyConfigMapWithControl( + ctx, + TypeApplyControlInterface[*corev1.ConfigMap](control), + recorder, + required.(*corev1.ConfigMap), + options, + ) + + case *corev1.Secret: + return ApplySecretWithControl( + ctx, + TypeApplyControlInterface[*corev1.Secret](control), + recorder, + required.(*corev1.Secret), + options, + ) + + case *corev1.ServiceAccount: + return ApplyServiceAccountWithControl( + ctx, + TypeApplyControlInterface[*corev1.ServiceAccount](control), + recorder, + required.(*corev1.ServiceAccount), + options, + ) + + case *rbacv1.RoleBinding: + return ApplyRoleBindingWithControl( + ctx, + TypeApplyControlInterface[*rbacv1.RoleBinding](control), + recorder, + required.(*rbacv1.RoleBinding), + options, + ) + + case 
*appsv1.Deployment: + return ApplyDeploymentWithControl( + ctx, + TypeApplyControlInterface[*appsv1.Deployment](control), + recorder, + required.(*appsv1.Deployment), + options, + ) + + case *networkingv1.Ingress: + return ApplyIngressWithControl( + ctx, + TypeApplyControlInterface[*networkingv1.Ingress](control), + recorder, + required.(*networkingv1.Ingress), + options, + ) + + case *monitoringv1.Prometheus: + return ApplyPrometheusWithControl( + ctx, + TypeApplyControlInterface[*monitoringv1.Prometheus](control), + recorder, + required.(*monitoringv1.Prometheus), + options, + ) + + case *monitoringv1.PrometheusRule: + return ApplyPrometheusRuleWithControl( + ctx, + TypeApplyControlInterface[*monitoringv1.PrometheusRule](control), + recorder, + required.(*monitoringv1.PrometheusRule), + options, + ) + + case *monitoringv1.ServiceMonitor: + return ApplyServiceMonitorWithControl( + ctx, + TypeApplyControlInterface[*monitoringv1.ServiceMonitor](control), + recorder, + required.(*monitoringv1.ServiceMonitor), + options, + ) + + default: + return nil, false, fmt.Errorf("no apply method matched for type %T", required) + } +} diff --git a/pkg/resourceapply/helpers.go b/pkg/resourceapply/helpers.go index ddbe762eb0f..93e513cf255 100644 --- a/pkg/resourceapply/helpers.go +++ b/pkg/resourceapply/helpers.go @@ -203,6 +203,35 @@ func (acf ApplyControlFuncs[T]) ToUntyped() ApplyControlUntypedFuncs { var _ ApplyControlInterface[*corev1.Service] = ApplyControlFuncs[*corev1.Service]{} +func TypeApplyControlInterface[T kubeinterfaces.ObjectInterface](untyped ApplyControlUntypedInterface) ApplyControlInterface[T] { + return ApplyControlFuncs[T]{ + GetCachedFunc: func(name string) (T, error) { + res, err := untyped.GetCached(name) + if res == nil { + return *new(T), err + } + return res.(T), err + }, + CreateFunc: func(ctx context.Context, obj T, opts metav1.CreateOptions) (T, error) { + res, err := untyped.Create(ctx, obj, opts) + if res == nil { + return *new(T), err + } + return 
res.(T), err + }, + UpdateFunc: func(ctx context.Context, obj T, opts metav1.UpdateOptions) (T, error) { + res, err := untyped.Update(ctx, obj, opts) + if res == nil { + return *new(T), err + } + return res.(T), err + }, + DeleteFunc: func(ctx context.Context, name string, opts metav1.DeleteOptions) error { + return untyped.Delete(ctx, name, opts) + }, + } +} + type ApplyOptions struct { ForceOwnership bool AllowMissingControllerRef bool diff --git a/pkg/resourceapply/monitoring.go b/pkg/resourceapply/monitoring.go new file mode 100644 index 00000000000..c1547de2659 --- /dev/null +++ b/pkg/resourceapply/monitoring.go @@ -0,0 +1,106 @@ +package resourceapply + +import ( + "context" + + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" + monitoringv1client "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned/typed/monitoring/v1" + monitoringv1listers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/listers/monitoring/v1" + "k8s.io/client-go/tools/record" +) + +func ApplyPrometheusWithControl( + ctx context.Context, + control ApplyControlInterface[*monitoringv1.Prometheus], + recorder record.EventRecorder, + required *monitoringv1.Prometheus, + options ApplyOptions, +) (*monitoringv1.Prometheus, bool, error) { + return ApplyGeneric[*monitoringv1.Prometheus](ctx, control, recorder, required, options) +} + +func ApplyPrometheus( + ctx context.Context, + client monitoringv1client.PrometheusesGetter, + lister monitoringv1listers.PrometheusLister, + recorder record.EventRecorder, + required *monitoringv1.Prometheus, + options ApplyOptions, +) (*monitoringv1.Prometheus, bool, error) { + return ApplyPrometheusWithControl( + ctx, + ApplyControlFuncs[*monitoringv1.Prometheus]{ + GetCachedFunc: lister.Prometheuses(required.Namespace).Get, + CreateFunc: client.Prometheuses(required.Namespace).Create, + UpdateFunc: client.Prometheuses(required.Namespace).Update, + DeleteFunc: 
client.Prometheuses(required.Namespace).Delete, + }, + recorder, + required, + options, + ) +} + +func ApplyPrometheusRuleWithControl( + ctx context.Context, + control ApplyControlInterface[*monitoringv1.PrometheusRule], + recorder record.EventRecorder, + required *monitoringv1.PrometheusRule, + options ApplyOptions, +) (*monitoringv1.PrometheusRule, bool, error) { + return ApplyGeneric[*monitoringv1.PrometheusRule](ctx, control, recorder, required, options) +} + +func ApplyPrometheusRule( + ctx context.Context, + client monitoringv1client.PrometheusRulesGetter, + lister monitoringv1listers.PrometheusRuleLister, + recorder record.EventRecorder, + required *monitoringv1.PrometheusRule, + options ApplyOptions, +) (*monitoringv1.PrometheusRule, bool, error) { + return ApplyPrometheusRuleWithControl( + ctx, + ApplyControlFuncs[*monitoringv1.PrometheusRule]{ + GetCachedFunc: lister.PrometheusRules(required.Namespace).Get, + CreateFunc: client.PrometheusRules(required.Namespace).Create, + UpdateFunc: client.PrometheusRules(required.Namespace).Update, + DeleteFunc: client.PrometheusRules(required.Namespace).Delete, + }, + recorder, + required, + options, + ) +} + +func ApplyServiceMonitorWithControl( + ctx context.Context, + control ApplyControlInterface[*monitoringv1.ServiceMonitor], + recorder record.EventRecorder, + required *monitoringv1.ServiceMonitor, + options ApplyOptions, +) (*monitoringv1.ServiceMonitor, bool, error) { + return ApplyGeneric[*monitoringv1.ServiceMonitor](ctx, control, recorder, required, options) +} + +func ApplyServiceMonitor( + ctx context.Context, + client monitoringv1client.ServiceMonitorsGetter, + lister monitoringv1listers.ServiceMonitorLister, + recorder record.EventRecorder, + required *monitoringv1.ServiceMonitor, + options ApplyOptions, +) (*monitoringv1.ServiceMonitor, bool, error) { + return ApplyServiceMonitorWithControl( + ctx, + ApplyControlFuncs[*monitoringv1.ServiceMonitor]{ + GetCachedFunc: 
lister.ServiceMonitors(required.Namespace).Get, + CreateFunc: client.ServiceMonitors(required.Namespace).Create, + UpdateFunc: client.ServiceMonitors(required.Namespace).Update, + DeleteFunc: client.ServiceMonitors(required.Namespace).Delete, + }, + recorder, + required, + options, + ) +} diff --git a/pkg/scheme/scheme.go b/pkg/scheme/scheme.go index 1c28a0240fe..35cae5f000b 100644 --- a/pkg/scheme/scheme.go +++ b/pkg/scheme/scheme.go @@ -2,6 +2,8 @@ package scheme import ( scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1" + scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1" cqlclientv1alpha1 "github.com/scylladb/scylla-operator/pkg/scylla/api/cqlclient/v1alpha1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" @@ -29,5 +31,9 @@ func init() { utilruntime.Must(kscheme.AddToScheme(Scheme)) utilruntime.Must(scyllav1.Install(Scheme)) + utilruntime.Must(scyllav1alpha1.Install(Scheme)) + utilruntime.Must(cqlclientv1alpha1.Install(Scheme)) + + utilruntime.Must(monitoringv1.Install(Scheme)) } diff --git a/test/e2e/fixture/scylla/registry.go b/test/e2e/fixture/scylla/registry.go index 24de5e4a1ba..6b618e8db9e 100644 --- a/test/e2e/fixture/scylla/registry.go +++ b/test/e2e/fixture/scylla/registry.go @@ -6,15 +6,25 @@ import ( o "github.com/onsi/gomega" scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1" scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1" + "github.com/scylladb/scylla-operator/pkg/assets" "github.com/scylladb/scylla-operator/test/e2e/scheme" + "k8s.io/apimachinery/pkg/runtime" ) +func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] { + return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, scheme.Codecs.UniversalDeserializer()) +} + var ( //go:embed "basic.scyllacluster.yaml" BasicScyllaCluster 
ScyllaClusterBytes //go:embed "nodeconfig.yaml" NodeConfig NodeConfigBytes + + //go:embed "scylladbmonitoring.yaml.tmpl" + scyllaDBMonitoringTemplateString string + ScyllaDBMonitoringTemplate = ParseObjectTemplateOrDie[*scyllav1alpha1.ScyllaDBMonitoring]("scylladbmonitoring", scyllaDBMonitoringTemplateString) ) type ScyllaClusterBytes []byte diff --git a/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl b/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl new file mode 100644 index 00000000000..9a33d85b70d --- /dev/null +++ b/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl @@ -0,0 +1,35 @@ +apiVersion: scylla.scylladb.com/v1alpha1 +kind: ScyllaDBMonitoring +metadata: + name: "{{ .name }}" +spec: + endpointsSelector: + matchLabels: + app.kubernetes.io/name: scylla + scylla-operator.scylladb.com/scylla-service-type: identity + scylla/cluster: "{{ .scyllaClusterName }}" + components: + prometheus: + exposeOptions: + webInterface: + ingress: + ingressClassName: haproxy + dnsDomains: + - "{{ .name }}-prometheus.{{ .namespace }}.apps.cluster.scylladb.com" + annotations: + haproxy-ingress.github.io/ssl-passthrough: "true" + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 1Gi + grafana: + exposeOptions: + webInterface: + ingress: + ingressClassName: haproxy + dnsDomains: + - "{{ .name }}-grafana.{{ .namespace }}.apps.cluster.scylladb.com" + annotations: + haproxy-ingress.github.io/ssl-passthrough: "true" diff --git a/test/e2e/include.go b/test/e2e/include.go index fd455857bf5..c57bae5ed30 100644 --- a/test/e2e/include.go +++ b/test/e2e/include.go @@ -5,4 +5,5 @@ package e2e import ( _ "github.com/scylladb/scylla-operator/test/e2e/set/nodeconfig" _ "github.com/scylladb/scylla-operator/test/e2e/set/scyllacluster" + _ "github.com/scylladb/scylla-operator/test/e2e/set/scylladbmonitoring" ) diff --git a/test/e2e/set/scyllacluster/scyllacluster_tls.go b/test/e2e/set/scyllacluster/scyllacluster_tls.go index 0087ba40536..f5ec7871491 100644 --- 
a/test/e2e/set/scyllacluster/scyllacluster_tls.go +++ b/test/e2e/set/scyllacluster/scyllacluster_tls.go @@ -23,6 +23,7 @@ import ( "github.com/scylladb/scylla-operator/test/e2e/framework" "github.com/scylladb/scylla-operator/test/e2e/scheme" "github.com/scylladb/scylla-operator/test/e2e/utils" + "github.com/scylladb/scylla-operator/test/e2e/verification" corev1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -128,36 +129,36 @@ var _ = g.Describe("ScyllaCluster", func() { clientCASecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-client-ca", sc.Name), metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - clientCACerts, _, _, _ := verifyAndParseTLSCert(clientCASecret, verifyTLSCertOptions{ - isCA: pointer.Bool(true), - keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign), + clientCACerts, _, _, _ := verification.VerifyAndParseTLSCert(clientCASecret, verification.TLSCertOptions{ + IsCA: pointer.Bool(true), + KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign), }) o.Expect(clientCACerts).To(o.HaveLen(1)) servingCASecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-ca", sc.Name), metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - _, _, _, _ = verifyAndParseTLSCert(servingCASecret, verifyTLSCertOptions{ - isCA: pointer.Bool(true), - keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign), + _, _, _, _ = verification.VerifyAndParseTLSCert(servingCASecret, verification.TLSCertOptions{ + IsCA: pointer.Bool(true), + KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign), }) servingCABundleConfigMap, err := 
f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-ca", sc.Name), metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - servingCACerts, servingCACertBytes := verifyAndParseCABundle(servingCABundleConfigMap) + servingCACerts, servingCACertBytes := verification.VerifyAndParseCABundle(servingCABundleConfigMap) o.Expect(servingCACerts).To(o.HaveLen(1)) servingCertSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-certs", sc.Name), metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - servingCerts, _, _, _ := verifyAndParseTLSCert(servingCertSecret, verifyTLSCertOptions{ - isCA: pointer.Bool(false), - keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature), + servingCerts, _, _, _ := verification.VerifyAndParseTLSCert(servingCertSecret, verification.TLSCertOptions{ + IsCA: pointer.Bool(false), + KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature), }) adminClientSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-user-admin", sc.Name), metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - _, adminClientCertBytes, _, adminClientKeyBytes := verifyAndParseTLSCert(adminClientSecret, verifyTLSCertOptions{ - isCA: pointer.Bool(false), - keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature), + _, adminClientCertBytes, _, adminClientKeyBytes := verification.VerifyAndParseTLSCert(adminClientSecret, verification.TLSCertOptions{ + IsCA: pointer.Bool(false), + KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature), }) adminClientConnectionConfigsSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-cql-connection-configs-admin", sc.Name), metav1.GetOptions{}) diff --git a/test/e2e/set/scyllacluster/verify.go b/test/e2e/set/scyllacluster/verify.go index 
78ab5e9d17f..a6fe8437099 100644 --- a/test/e2e/set/scyllacluster/verify.go +++ b/test/e2e/set/scyllacluster/verify.go @@ -2,14 +2,11 @@ package scyllacluster import ( "context" - "crypto" - "crypto/x509" "sort" "strings" o "github.com/onsi/gomega" scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1" - ocrypto "github.com/scylladb/scylla-operator/pkg/crypto" "github.com/scylladb/scylla-operator/pkg/features" "github.com/scylladb/scylla-operator/pkg/naming" cqlclientv1alpha1 "github.com/scylladb/scylla-operator/pkg/scylla/api/cqlclient/v1alpha1" @@ -286,45 +283,6 @@ func insertAndVerifyCQLData(ctx context.Context, hosts []string) *utils.DataInse return di } -type verifyTLSCertOptions struct { - isCA *bool - keyUsage *x509.KeyUsage -} - -func verifyAndParseTLSCert(secret *corev1.Secret, options verifyTLSCertOptions) ([]*x509.Certificate, []byte, crypto.PrivateKey, []byte) { - o.Expect(secret.Type).To(o.Equal(corev1.SecretType("kubernetes.io/tls"))) - o.Expect(secret.Data).To(o.HaveKey("tls.crt")) - o.Expect(secret.Data).To(o.HaveKey("tls.key")) - - certsBytes := secret.Data["tls.crt"] - keyBytes := secret.Data["tls.key"] - o.Expect(certsBytes).NotTo(o.BeEmpty()) - o.Expect(keyBytes).NotTo(o.BeEmpty()) - - certs, key, err := ocrypto.GetTLSCertificatesFromBytes(certsBytes, keyBytes) - o.Expect(err).NotTo(o.HaveOccurred()) - - o.Expect(certs).NotTo(o.BeEmpty()) - o.Expect(certs[0].IsCA).To(o.Equal(*options.isCA)) - o.Expect(certs[0].KeyUsage).To(o.Equal(*options.keyUsage)) - - o.Expect(key.Validate()).To(o.Succeed()) - - return certs, certsBytes, key, keyBytes -} - -func verifyAndParseCABundle(cm *corev1.ConfigMap) ([]*x509.Certificate, []byte) { - o.Expect(cm.Data).To(o.HaveKey("ca-bundle.crt")) - - bundleBytes := cm.Data["ca-bundle.crt"] - o.Expect(bundleBytes).NotTo(o.BeEmpty()) - - certs, err := ocrypto.DecodeCertificates([]byte(bundleBytes)) - o.Expect(err).NotTo(o.HaveOccurred()) - - return certs, []byte(bundleBytes) -} - type 
verifyCQLConnectionConfigsOptions struct { domains []string datacenters []string diff --git a/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go b/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go new file mode 100644 index 00000000000..48bc61ff071 --- /dev/null +++ b/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go @@ -0,0 +1,264 @@ +// Copyright (C) 2022 ScyllaDB + +package scylladbmonitoring + +import ( + "context" + "crypto/tls" + "crypto/x509" + "fmt" + "net" + "net/http" + "net/url" + "time" + + gapi "github.com/grafana/grafana-api-golang-client" + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + prometheusappclient "github.com/prometheus/client_golang/api" + promeheusappv1api "github.com/prometheus/client_golang/api/prometheus/v1" + opointer "github.com/scylladb/scylla-operator/pkg/pointer" + scyllafixture "github.com/scylladb/scylla-operator/test/e2e/fixture/scylla" + "github.com/scylladb/scylla-operator/test/e2e/framework" + "github.com/scylladb/scylla-operator/test/e2e/utils" + "github.com/scylladb/scylla-operator/test/e2e/verification" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +var _ = g.Describe("ScyllaDBMonitoring", func() { + defer g.GinkgoRecover() + + f := framework.NewFramework("scylladbmonitoring") + + g.It("should setup monitoring stack", func() { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer cancel() + + sc := scyllafixture.BasicScyllaCluster.ReadOrFail() + o.Expect(sc.Spec.Datacenter.Racks).To(o.HaveLen(1)) + sc.Spec.Datacenter.Racks[0].Members = 1 + + framework.By("Creating a ScyllaCluster with a single node") + sc, err := f.ScyllaClient().ScyllaV1().ScyllaClusters(f.Namespace()).Create( + ctx, + sc, + metav1.CreateOptions{ + FieldManager: f.FieldManager(), + FieldValidation: metav1.FieldValidationStrict, + }, + ) + o.Expect(err).NotTo(o.HaveOccurred()) + + framework.By("Creating a ScyllaDBMonitoring") + sm, _, err := 
scyllafixture.ScyllaDBMonitoringTemplate.RenderObject(map[string]string{ + "name": sc.Name, + "namespace": sc.Namespace, + "scyllaClusterName": sc.Name, + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + sm, err = f.ScyllaClient().ScyllaV1alpha1().ScyllaDBMonitorings(sc.Namespace).Create( + ctx, + sm, + metav1.CreateOptions{ + FieldManager: f.FieldManager(), + FieldValidation: metav1.FieldValidationStrict, + }, + ) + o.Expect(err).NotTo(o.HaveOccurred()) + + framework.By("Waiting for the ScyllaCluster to rollout (RV=%s)", sc.ResourceVersion) + waitCtx1, waitCtx1Cancel := utils.ContextForRollout(ctx, sc) + defer waitCtx1Cancel() + sc, err = utils.WaitForScyllaClusterState(waitCtx1, f.ScyllaClient().ScyllaV1(), sc.Namespace, sc.Name, utils.WaitForStateOptions{}, utils.IsScyllaClusterRolledOut) + o.Expect(err).NotTo(o.HaveOccurred()) + + framework.By("Waiting for the ScyllaDBMonitoring to rollout (RV=%s)", sm.ResourceVersion) + waitCtx2, waitCtx2Cancel := context.WithTimeout(ctx, 5*time.Minute) + defer waitCtx2Cancel() + sm, err = utils.WaitForScyllaDBMonitoringState(waitCtx2, f.ScyllaClient().ScyllaV1alpha1().ScyllaDBMonitorings(sc.Namespace), sc.Name, utils.WaitForStateOptions{}, utils.IsScyllaDBMonitoringRolledOut) + o.Expect(err).NotTo(o.HaveOccurred()) + + // We need to retry the prometheus and grafana assertion for several reasons, some of them are: + // - ingress exposure is asynchronous and some controllers don't report back status to wait for + // - prometheus configuration is asynchronous without any acknowledgement + // - grafana configuration is asynchronous without any acknowledgement + // Some of these may be fixable by manually verifying it in the operator sync loop so it can also be + // consumed by clients, but it's a bigger effort. 
+ + framework.By("Verifying that Prometheus is configured correctly") + + prometheusServingCABundleConfigMap, err := f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-prometheus-serving-ca", sm.Name), metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + prometheusServingCACerts, _ := verification.VerifyAndParseCABundle(prometheusServingCABundleConfigMap) + o.Expect(prometheusServingCACerts).To(o.HaveLen(1)) + + prometheusServingCAPool := x509.NewCertPool() + prometheusServingCAPool.AddCert(prometheusServingCACerts[0]) + + prometheusGrafanaClientSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-prometheus-client-grafana", sm.Name), metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + _, prometheusGrafanaClientCertBytes, _, prometheusGrafanaClientKeyBytes := verification.VerifyAndParseTLSCert(prometheusGrafanaClientSecret, verification.TLSCertOptions{ + IsCA: pointer.Bool(false), + KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature), + }) + + prometheusGrafanaAdminTLSCert, err := tls.X509KeyPair(prometheusGrafanaClientCertBytes, prometheusGrafanaClientKeyBytes) + o.Expect(err).NotTo(o.HaveOccurred()) + + o.Expect(sm.Spec.Components.Prometheus.ExposeOptions.WebInterface.Ingress.DNSDomains).To(o.HaveLen(1)) + prometheusServerName := sm.Spec.Components.Prometheus.ExposeOptions.WebInterface.Ingress.DNSDomains[0] + + promHTTPClient, err := prometheusappclient.NewClient(prometheusappclient.Config{ + Address: "https://" + f.GetIngressAddress(prometheusServerName), + Client: &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + ServerName: prometheusServerName, + Certificates: []tls.Certificate{prometheusGrafanaAdminTLSCert}, + RootCAs: prometheusServingCAPool, + }, + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + ForceAttemptHTTP2: true, 
+ MaxIdleConns: 100, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + }, + }, + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + promClient := promeheusappv1api.NewAPI(promHTTPClient) + + o.Eventually(func(eo o.Gomega) { + ctxTargets, ctxTargetsCancel := context.WithTimeout(ctx, 15*time.Second) + defer ctxTargetsCancel() + + targets, err := promClient.Targets(ctxTargets) + framework.Infof("Listing grafana targets: err: %v, active: %d, dropped: %d", err, len(targets.Active), len(targets.Dropped)) + eo.Expect(err).NotTo(o.HaveOccurred()) + + // This should match the number of rules in service monitors used. We can possibly extend this to compare those + // or wait to be able to assess that dropped targets are empty. + eo.Expect(targets.Active).To(o.HaveLen(2)) + for _, t := range targets.Active { + eo.Expect(t.Health).To(o.Equal(promeheusappv1api.HealthGood)) + } + + // TODO: There shouldn't be any dropped targets. Currently, /service-discovery contains + // "undefined (0 / 54 active targets)" that are in addition to our ServiceMonitor definition. + // (Maciek was looking into this, it seems to be a bug in prometheus operator.) 
+ // o.Expect(targets.Dropped).To(o.HaveLen(0)) + + rulesResult, err := promClient.Rules(ctxTargets) + framework.Infof("Listing grafana rules: err: %v, groupCount: %d", err, len(rulesResult.Groups)) + eo.Expect(err).NotTo(o.HaveOccurred()) + + o.Expect(rulesResult.Groups).NotTo(o.HaveLen(0)) + o.Expect(rulesResult.Groups[0].Name).To(o.Equal("scylla.rules")) + o.Expect(rulesResult.Groups[0].Rules).NotTo(o.BeEmpty()) + for _, rule := range rulesResult.Groups[0].Rules { + switch rule.(type) { + case promeheusappv1api.AlertingRule: + o.Expect(rule.(promeheusappv1api.AlertingRule).Health).To(o.BeEquivalentTo(promeheusappv1api.RuleHealthGood)) + + case promeheusappv1api.RecordingRule: + o.Expect(rule.(promeheusappv1api.RecordingRule).Health).To(o.BeEquivalentTo(promeheusappv1api.RuleHealthGood)) + + default: + o.Expect(fmt.Errorf("unexpected rule type %t", rule)).NotTo(o.HaveOccurred()) + } + } + + }).WithTimeout(5 * time.Minute).WithPolling(1 * time.Second).Should(o.Succeed()) + + framework.By("Verifying that Grafana is configured correctly") + + grafanaAdminCredentialsSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-grafana-admin-credentials", sc.Name), metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveLen(2)) + o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveKey("username")) + o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveKey("password")) + + grafanaUsername := string(grafanaAdminCredentialsSecret.Data["username"]) + o.Expect(grafanaUsername).NotTo(o.BeEmpty()) + grafanaPassword := string(grafanaAdminCredentialsSecret.Data["password"]) + o.Expect(grafanaPassword).NotTo(o.BeEmpty()) + + grafanaServingCABundleConfigMap, err := f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-grafana-serving-ca", sc.Name), metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + grafanaServingCACerts, _ := 
verification.VerifyAndParseCABundle(grafanaServingCABundleConfigMap) + o.Expect(grafanaServingCACerts).To(o.HaveLen(1)) + + grafanaServingCAPool := x509.NewCertPool() + grafanaServingCAPool.AddCert(grafanaServingCACerts[0]) + + o.Expect(sm.Spec.Components.Grafana.ExposeOptions.WebInterface.Ingress.DNSDomains).To(o.HaveLen(1)) + grafanaServerName := sm.Spec.Components.Grafana.ExposeOptions.WebInterface.Ingress.DNSDomains[0] + + grafanaClient, err := gapi.New( + "https://"+f.GetIngressAddress(grafanaServerName), + gapi.Config{ + BasicAuth: url.UserPassword(grafanaUsername, grafanaPassword), + Client: &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + ServerName: grafanaServerName, + RootCAs: grafanaServingCAPool, + }, + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + ForceAttemptHTTP2: true, + MaxIdleConns: 100, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + }, + Timeout: 15 * time.Second, + }, + }, + ) + o.Expect(err).NotTo(o.HaveOccurred()) + + expectedDashboards := []gapi.FolderDashboardSearchResponse{ + { + ID: 2, + Title: "CQL Overview", + URI: "db/cql-overview", + Slug: "", + Type: "dash-db", + Tags: []string{}, + IsStarred: false, + FolderID: 1, + FolderTitle: "scylladb", + }, + } + + var dashboards []gapi.FolderDashboardSearchResponse + o.Eventually(func(eo o.Gomega) { + dashboards, err = grafanaClient.Dashboards() + framework.Infof("Listing grafana dashboards: err: %v, count: %d", err, len(dashboards)) + eo.Expect(err).NotTo(o.HaveOccurred()) + eo.Expect(dashboards).To(o.HaveLen(len(expectedDashboards))) + }).WithTimeout(10 * time.Minute).WithPolling(1 * time.Second).Should(o.Succeed()) + + // Clear random fields for comparison. 
+ for i := range dashboards { + d := &dashboards[i] + d.UID = "" + d.URL = "" + d.FolderUID = "" + d.FolderURL = "" + } + o.Expect(dashboards).To(o.Equal(expectedDashboards)) + }) +}) diff --git a/test/e2e/utils/helpers.go b/test/e2e/utils/helpers.go index df5ac4fddc2..8c4dbec4fb5 100644 --- a/test/e2e/utils/helpers.go +++ b/test/e2e/utils/helpers.go @@ -130,6 +130,24 @@ func IsScyllaClusterRolledOut(sc *scyllav1.ScyllaCluster) (bool, error) { return true, nil } +func IsScyllaDBMonitoringRolledOut(sm *scyllav1alpha1.ScyllaDBMonitoring) (bool, error) { + if !helpers.IsStatusConditionPresentAndTrue(sm.Status.Conditions, scyllav1alpha1.AvailableCondition, sm.Generation) { + return false, nil + } + + if !helpers.IsStatusConditionPresentAndFalse(sm.Status.Conditions, scyllav1alpha1.ProgressingCondition, sm.Generation) { + return false, nil + } + + if !helpers.IsStatusConditionPresentAndFalse(sm.Status.Conditions, scyllav1alpha1.DegradedCondition, sm.Generation) { + return false, nil + } + + framework.Infof("ScyllaDBMonitoring %s (RV=%s) is rolled out", klog.KObj(sm), sm.ResourceVersion) + + return true, nil +} + type listerWatcher[ListObject runtime.Object] interface { List(context.Context, metav1.ListOptions) (ListObject, error) Watch(context.Context, metav1.ListOptions) (watch.Interface, error) @@ -200,6 +218,10 @@ func WaitForScyllaClusterState(ctx context.Context, client scyllav1client.Scylla return WaitForObjectState[*scyllav1.ScyllaCluster, *scyllav1.ScyllaClusterList](ctx, client.ScyllaClusters(namespace), name, options, condition, additionalConditions...) 
}

+func WaitForScyllaDBMonitoringState(ctx context.Context, client scyllav1alpha1client.ScyllaDBMonitoringInterface, name string, options WaitForStateOptions, condition func(monitoring *scyllav1alpha1.ScyllaDBMonitoring) (bool, error), additionalConditions ...func(monitoring *scyllav1alpha1.ScyllaDBMonitoring) (bool, error)) (*scyllav1alpha1.ScyllaDBMonitoring, error) {
+	return WaitForObjectState[*scyllav1alpha1.ScyllaDBMonitoring, *scyllav1alpha1.ScyllaDBMonitoringList](ctx, client, name, options, condition, additionalConditions...)
+}
+
 func WaitForPodState(ctx context.Context, client corev1client.PodInterface, name string, options WaitForStateOptions, condition func(*corev1.Pod) (bool, error), additionalConditions ...func(*corev1.Pod) (bool, error)) (*corev1.Pod, error) {
 	return WaitForObjectState[*corev1.Pod, *corev1.PodList](ctx, client, name, options, condition, additionalConditions...)
 }
diff --git a/test/e2e/verification/certs.go b/test/e2e/verification/certs.go
new file mode 100644
index 00000000000..5df9ad35c03
--- /dev/null
+++ b/test/e2e/verification/certs.go
@@ -0,0 +1,63 @@
+package verification
+
+import (
+	"crypto"
+	"crypto/x509"
+
+	o "github.com/onsi/gomega"
+	ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
+	corev1 "k8s.io/api/core/v1"
+)
+
+// TLSCertOptions lists the properties VerifyAndParseTLSCert asserts on the first
+// certificate of the chain. A nil field disables the corresponding assertion.
+type TLSCertOptions struct {
+	IsCA     *bool
+	KeyUsage *x509.KeyUsage
+}
+
+// VerifyAndParseTLSCert asserts that secret is a "kubernetes.io/tls" secret with non-empty
+// "tls.crt" and "tls.key" entries, parses them, checks the first certificate against
+// options and validates the key. It returns the parsed certificates, the raw
+// certificate bytes, the parsed key and the raw key bytes.
+func VerifyAndParseTLSCert(secret *corev1.Secret, options TLSCertOptions) ([]*x509.Certificate, []byte, crypto.PrivateKey, []byte) {
+	o.Expect(secret.Type).To(o.Equal(corev1.SecretType("kubernetes.io/tls")))
+	o.Expect(secret.Data).To(o.HaveKey("tls.crt"))
+	o.Expect(secret.Data).To(o.HaveKey("tls.key"))
+
+	certsBytes := secret.Data["tls.crt"]
+	keyBytes := secret.Data["tls.key"]
+	o.Expect(certsBytes).NotTo(o.BeEmpty())
+	o.Expect(keyBytes).NotTo(o.BeEmpty())
+
+	certs, key, err := ocrypto.GetTLSCertificatesFromBytes(certsBytes, keyBytes)
+	o.Expect(err).NotTo(o.HaveOccurred())
+
+	o.Expect(certs).NotTo(o.BeEmpty())
+	// The options are pointers so that a caller can leave a property unchecked; dereferencing
+	// a nil option would panic instead of producing a regular assertion result.
+	if options.IsCA != nil {
+		o.Expect(certs[0].IsCA).To(o.Equal(*options.IsCA))
+	}
+	if options.KeyUsage != nil {
+		o.Expect(certs[0].KeyUsage).To(o.Equal(*options.KeyUsage))
+	}
+
+	o.Expect(key.Validate()).To(o.Succeed())
+
+	return certs, certsBytes, key, keyBytes
+}
+
+// VerifyAndParseCABundle asserts that cm carries a non-empty "ca-bundle.crt" entry and
+// decodes it. It returns the decoded certificates and the raw bundle bytes.
+func VerifyAndParseCABundle(cm *corev1.ConfigMap) ([]*x509.Certificate, []byte) {
+	o.Expect(cm.Data).To(o.HaveKey("ca-bundle.crt"))
+
+	bundleBytes := cm.Data["ca-bundle.crt"]
+	o.Expect(bundleBytes).NotTo(o.BeEmpty())
+
+	certs, err := ocrypto.DecodeCertificates([]byte(bundleBytes))
+	o.Expect(err).NotTo(o.HaveOccurred())
+
+	return certs, []byte(bundleBytes)
+}