diff --git a/.github/actions/run-e2e/action.yaml b/.github/actions/run-e2e/action.yaml
index ce60c6b84f8..f7ddc26aee4 100644
--- a/.github/actions/run-e2e/action.yaml
+++ b/.github/actions/run-e2e/action.yaml
@@ -35,7 +35,9 @@ runs:
env:
ARTIFACTS_DIR: ${{ runner.temp }}/e2e-artifacts
run: |
- set -x
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
mkdir "${ARTIFACTS_DIR}"
echo "ARTIFACTS_DIR=${ARTIFACTS_DIR}" | tee -a ${GITHUB_ENV}
- uses: actions/download-artifact@v3
@@ -45,7 +47,9 @@ runs:
- name: Load image
shell: bash
run: |
- set -x
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
unlz4 ~/operatorimage.tar.lz4 - | docker load
# docker looses the registry part on save/load
docker tag "$( echo "${image_repo_ref}:ci" | sed -E -e 's~[^/]+/(.*)~\1~' )" "${image_repo_ref}:ci"
@@ -57,7 +61,9 @@ runs:
- name: Install tools
shell: bash
run: |
- set -x
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
go install github.com/mikefarah/yq/v4@v4.6.1
- name: Setup minikube
uses: ./go/src/github.com/scylladb/scylla-operator/.github/actions/setup-minikube
@@ -67,7 +73,8 @@ runs:
env:
SCYLLA_OPERATOR_FEATURE_GATES: '${{ inputs.featureGates }}'
run: |
- set -x
+ set -euExo pipefail
+ shopt -s inherit_errexit
timeout 10m ./hack/ci-deploy.sh '${{ env.image_repo_ref }}:ci'
@@ -83,11 +90,15 @@ runs:
shell: bash
if: ${{ github.event_name != 'pull_request' }}
run: |
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
echo "FLAKE_ATTEMPTS=5" | tee -a ${GITHUB_ENV}
- name: Run e2e
shell: bash
run: |
set -euExo pipefail
+ shopt -s inherit_errexit
e2e_timeout_minutes='${{ inputs.baseTimeoutMinutes }}'
flake_attempts=0
@@ -96,7 +107,10 @@ runs:
e2e_timeout_minutes="$(( ${e2e_timeout_minutes} + ${flake_attempts} * 10 ))"
fi
- docker run --user="$( id -u ):$( id -g )" --rm \
+ user="$( id -u )"
+ group="$( id -g )"
+ ingress_address="$( kubectl -n haproxy-ingress get svc haproxy-ingress --template='{{ .spec.clusterIP }}' )"
+ docker run --user="${user}:${group}" --rm \
--entrypoint=/usr/bin/scylla-operator-tests \
-v="${ARTIFACTS_DIR}:${ARTIFACTS_DIR}:rw" \
-v="${HOME}/.kube/config:/kubeconfig:ro" -e='KUBECONFIG=/kubeconfig' \
@@ -106,13 +120,18 @@ runs:
--artifacts-dir="${ARTIFACTS_DIR}" \
--flake-attempts="${flake_attempts}" \
--timeout="${e2e_timeout_minutes}m" \
- --feature-gates='${{ inputs.featureGates }}'
+ --feature-gates='${{ inputs.featureGates }}' \
+ --override-ingress-address="${ingress_address}" \
${{ inputs.extraArgs }}
- name: Dump cluster state
if: ${{ always() }}
working-directory: ${{ runner.temp }}
shell: bash
- run: timeout 10m ${{ inputs.repositoryPath }}/hack/ci-gather-artifacts.sh
+ run: |
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
+ timeout 10m ${{ inputs.repositoryPath }}/hack/ci-gather-artifacts.sh
- name: Get machine logs and info
if: ${{ always() }}
working-directory: ${{ runner.temp }}/e2e-artifacts
@@ -142,7 +161,9 @@ runs:
working-directory: ${{ runner.temp }}/e2e-artifacts
shell: bash
run: |
- set -euEx -o pipefail
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
sudo cat $( ls /var/log/kube-apiserver-audit*.log | sort -n ) > ./kube-apiserver-audit.log
jq -s 'group_by(.user.username) | map({"user": .[0].user.username, "total": length, "verbs": (group_by(.verb) | map({"key":.[0].verb, "value": length}) | from_entries)}) | sort_by(.total) | reverse' ./kube-apiserver-audit.log > ./api-call-stats.json
- name: Compress artifacts
@@ -150,7 +171,9 @@ runs:
working-directory: ${{ runner.temp }}
shell: bash
run: |
- set -x
+ set -euExo pipefail
+ shopt -s inherit_errexit
+
tar -c --use-compress-program=lz4 -f ./e2e-artifacts.tar.lz4 "e2e-artifacts/"
- name: Upload artifacts
if: ${{ always() }}
diff --git a/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml b/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml
new file mode 100644
index 00000000000..133951f521d
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/admin-credentials.secret.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Secret
+metadata:
+ name: "{{ .name }}"
+data:
+ username: {{ "admin" | toBytes | toBase64 }}
+ password: {{ .password | toBase64 }}
diff --git a/assets/monitoring/grafana/v1alpha1/configs.cm.yaml b/assets/monitoring/grafana/v1alpha1/configs.cm.yaml
new file mode 100644
index 00000000000..522020b8742
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/configs.cm.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-grafana-configs"
+data:
+ grafana.ini: |
+ [auth]
+ disable_login_form = false
+ disable_signout_menu = false
+
+ {{ if .enableAnonymousAccess -}}
+ [auth.anonymous]
+ enabled = true
+ {{- end }}
+
+ [dashboards]
+ default_home_dashboard_path = /var/run/dashboards/scylladb/overview.json
+
+ [log]
+ level = error
+ mode = console
+
+ [log.frontend]
+ enabled = true
+
+ [paths]
+ data = /var/lib/grafana
+ logs = /var/log/grafana
+ plugins = /var/lib/grafana/plugins
+ provisioning = /var/run/configmaps/grafana-provisioning
+
+ [security]
+ admin_user = $__file{/var/run/secrets/grafana-admin-credentials/username}
+ admin_password = $__file{/var/run/secrets/grafana-admin-credentials/password}
+
+ [server]
+ protocol = https
+ cert_file = /var/run/secrets/grafana-serving-certs/tls.crt
+ cert_key = /var/run/secrets/grafana-serving-certs/tls.key
+
+ [panels]
+ disable_sanitize_html = true
diff --git a/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml b/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml
new file mode 100644
index 00000000000..015759f59f0
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/dashboards.cm.yaml
@@ -0,0 +1,4686 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: '{{ .scyllaDBMonitoringName }}-grafana-scylladb-dashboards'
+data:
+ overview.json: |-
+ {{`{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ },
+ {
+ "class": "annotation_manager_task",
+ "datasource": "prometheus",
+ "enable": true,
+ "expr": "scylla_manager_task_active_count{type=~\"repair|backup\",cluster=~\"$cluster|$^\"}>0",
+ "hide": false,
+ "iconColor": "#73BF69",
+ "limit": 100,
+ "name": "Task",
+ "showIn": 0,
+ "tagKeys": "type",
+ "tags": [],
+ "titleFormat": "Running",
+ "type": "tags"
+ },
+ {
+ "class": "mv_building",
+ "datasource": "prometheus",
+ "enable": true,
+ "expr": "sum(scylla_view_builder_builds_in_progress)>0",
+ "hide": false,
+ "iconColor": "rgb(50, 176, 0, 128)",
+ "limit": 100,
+ "name": "MV",
+ "showIn": 0,
+ "tagKeys": "instance,dc,cluster",
+ "tags": [],
+ "titleFormat": "Materialized View built",
+ "type": "tags"
+ },
+ {
+ "class": "ops_annotation",
+ "datasource": "prometheus",
+ "enable": true,
+ "expr": "10*min(scylla_node_ops_finished_percentage) by (ops, dc,instance) < 10",
+ "hide": false,
+ "iconColor": "rgb(50, 176, 0, 128)",
+ "limit": 100,
+ "name": "ops",
+ "showIn": 0,
+ "tagKeys": "ops,dc,instance",
+ "tags": [],
+ "titleFormat": "Operation",
+ "type": "tags"
+ },
+ {
+ "class": "annotation_schema_changed",
+ "datasource": "prometheus",
+ "enable": false,
+ "expr": "changes(scylla_database_schema_changed[$__rate_interval])>0",
+ "hide": false,
+ "iconColor": "rgba(255, 96, 96, 1)",
+ "limit": 100,
+ "name": "Schema Changed",
+ "showIn": 0,
+ "tagKeys": "instance,dc,cluster",
+ "tags": [],
+ "titleFormat": "schema changed",
+ "type": "tags"
+ }
+ ]
+ },
+ "class": "dashboard",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "hideControls": true,
+ "id": null,
+ "links": [
+ {
+ "asDropdown": true,
+ "icon": "external link",
+ "includeVars": true,
+ "keepTime": true,
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "originalTitle": "Scylla Cluster Metrics",
+ "overwrite": true,
+ "panels": [
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "panels": [],
+ "title": "Cluster overview $cluster",
+ "type": "row"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 1,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "si:"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 0,
+ "y": 1
+ },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_transport_requests_served{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[60s])) + (sum(rate(scylla_thrift_served{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[60s])) or on() vector(0))",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Requests/s",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "Average Write Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 50000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 3,
+ "y": 1
+ },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(wlatencya{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg Write",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "95% write Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 100000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 5,
+ "y": 1
+ },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(wlatencyp95{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "95% Write",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "99% write Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 100000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 7,
+ "y": 1
+ },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(wlatencyp99{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "99% Write",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "Average Read Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 50000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 9,
+ "y": 1
+ },
+ "id": 6,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(rlatencya{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg Read",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "95% read Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 100000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 11,
+ "y": 1
+ },
+ "id": 7,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(rlatencyp95{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "95% Read",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "99% read Latency",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 100000
+ }
+ ]
+ },
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 13,
+ "y": 1
+ },
+ "id": 8,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(rlatencyp99{by=\"cluster\", cluster=~\"$cluster|^$\",scheduling_group_name!=\"streaming\"}>0)",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "99% Read",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "description": "The percentage of the time during which Scylla utilized the CPU. Note that because Scylla does busy polling for some time before going idle, CPU utilization as seen by the operating system may be much higher. Your system is not yet CPU-bottlenecked until this metric is high.",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 15,
+ "y": 1
+ },
+ "id": 9,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"} )",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 17,
+ "y": 1
+ },
+ "id": 10,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_database_total_reads_failed{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "R Failed",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 19,
+ "y": 1
+ },
+ "id": 11,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
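+ "title": "Indicates that the number of CQL operations (inserts, updates, deletes, reads) is not balanced between shards in one of the nodes"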
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_database_total_writes_failed{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "W Failed",
+ "type": "stat"
+ },
+ {
+ "class": "small_stat",
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 21,
+ "y": 1
+ },
+ "id": 12,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_storage_proxy_coordinator_write_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[1m]))",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Timeouts",
+ "type": "stat"
+ },
+ {
+ "class": "alert_table",
+ "columns": [],
+ "datasource": "alertmanager",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": null,
+ "filterable": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute"
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Time"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 150
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "instance"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 100
+ }
+ ]
+ }
+ ]
+ },
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 5
+ },
+ "id": 13,
+ "links": [],
+ "options": {
+ "showHeader": true
+ },
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 0,
+ "desc": true
+ },
+ "span": 4,
+ "targets": [
+ {
+ "active": true,
+ "annotations": true,
+ "filters": "job!=\"scylla_manager\",advisor=\"\"",
+ "legendFormat": "{{description}}",
+ "refId": "A",
+ "target": "Query"
+ }
+ ],
+ "title": "Active Alerts",
+ "transform": "table",
+ "transformations": [
+ {
+ "id": "filterFieldsByName",
+ "options": {
+ "include": {
+ "names": [
+ "Time",
+ "summary",
+ "instance"
+ ]
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {},
+ "indexByName": {
+ "Time": 0,
+ "instance": 1,
+ "summary": 2
+ },
+ "renameByName": {}
+ }
+ }
+ ],
+ "type": "table"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Write attempts - include all writes that reached the coordinator node, even if they will eventually fail",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 8,
+ "y": 5
+ },
+ "id": 14,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "Writes",
+ "type": "timeseries"
+ },
+ {
+ "class": "us_panel",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 12,
+ "y": 5
+ },
+ "id": 15,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "avg(wlatencyp95{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)",
+ "intervalFactor": 1,
+ "legendFormat": "95%",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "avg(wlatencyp99{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)",
+ "intervalFactor": 1,
+ "legendFormat": "99%",
+ "refId": "B",
+ "step": 1
+ }
+ ],
+ "title": "Write Latencies",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Read attempts - include all reads that reached the coordinator node, even if they will eventually fail",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 16,
+ "y": 5
+ },
+ "id": 16,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "Reads",
+ "type": "timeseries"
+ },
+ {
+ "class": "us_panel",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 20,
+ "y": 5
+ },
+ "id": 17,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "avg(rlatencyp95{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)",
+ "intervalFactor": 1,
+ "legendFormat": "95%",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "avg(rlatencyp99{by=\"cluster\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)",
+ "intervalFactor": 1,
+ "legendFormat": "99%",
+ "refId": "B",
+ "step": 1
+ }
+ ],
+ "title": "Read Latencies",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 11
+ },
+ "id": 18,
+ "panels": [],
+ "title": "",
+ "type": "row"
+ },
+ {
+ "class": "plain_text",
+ "content": "Advisor",
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 24,
+ "x": 0,
+ "y": 12
+ },
+ "id": 19,
+ "isNew": true,
+ "links": [],
+ "mode": "html",
+ "options": {},
+ "span": 12,
+ "style": {},
+ "title": "",
+ "transparent": true,
+ "type": "text"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 12,
+ "x": 12,
+ "y": 14
+ },
+ "id": 20,
+ "options": {
+ "content": "Balance\nAn imbalance between shards or nodes may indicate a potential problem",
+ "mode": "html"
+ },
+ "pluginVersion": "7.3.4",
+ "targets": [
+ {
+ "queryType": "randomWalk",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "",
+ "type": "text"
+ },
+ {
+ "class": "advisor_table",
+ "dashversion": ">4.1",
+ "datasource": "alertmanager",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "align": null,
+ "filterable": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "dashboard"
+ },
+ "properties": [
+ {
+ "id": "links",
+ "value": [
+ {
+ "title": "",
+ "url": "/d/${__data.fields.dashboard}-[[dash_version]]?refresh=30s&orgId=1&var-by=instance&from=${__from}&to=${__to}"
+ }
+ ]
+ },
+ {
+ "id": "custom.width",
+ "value": 100
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "advisor"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 120
+ },
+ {
+ "id": "displayName",
+ "value": "Category"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "severity"
+ },
+ "properties": [
+ {
+ "id": "links",
+ "value": [
+ {
+ "targetBlank": true,
+ "title": "Open an issue",
+ "url": "https://github.com/scylladb/scylla/issues/new?body=description%3D${__data.fields[4]}%0ASource%3DAdvisor%0AScylla-versions%3D${all_scyllas_versions}%0Ascylla-monitoring%3D${monitoring_version}%0Acluster%3D${count_dc}%0Aname%3D${cluster}%0A%0A"
+ }
+ ]
+ },
+ {
+ "id": "mappings",
+ "value": [
+ {
+ "from": "0",
+ "id": 1,
+ "text": "\ud83d\udd14",
+ "to": "10",
+ "type": 2,
+ "value": ""
+ }
+ ]
+ },
+ {
+ "id": "displayName",
+ "value": "Report"
+ },
+ {
+ "id": "custom.width",
+ "value": 65
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "summary"
+ },
+ "properties": [
+ {
+ "id": "links",
+ "value": [
+ {
+ "targetBlank": true,
+ "title": "${__data.fields.description}\n\n click for more information",
+ "url": "https://monitoring.docs.scylladb.com/branch-master/use-monitoring/advisor/${__data.fields.alertname}"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "alertname"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "."
+ },
+ {
+ "id": "custom.width",
+ "value": 1
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "description"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "."
+ },
+ {
+ "id": "custom.width",
+ "value": 1
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Time"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 150
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "id": 21,
+ "links": [],
+ "options": {
+ "showHeader": true
+ },
+ "pluginVersion": "7.3.4",
+ "targets": [
+ {
+ "active": true,
+ "annotations": true,
+ "filters": "advisor!=\"\"",
+ "legendFormat": "{{description}}",
+ "refId": "A",
+ "target": "Query"
+ }
+ ],
+ "title": "",
+ "transformations": [
+ {
+ "id": "filterFieldsByName",
+ "options": {
+ "include": {
+ "names": [
+ "advisor",
+ "dashboard",
+ "description",
+ "severity",
+ "alertname",
+ "summary",
+ "Time"
+ ]
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {},
+ "indexByName": {
+ "Time": 1,
+ "advisor": 2,
+ "dashboard": 3,
+ "severity": 0,
+ "summary": 4
+ },
+ "renameByName": {}
+ }
+ }
+ ],
+ "type": "table"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 12,
+ "y": 16
+ },
+ "id": 22,
+ "links": [
+ {
+ "title": "The number of connections per shard should be balanced"
+ }
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "max(abs(sum(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by (instance,shard)-scalar(avg(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"})))) - 8",
+ "hide": false,
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Connections",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 14,
+ "y": 16
+ },
+ "id": 23,
+ "links": [
+ {
+ "title": "Indicates that the number of CQL operations (inserts, updates, deletes, reads) is not balanced between shards in one of the nodes"
+ }
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "max(abs(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_updates{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "max(abs(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_inserts{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "B"
+ },
+ {
+ "expr": "max(abs(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_reads{ dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "C"
+ },
+ {
+ "expr": "max(abs(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m]) - on(dc) group_left avg(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m])) by (dc))/on(dc) group_left sum(stddev(rate(scylla_cql_deletes{conditional=\"no\", dc=~\"$dc\"}[1m])) by(dc)+100) by(dc))-3",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "D"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CQL OPs",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 16,
+ "y": 16
+ },
+ "id": 24,
+ "links": [
+ {
+ "title": "A single node with higher latency is an indication of a node-related issue"
+ }
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "((max(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(wlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)",
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "((max(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(rlatencyp99{by=\"instance\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)",
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Node Latency",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 18,
+ "y": 16
+ },
+ "id": 25,
+ "links": [
+ {
+ "title": "A single shard with high latency is an indication of a hot-partition, or a large row/cell/partition"
+ }
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "((max(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(wlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)",
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "((max(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"})-scalar(avg(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0)))/(scalar(stddev(rlatencyp99{by=\"instance,shard\", cluster=~\"$cluster|$^\",scheduling_group_name!=\"streaming\"}>0))+100)-3)",
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Shard Latency",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "A shard that reads more from the cache could be an indication of a hot partition",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 20,
+ "y": 16
+ },
+ "id": 26,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "((rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))- scalar(avg(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))))/scalar(stddev(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]) - rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m]))+100)-3",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cache",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "class": "small_stat_error",
+ "datasource": "prometheus",
+ "description": "A single shard that reads more from SSTables could indicate that a node is slow",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [
+ {
+ "from": "-1000",
+ "id": 1,
+ "text": "\u2713",
+ "to": "0.001",
+ "type": 2,
+ "value": ""
+ },
+ {
+ "from": "0.001",
+ "id": 2,
+ "text": "\u26a0",
+ "to": "10000",
+ "type": 2,
+ "value": "0.001"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 0.001
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 22,
+ "y": 16
+ },
+ "id": 27,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "max(abs(scylla_database_active_reads{ dc=~\"$dc\"} - scalar(avg(scylla_database_active_reads{ dc=~\"$dc\"})))/scalar(stddev(scylla_database_active_reads{ dc=~\"$dc\"})+0.001))-3",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "SSTable",
+ "transformations": [
+ {
+ "id": "calculateField",
+ "options": {
+ "mode": "reduceRow",
+ "reduce": {
+ "reducer": "max"
+ },
+ "replaceFields": true
+ }
+ }
+ ],
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 22
+ },
+ "id": 28,
+ "panels": [],
+ "repeat": "dc",
+ "title": "",
+ "type": "row"
+ },
+ {
+ "class": "plain_text",
+ "content": "Information for $dc\n",
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 24,
+ "x": 0,
+ "y": 23
+ },
+ "id": 29,
+ "isNew": true,
+ "links": [],
+ "mode": "html",
+ "options": {},
+ "span": 12,
+ "style": {},
+ "title": "",
+ "transparent": true,
+ "type": "text"
+ },
+ {
+ "class": "vertical_lcd",
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "orange",
+ "value": 85
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 1,
+ "x": 0,
+ "y": 25
+ },
+ "id": 30,
+ "options": {
+ "displayMode": "lcd",
+ "orientation": "vertical",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showUnfilled": true
+ },
+ "pluginVersion": "7.1.3",
+ "targets": [
+ {
+ "expr": "avg(scylla_reactor_utilization{cluster=~\"$cluster\", dc=~\"$dc\"} )",
+ "instant": true,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load",
+ "type": "bargauge"
+ },
+ {
+ "class": "bytes_panel",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 3,
+ "x": 1,
+ "y": 25
+ },
+ "id": 31,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "span": 5,
+ "targets": [
+ {
+ "expr": "Avg(node_filesystem_size_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])-avg(node_filesystem_avail_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Usage {{[[by]]}}",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "avg(node_filesystem_size_bytes{mountpoint=\"$mount_point\", dc=~\"$dc\"}) by ([[by]])",
+ "interval": "",
+ "legendFormat": "Size {{[[by]]}}",
+ "refId": "B"
+ }
+ ],
+ "title": "Disk Size by $by",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel_int",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 25
+ },
+ "id": 32,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "span": 2,
+ "targets": [
+ {
+ "expr": "$func(scylla_compaction_manager_compactions{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "Running Compactions",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "The Hits and Misses",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 8,
+ "y": 25
+ },
+ "id": 33,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_cache_row_hits{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Hit {{[[by]]}}",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "$func(rate(scylla_cache_row_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Misses {{[[by]]}}",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "title": "Cache Rows Hits/Misses",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Write attempts - include all writes that reached the coordinator node, even if they will eventually fail",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 14,
+ "y": 25
+ },
+ "id": 34,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {
+ "alias": "1 Day Ago",
+ "dashLength": 4,
+ "dashes": true
+ },
+ {
+ "alias": "1 Week Ago",
+ "dashLength": 2,
+ "dashes": true
+ }
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m] offset 1d))",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "1 Day Ago",
+ "refId": "B",
+ "step": 1
+ },
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_write_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m] offset 1w))",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "1 Week Ago",
+ "refId": "C",
+ "step": 1
+ }
+ ],
+ "title": "Writes",
+ "type": "timeseries"
+ },
+ {
+ "class": "us_panel",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 20,
+ "y": 25
+ },
+ "id": 35,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "avg(wlatencyp95{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "95% {{[[by]]}}",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "avg(wlatencyp99{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "99% {{[[by]]}}",
+ "refId": "B",
+ "step": 1
+ }
+ ],
+ "title": "Write Latencies",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Requests that Scylla tried to write but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas.",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 0,
+ "y": 31
+ },
+ "id": 36,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_write_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Writes {{[[by]]}}",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "title": "Write Timeouts by [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Requests that Scylla tried to read but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas.",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 31
+ },
+ "id": 37,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_read_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])+rate(scylla_storage_proxy_coordinator_cas_read_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])+rate(scylla_storage_proxy_coordinator_range_timeouts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Read {{[[by]]}}",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "title": "Read Timeouts by [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "The Hits and Misses",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 8,
+ "y": 31
+ },
+ "id": 38,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_cache_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])-$func(rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Hit {{[[by]]}}",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "$func(rate(scylla_cache_reads_with_misses{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Misses {{[[by]]}}",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "title": "Cache Reads Hits/Misses",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Read attempts - include all reads that reached the coordinator node, even if they will eventually fail",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 14,
+ "y": 31
+ },
+ "id": 39,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {
+ "alias": "1 Day Ago",
+ "dashLength": 4,
+ "dashes": true
+ },
+ {
+ "alias": "1 Week Ago",
+ "dashLength": 2,
+ "dashes": true
+ }
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m] offset 1d))",
+ "intervalFactor": 1,
+ "legendFormat": "1 Day Ago",
+ "refId": "B",
+ "step": 1
+ },
+ {
+ "expr": "$func(rate(scylla_storage_proxy_coordinator_read_latency_count{cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}[1m] offset 1w))",
+ "intervalFactor": 1,
+ "legendFormat": "1 Week Ago",
+ "refId": "C",
+ "step": 1
+ }
+ ],
+ "title": "Reads",
+ "type": "timeseries"
+ },
+ {
+ "class": "us_panel",
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "\u00b5s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 20,
+ "y": 31
+ },
+ "id": 40,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "class": "show_legend",
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 2,
+ "targets": [
+ {
+ "expr": "avg(rlatencyp95{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "95% {{[[by]]}}",
+ "refId": "A",
+ "step": 1
+ },
+ {
+ "expr": "avg(rlatencyp99{by=\"[[by]]\", cluster=~\"$cluster|$^\", dc=~\"$dc\",scheduling_group_name!=\"streaming\"}>0) by([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "99% {{[[by]]}}",
+ "refId": "B",
+ "step": 1
+ }
+ ],
+ "title": "Read Latencies",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL INSERT commands generated by the user",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 0,
+ "y": 37
+ },
+ "id": 41,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_inserts{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]]) - sum(rate(scylla_cql_inserts_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "CQL Insert",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL SELECT commands generated by the user",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 6,
+ "y": 37
+ },
+ "id": 42,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]]) - sum(rate(scylla_cql_reads_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "CQL Reads",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL DELETE commands generated by the user",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 12,
+ "y": 37
+ },
+ "id": 43,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_deletes{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])-sum(rate(scylla_cql_deletes_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "CQL Deletes",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL UPDATE commands generated by the user",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 18,
+ "y": 37
+ },
+ "id": 44,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_updates{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])-sum(rate(scylla_cql_updates_per_ks{cluster=~\"$cluster|$^\", dc=~\"$dc\", who=\"internal\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 1
+ }
+ ],
+ "title": "CQL Updates",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "datasource": "prometheus",
+ "description": "amount of CQL connections currently established",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 0,
+ "y": 43
+ },
+ "id": 45,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(scylla_transport_current_connections{cluster=~\"$cluster|$^\", dc=~\"$dc\"}) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "Client CQL connections by [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL batches command, each batched command is counted once",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 6,
+ "y": 43
+ },
+ "id": 46,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_batches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "CQL Batches by [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL command batched. Each batch would add the number of commands inside the batch",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 12,
+ "y": 43
+ },
+ "id": 47,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_statements_in_batches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "CQL Command In Batches by [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "ops_panel",
+ "datasource": "prometheus",
+ "description": "Counts the number of SELECT statements with BYPASS CACHE option",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 18,
+ "y": 43
+ },
+ "id": 48,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_select_bypass_caches{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "refId": "A"
+ }
+ ],
+ "title": "BYPASS CACHE",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "dashversion": [
+ ">4.4",
+ ">2021.1"
+ ],
+ "datasource": "prometheus",
+ "description": "CQL errors by type, only active errors are shown",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 0,
+ "y": 49
+ },
+ "id": 49,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_transport_cql_errors_total{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]],type) >0",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "CQL Errors [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "datasource": "prometheus",
+ "description": "Number of CQL row reads",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 6,
+ "y": 49
+ },
+ "id": 50,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_rows_read{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "CQL Row Reads [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "graph_panel",
+ "datasource": "prometheus",
+ "description": "Number of reads using secondary indexes",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 12,
+ "y": 49
+ },
+ "id": 51,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "pointradius": 1,
+ "span": 3,
+ "targets": [
+ {
+ "expr": "sum(rate(scylla_cql_secondary_index_reads{cluster=~\"$cluster|$^\", dc=~\"$dc\"}[1m])) by ([[by]])",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "metric": "",
+ "refId": "A",
+ "step": 30
+ }
+ ],
+ "title": "Secondary indexes Reads [[by]]",
+ "type": "timeseries"
+ },
+ {
+ "class": "collapsible_row_panel",
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 55
+ },
+ "id": 52,
+ "panels": [],
+ "repeat": "",
+ "title": "Your panels",
+ "type": "row"
+ },
+ {
+ "class": "plain_text",
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 24,
+ "x": 0,
+ "y": 56
+ },
+ "id": 53,
+ "isNew": true,
+ "links": [],
+ "mode": "html",
+ "options": {
+ "content": "Your Panels",
+ "mode": "html"
+ },
+ "span": 12,
+ "style": {},
+ "title": "",
+ "transparent": true,
+ "type": "text"
+ },
+ {
+ "class": "user_panel",
+ "datasource": "prometheus",
+ "description": "This graph panel was left empty on purpose for ad-hoc usage. Change it when needed. Pay attention that changes to the panel will not be saved.\n\nIf you do need a panel that can be saved, create a new dashboard, or edit the panel inside the json file",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 58
+ },
+ "id": 54,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 6,
+ "title": "Your Graph here",
+ "type": "timeseries"
+ },
+ {
+ "class": "user_panel",
+ "datasource": "prometheus",
+ "description": "This graph panel was left empty on purpose for ad-hoc usage. Change it when needed. Pay attention that changes to the panel will not be saved.\n\nIf you do need a panel that can be saved, create a new dashboard, or edit the panel inside the json file",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "class": "fieldConfig_defaults",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMin": 0,
+ "barAlignment": 0,
+ "class": "fieldConfig_defaults_custom",
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "unit": "si:ops/s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 12,
+ "y": 58
+ },
+ "id": 55,
+ "isNew": true,
+ "links": [],
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "asc"
+ }
+ },
+ "seriesOverrides": [
+ {}
+ ],
+ "span": 6,
+ "title": "Your Graph here",
+ "type": "timeseries"
+ },
+ {
+ "class": "plain_text",
+ "datasource": null,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 24,
+ "x": 0,
+ "y": 64
+ },
+ "id": 56,
+ "isNew": true,
+ "links": [],
+ "mode": "html",
+ "options": {
+ "content": "Scylla Monitoring version - master",
+ "mode": "html"
+ },
+ "span": 12,
+ "style": {},
+ "title": "",
+ "transparent": true,
+ "type": "text"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "class": "by_template_var",
+ "current": {
+ "tags": [],
+ "text": "DC",
+ "value": "dc"
+ },
+ "error": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "by",
+ "multi": false,
+ "name": "by",
+ "options": [
+ {
+ "selected": false,
+ "text": "Cluster",
+ "value": "cluster"
+ },
+ {
+ "selected": true,
+ "text": "DC",
+ "value": "dc"
+ }
+ ],
+ "query": "Cluster,DC,Instance,Shard",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "allValue": null,
+ "class": "template_variable_single",
+ "current": {
+ "isNone": true,
+ "selected": false,
+ "text": "None",
+ "value": ""
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "error": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "cluster",
+ "multi": false,
+ "name": "cluster",
+ "options": [],
+ "query": "label_values(scylla_reactor_utilization, cluster)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "class": "template_variable_all",
+ "current": {
+ "selected": true,
+ "text": [
+ "All"
+ ],
+ "value": [
+ "$__all"
+ ]
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "error": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": "dc",
+ "multi": true,
+ "name": "dc",
+ "options": [],
+ "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "class": "template_variable_single",
+ "current": {
+ "text": "/var/lib/scylla",
+ "value": "/var/lib/scylla"
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "error": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "Mount path",
+ "multi": false,
+ "name": "mount_point",
+ "options": [
+ {
+ "selected": true,
+ "text": "/var/lib/scylla",
+ "value": "/var/lib/scylla"
+ }
+ ],
+ "query": "/var/lib/scylla",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "custom",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "class": "aggregation_function",
+ "current": {
+ "tags": [],
+ "text": "sum",
+ "value": "sum"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Function",
+ "multi": false,
+ "name": "func",
+ "options": [
+ {
+ "selected": true,
+ "text": "sum",
+ "value": "sum"
+ },
+ {
+ "selected": false,
+ "text": "avg",
+ "value": "avg"
+ },
+ {
+ "selected": false,
+ "text": "max",
+ "value": "max"
+ },
+ {
+ "selected": false,
+ "text": "min",
+ "value": "min"
+ },
+ {
+ "selected": false,
+ "text": "stddev",
+ "value": "stddev"
+ },
+ {
+ "selected": false,
+ "text": "stdvar",
+ "value": "stdvar"
+ }
+ ],
+ "query": "sum,avg,max,min,stddev,stdvar",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "allValue": null,
+ "class": "template_variable_all",
+ "current": {
+ "selected": true,
+ "text": [
+ "All"
+ ],
+ "value": [
+ "$__all"
+ ]
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "error": null,
+ "hide": 2,
+ "includeAll": true,
+ "multi": true,
+ "name": "all_scyllas_versions",
+ "options": [],
+ "query": "label_values(scylla_scylladb_current_version{cluster=~\"$cluster|$^\"}, version)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "class": "template_variable_all",
+ "current": {
+ "selected": true,
+ "text": [
+ "All"
+ ],
+ "value": [
+ "$__all"
+ ]
+ },
+ "datasource": "prometheus",
+ "definition": "query_result(count(up{job=\"scylla\"}) by (dc))",
+ "error": null,
+ "hide": 2,
+ "includeAll": true,
+ "multi": true,
+ "name": "count_dc",
+ "options": [],
+ "query": {
+ "query": "query_result(count(up{job=\"scylla\"}) by (dc))",
+ "refId": "StandardVariableQuery"
+ },
+ "refresh": 2,
+ "regex": "/(?<value>\\{dc=\"[^\"]+\".* \\d+) .*/",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "class": "monitor_version_var",
+ "current": {
+ "text": "master",
+ "value": "master"
+ },
+ "error": null,
+ "hide": 2,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "monitoring_version",
+ "options": [
+ {
+ "selected": true,
+ "text": "master",
+ "value": "master"
+ }
+ ],
+ "query": "master",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-30m",
+ "to": "now"
+ },
+ "timepicker": {
+ "now": true,
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "utc",
+ "title": "CQL Overview",
+ "uid": "cql-overview",
+ "version": 1
+ }`}}
diff --git a/assets/monitoring/grafana/v1alpha1/deployment.yaml b/assets/monitoring/grafana/v1alpha1/deployment.yaml
new file mode 100644
index 00000000000..000d0af9233
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/deployment.yaml
@@ -0,0 +1,124 @@
+apiVersion: apps/v1  # Go-templated manifest: template actions are expanded first, then the result is decoded as a Deployment
+kind: Deployment
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-grafana"  # monitoring name plus a fixed "-grafana" suffix
+spec:  # no replicas field is set here, so the API default applies
+ selector:
+ matchLabels:  # must stay identical to the pod template labels below
+ scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana"
+ strategy:
+ type: RollingUpdate
+ template:
+ metadata:
+ annotations:  # pod-template annotation: a different hash value changes the template and therefore rolls the pods
+ scylla-operator.scylladb.com/inputs-hash: "{{ .restartTriggerHash }}"
+ labels:
+ scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana"
+ spec:
+ securityContext:
+ fsGroup: 472  # NOTE(review): presumably the grafana user/group ID of the upstream image - confirm
+ supplementalGroups:
+ - 0  # GID 0 (root group) added as a supplemental group
+ affinity:  # value is injected from .affinity by the template action on the next line
+ {{- .affinity | toYAML | nindent 8 }}
+ tolerations:  # value is injected from .tolerations by the template action on the next line
+ {{- .tolerations | toYAML | nindent 8 }}
+ containers:
+ - name: grafana
+ image: docker.io/grafana/grafana:9.3.1  # fixed image tag
+ command:  # explicit command line; --config points into the grafana-configs volume mounted below
+ - grafana-server
+ - --packaging=docker
+ - --homepath=/usr/share/grafana
+ - --config=/var/run/configmaps/grafana-configs/grafana.ini
+ env:  # name-only entries: no value is given, so each variable is set to the empty string
+ - name: GF_PATHS_PROVISIONING  # NOTE(review): presumably blanks GF_PATHS_* defaults baked into the image so grafana.ini decides - confirm
+ - name: GF_PATHS_HOME
+ - name: GF_PATHS_DATA
+ - name: GF_PATHS_LOGS
+ - name: GF_PATHS_PLUGINS
+ - name: GF_PATHS_CONFIG
+ ports:
+ - containerPort: 3000
+ name: grafana
+ protocol: TCP
+ readinessProbe:  # HTTPS GET /api/health on port 3000; a single failed check marks the pod unready
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ timeoutSeconds: 5
+ successThreshold: 1
+ failureThreshold: 1
+ httpGet:
+ path: /api/health
+ port: 3000
+ scheme: HTTPS
+ livenessProbe:  # same endpoint as readiness; the container is restarted only after 10 consecutive failures (10 x 30s)
+ initialDelaySeconds: 30
+ periodSeconds: 30
+ timeoutSeconds: 5
+ successThreshold: 1
+ failureThreshold: 10
+ httpGet:
+ path: /api/health
+ port: 3000
+ scheme: HTTPS
+ resources:  # value is injected from .resources by the template action on the next line
+ {{- .resources | toYAML | nindent 10 }}
+ volumeMounts:
+ - name: grafana-configs  # directory that holds the grafana.ini passed via --config
+ mountPath: /var/run/configmaps/grafana-configs
+ - name: grafana-scylladb-dashboards
+ mountPath: /var/run/dashboards/scylladb
+ - name: grafana-provisioning  # the same volume is mounted six times, one file per provisioning subdirectory
+ mountPath: /var/run/configmaps/grafana-provisioning/access-control/access-control.yaml
+ subPath: access-control.yaml  # selects a single key of the ConfigMap; subPath mounts do not follow later ConfigMap updates
+ - name: grafana-provisioning
+ mountPath: /var/run/configmaps/grafana-provisioning/alerting/alerting.yaml
+ subPath: alerting.yaml
+ - name: grafana-provisioning
+ mountPath: /var/run/configmaps/grafana-provisioning/dashboards/dashboards.yaml
+ subPath: dashboards.yaml
+ - name: grafana-provisioning
+ mountPath: /var/run/configmaps/grafana-provisioning/datasources/datasources.yaml
+ subPath: datasources.yaml
+ - name: grafana-provisioning
+ mountPath: /var/run/configmaps/grafana-provisioning/notifiers/notifiers.yaml
+ subPath: notifiers.yaml
+ - name: grafana-provisioning
+ mountPath: /var/run/configmaps/grafana-provisioning/plugins/plugins.yaml
+ subPath: plugins.yaml
+ - name: grafana-admin-credentials
+ mountPath: /var/run/secrets/grafana-admin-credentials
+ - name: grafana-serving-certs
+ mountPath: /var/run/secrets/grafana-serving-certs
+ - name: prometheus-client-certs  # tls.crt / tls.key under this path are referenced by the datasource provisioning file
+ mountPath: /var/run/secrets/prometheus-client-certs
+ - name: prometheus-serving-ca  # ca-bundle.crt under this path is referenced by the datasource provisioning file
+ mountPath: /var/run/configmaps/prometheus-serving-ca
+ - name: grafana-storage
+ mountPath: /var/lib/grafana
+ volumes:  # every object name below is derived from the monitoring name except the serving cert secret
+ - name: grafana-configs
+ configMap:
+ name: "{{ .scyllaDBMonitoringName }}-grafana-configs"
+ - name: grafana-scylladb-dashboards
+ configMap:
+ name: "{{ .scyllaDBMonitoringName }}-grafana-scylladb-dashboards"
+ - name: grafana-provisioning
+ configMap:
+ name: "{{ .scyllaDBMonitoringName }}-grafana-provisioning"
+ - name: grafana-admin-credentials
+ secret:
+ secretName: "{{ .scyllaDBMonitoringName }}-grafana-admin-credentials"
+ - name: grafana-serving-certs
+ secret:
+ secretName: "{{ .servingCertSecretName }}"  # passed in as its own template value rather than derived from the monitoring name
+ - name: prometheus-client-certs
+ secret:
+ secretName: "{{ .scyllaDBMonitoringName }}-prometheus-client-grafana"
+ - name: prometheus-serving-ca
+ configMap:
+ name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-ca"
+ - name: grafana-storage  # backs /var/lib/grafana
+ emptyDir:  # pod-lifetime scratch space: contents do not survive removal of the pod
+ sizeLimit: 100Mi
diff --git a/assets/monitoring/grafana/v1alpha1/ingress.yaml b/assets/monitoring/grafana/v1alpha1/ingress.yaml
new file mode 100644
index 00000000000..f589f0cf07b
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/ingress.yaml
@@ -0,0 +1,21 @@
+apiVersion: networking.k8s.io/v1  # Go-templated Ingress that routes every configured DNS domain to the "-grafana" Service
+kind: Ingress
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-grafana"
+ annotations:  # value is injected from .ingressAnnotations by the template action on the next line
+ {{- .ingressAnnotations | toYAML | nindent 4 }}
+spec:
+ ingressClassName: {{ or .ingressClassName "null" }}  # falls back to the bare word null when empty; being unquoted, that parses as YAML null, not a string
+ rules:
+ {{- range $_, $dnsDomain := .dnsDomains }}
+ - host: "{{ $dnsDomain }}"  # one rule is emitted per entry of .dnsDomains
+ http:
+ paths:
+ - backend:
+ service:
+ name: "{{ $.scyllaDBMonitoringName }}-grafana"  # "$" addresses the root data because "." is rebound inside range
+ port:
+ number: 3000  # same number as the grafana containerPort declared in deployment.yaml
+ path: /  # together with pathType Prefix this matches every request path
+ pathType: Prefix
+ {{- end }}
diff --git a/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml b/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml
new file mode 100644
index 00000000000..91fc8662913
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/provisioning.cm.yaml
@@ -0,0 +1,35 @@
+apiVersion: v1  # Go-templated ConfigMap carrying Grafana provisioning files, one data key per file
+kind: ConfigMap
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-grafana-provisioning"  # referenced by the grafana-provisioning volume in deployment.yaml
+data:  # deployment.yaml mounts each key via subPath as /var/run/configmaps/grafana-provisioning/<kind>/<kind>.yaml
+ access-control.yaml: ""  # empty file; NOTE(review): presumably a placeholder so the mount target exists - confirm
+ alerting.yaml: ""
+ dashboards.yaml: |  # file-based dashboard provider rooted at /var/run/dashboards with a 30 second update interval
+ apiVersion: 1
+ providers:
+ - name: dashboards
+ type: file
+ updateIntervalSeconds: 30
+ options:
+ path: /var/run/dashboards
+ foldersFromFilesStructure: true
+ datasources.yaml: |  # default, non-editable Prometheus datasource over HTTPS; NOTE(review): only tlsAuthWithCACert is set, not tlsAuth - confirm the client cert/key are actually presented
+ apiVersion: 1
+ datasources:
+ - name: prometheus
+ type: prometheus
+ access: proxy
+ url: "https://{{ .scyllaDBMonitoringName }}-prometheus:9090"
+ isDefault: true
+ version: 1
+ editable: false
+ jsonData:
+ timeInterval: "5s"
+ tlsAuthWithCACert: true
+ secureJsonData:
+ tlsCACert: "$__file{/var/run/configmaps/prometheus-serving-ca/ca-bundle.crt}"
+ tlsClientCert: "$__file{/var/run/secrets/prometheus-client-certs/tls.crt}"
+ tlsClientKey: "$__file{/var/run/secrets/prometheus-client-certs/tls.key}"
+ notifiers.yaml: ""  # empty, like access-control.yaml and alerting.yaml above
+ plugins.yaml: ""
diff --git a/assets/monitoring/grafana/v1alpha1/registry.go b/assets/monitoring/grafana/v1alpha1/registry.go
new file mode 100644
index 00000000000..c56f4b24d58
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/registry.go
@@ -0,0 +1,50 @@
+package v1alpha1
+
+import (
+ _ "embed"
+
+ "github.com/scylladb/scylla-operator/pkg/assets"
+ "github.com/scylladb/scylla-operator/pkg/scheme"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+)
+
+func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] { // package-local shorthand used by the var block below
+ return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, scheme.Codecs.UniversalDeserializer()) // pre-binds the shared template funcs and the scheme's universal deserializer; "OrDie" presumably means a bad template aborts at startup — confirm in pkg/assets
+}
+
+var ( // Each asset is a pair: an unexported string filled from the named file by go:embed, and an exported template built from it when the package initializes.
+ //go:embed "deployment.yaml"
+ grafanaDeploymentTemplateString string
+ GrafanaDeploymentTemplate = ParseObjectTemplateOrDie[*appsv1.Deployment]("grafana-deployment", grafanaDeploymentTemplateString)
+
+ //go:embed "serviceaccount.yaml"
+ grafanaSATemplateString string
+ GrafanaSATemplate = ParseObjectTemplateOrDie[*corev1.ServiceAccount]("grafana-sa", grafanaSATemplateString)
+
+ //go:embed "configs.cm.yaml"
+ grafanaConfigsTemplateString string
+ GrafanaConfigsTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-configs-cm", grafanaConfigsTemplateString)
+
+ //go:embed "admin-credentials.secret.yaml"
+ grafanaAdminCredentialsSecretTemplateString string
+ GrafanaAdminCredentialsSecretTemplate = ParseObjectTemplateOrDie[*corev1.Secret]("grafana-access-credentials-secret", grafanaAdminCredentialsSecretTemplateString) // NOTE(review): template name says "access-credentials" while the file and variable say "admin-credentials"
+
+ //go:embed "provisioning.cm.yaml"
+ grafanaProvisioningConfigMapTemplateString string
+ GrafanaProvisioningConfigMapTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-provisioning-cm", grafanaProvisioningConfigMapTemplateString)
+
+ //go:embed "dashboards.cm.yaml"
+ grafanaDashboardsConfigMapTemplateString string
+ GrafanaDashboardsConfigMapTemplate = ParseObjectTemplateOrDie[*corev1.ConfigMap]("grafana-dashboard-cm", grafanaDashboardsConfigMapTemplateString)
+
+ //go:embed "service.yaml"
+ grafanaServiceTemplateString string
+ GrafanaServiceTemplate = ParseObjectTemplateOrDie[*corev1.Service]("grafana-service", grafanaServiceTemplateString)
+
+ //go:embed "ingress.yaml"
+ grafanaIngressTemplateString string
+ GrafanaIngressTemplate = ParseObjectTemplateOrDie[*networkingv1.Ingress]("grafana-ingress", grafanaIngressTemplateString)
+)
diff --git a/assets/monitoring/grafana/v1alpha1/service.yaml b/assets/monitoring/grafana/v1alpha1/service.yaml
new file mode 100644
index 00000000000..895913892e9
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-grafana"
+spec:
+  type: ClusterIP
+  ports:
+  - targetPort: grafana  # a port name, resolved against the selected pods' named port
+    port: 3000
+    protocol: TCP
+  selector:
+    scylla-operator.scylladb.com/deployment-name: "{{ .scyllaDBMonitoringName }}-grafana"
diff --git a/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml b/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml
new file mode 100644
index 00000000000..53ba45d54e2
--- /dev/null
+++ b/assets/monitoring/grafana/v1alpha1/serviceaccount.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-grafana"  # presumably referenced by the Grafana Deployment's serviceAccountName — confirm in deployment.yaml
diff --git a/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml b/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml
new file mode 100644
index 00000000000..66a7151997d
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/alerts.prometheusrule.yaml
@@ -0,0 +1,295 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: "{{ .scyllaDBMonitoringName }}-alerts"
+ labels:
+ scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}"
+spec:
+ groups:
+ - name: scylla.rules
+ rules: {{`
+    - alert: cqlNonPrepared
+      expr: cql:non_prepared > 0  # cql:non_prepared comes from the recording rules
+      for: 10s
+      labels:
+        severity: "1"
+        advisor: "cqlOptimization"
+        dashboard: "cql"
+      annotations:
+        description: 'Some queries are non-prepared'
+        summary: non prepared statements
+    - alert: cql:non_paged_no_system
+      expr: cql:non_paged > 0  # cql:non_paged comes from the recording rules
+      for: 10s
+      labels:
+        severity: "1"
+        advisor: "cqlOptimization"
+        dashboard: "cql"
+        status: "1"
+      annotations:
+        description: 'Some SELECT queries are non-paged'
+        summary: non paged statements
+    - alert: cqlNoTokenAware
+      expr: cql:non_token_aware > 0  # cql:non_token_aware comes from the recording rules
+      for: 10s
+      labels:
+        severity: "1"
+        advisor: "cqlOptimization"
+        dashboard: "cql"
+      annotations:
+        description: 'Some queries are not token-aware'
+        summary: non token aware statements
+ - alert: cqlReverseOrder
+ expr: cql:reverse_queries > 0
+ for: 10s
+ labels:
+ severity: "1"
+ advisor: "cqlOptimization"
+ dashboard: "cql"
+ annotations:
+ description: 'Some queries use reverse order'
+ summary: reverse order queries
+ - alert: cqlAllowFiltering
+ expr: cql:allow_filtering > 0
+ for: 10s
+ labels:
+ severity: "1"
+ advisor: "cqlOptimization"
+ dashboard: "cql"
+ annotations:
+ description: 'Some queries use ALLOW FILTERING'
+ summary: Allow filtering queries
+    - alert: cqlCLAny
+      expr: cql:any_queries > 0  # cql:any_queries comes from the recording rules
+      for: 10s
+      labels:
+        severity: "1"
+        advisor: "cqlOptimization"
+        dashboard: "cql"
+      annotations:
+        description: 'Some queries use Consistency Level: ANY'
+        summary: Consistency Level ANY queries
+    - alert: cqlCLAll
+      expr: cql:all_queries > 0  # cql:all_queries comes from the recording rules
+      for: 10s
+      labels:
+        severity: "1"
+        advisor: "cqlOptimization"
+        dashboard: "cql"
+      annotations:
+        description: 'Some queries use Consistency Level: ALL'
+        summary: Consistency Level ALL queries
+ - alert: nonBalancedcqlTraffic
+ expr: abs(rate(scylla_cql_updates{conditional="no"}[1m]) - scalar(avg(rate(scylla_cql_updates{conditional="no"}[1m]))))/scalar(stddev(rate(scylla_cql_updates{conditional="no"}[1m]))+100) > 2
+ for: 10s
+ labels:
+ severity: "1"
+ status: "1"
+ advisor: "balanced"
+ dashboard: "cql"
+ annotations:
+ description: 'CQL queries are not balanced among shards {{ $labels.instance }} shard {{ $labels.shard }}'
+ summary: CQL queries are not balanced
+ - alert: nodeLocalErrors
+ expr: sum(errors:local_failed) by (cluster, instance) > 0
+ for: 10s
+ labels:
+ severity: "1"
+ advisor: "operationError"
+ dashboard: "scylla-detailed"
+ annotations:
+ description: 'Some operation failed at the replica side'
+ summary: Replica side Level error
+ - alert: nodeIOErrors
+ expr: sum(rate(scylla_reactor_aio_errors[60s])) by (cluster, instance) > 0
+ for: 10s
+ labels:
+ severity: "1"
+ advisor: "operationError"
+ dashboard: "OS-master"
+ annotations:
+ description: 'IO Errors can indicate a node with a faulty disk {{ $labels.instance }}'
+ summary: IO Disk Error
+ - alert: nodeCLErrors
+ expr: sum(errors:operation_unavailable) by (cluster) > 0
+ for: 10s
+ labels:
+ severity: "1"
+ advisor: "operationError"
+ dashboard: "scylla-detailed"
+ annotations:
+ description: 'Some operation failed due to consistency level'
+ summary: Consistency Level error
+ - alert: preparedCacheEviction
+ expr: sum(rate(scylla_cql_prepared_cache_evictions[2m])) by (cluster) + sum(rate(scylla_cql_authorized_prepared_statements_cache_evictions[2m])) by (cluster) > 100
+ for: 5m
+ labels:
+ severity: "1"
+ advisor: "preparedEviction"
+ dashboard: "scylla-detailed"
+ annotations:
+ description: 'The prepared-statement cache is being continuously evicted, which could indicate a problem in your prepared-statement usage logic.'
+ summary: Prepared cache eviction
+ - alert: heavyCompaction
+ expr: max(scylla_scheduler_shares{group="compaction"}) by (cluster) >= 1000
+ for: 20m
+ labels:
+ severity: "1"
+ advisor: "heavyCompaction"
+ dashboard: "scylla-detailed"
+ annotations:
+ description: 'Compaction load increases to a level it can interfere with the system behaviour. If this persists set the compaction share to a static level.'
+ summary: Heavy compaction load
+ - alert: shedRequests
+ expr: max(sum(rate(scylla_transport_requests_shed[60s])) by (instance,cluster)/sum(rate(scylla_transport_requests_served{}[60s])) by (instance, cluster)) by(cluster) > 0.01
+ for: 5m
+ labels:
+ severity: "1"
+ advisor: "systemOverload"
+ dashboard: "scylla-detailed"
+ annotations:
+ description: 'More than 1% of the requests got shed, this is an indication of an overload, consider system resize.'
+ summary: System is overloaded
+    - alert: cappedTombstone
+      expr: changes(scylla_sstables_capped_tombstone_deletion_time[1h]) > 0  # the metric changed at least once in the last hour
+      for: 1m
+      labels:
+        severity: "1"
+        advisor: "cappedTombstone"
+        dashboard: "scylla-detailed"
+      annotations:
+        description: 'Tombstone delete time was set too far in the future and was capped'
+        summary: Tombstone delete time is capped
+ - alert: InstanceDown
+ expr: up{job="scylla"} == 0
+ for: 30s
+ labels:
+ severity: "2"
+ annotations:
+ description: '{{ $labels.instance }} has been down for more than 30 seconds.'
+ summary: Instance {{ $labels.instance }} down
+ - alert: InstanceDown
+ expr: absent(scylla_transport_requests_served{job="scylla", shard="0"})
+ for: 1m
+ labels:
+ severity: "2"
+ annotations:
+ description: '{{ $labels.instance }} instance is shutting down.'
+ summary: Instance {{ $labels.instance }} down
+ - alert: InstanceDown
+ expr: scylla_node_operation_mode > 3
+ for: 30s
+ labels:
+ severity: "2"
+ annotations:
+ description: '{{ $labels.instance }} instance is shutting down.'
+ summary: Instance {{ $labels.instance }} down
+ - alert: DiskFull
+ expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"}
+ * 100 < 35
+ for: 30s
+ labels:
+ severity: "2"
+ annotations:
+ description: '{{ $labels.instance }} has less than 35% free disk space.'
+ summary: Instance {{ $labels.instance }} low disk space
+ - alert: DiskFull
+ expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"}
+ * 100 < 25
+ for: 30s
+ labels:
+ severity: "3"
+ annotations:
+ description: '{{ $labels.instance }} has less than 25% free disk space.'
+ summary: Instance {{ $labels.instance }} low disk space
+ - alert: DiskFull
+ expr: node_filesystem_avail_bytes{mountpoint="/var/lib/scylla"} / node_filesystem_size_bytes{mountpoint="/var/lib/scylla"}
+ * 100 < 15
+ for: 30s
+ labels:
+ severity: "4"
+ annotations:
+ description: '{{ $labels.instance }} has less than 15% free disk space.'
+ summary: Instance {{ $labels.instance }} low disk space
+ - alert: DiskFull
+ expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
+ * 100 < 20
+ for: 30s
+ labels:
+ severity: "3"
+ annotations:
+ description: '{{ $labels.instance }} has less than 20% free disk space on the root partition.'
+ summary: Instance {{ $labels.instance }} low disk space
+ - alert: NoCql
+ expr: scylla_manager_healthcheck_cql_status == -1
+ for: 30s
+ labels:
+ severity: "2"
+ annotations:
+ description: '{{ $labels.host }} has denied CQL connection for more than 30 seconds.'
+ summary: Instance {{ $labels.host }} no CQL connection
+ - alert: HighLatencies
+ expr: wlatencyp95{by="instance"} > 100000
+ for: 5m
+ labels:
+ severity: "1"
+ annotations:
+ description: '{{ $labels.instance }} has 95% high latency for more than 5 minutes.'
+ summary: Instance {{ $labels.instance }} High Write Latency
+ - alert: HighLatencies
+ expr: wlatencya{by="instance"} >10000
+ for: 5m
+ labels:
+ severity: "1"
+ annotations:
+ description: '{{ $labels.instance }} has average high latency for more than 5 minutes.'
+ summary: Instance {{ $labels.instance }} High Write Latency
+ - alert: HighLatencies
+ expr: rlatencyp95{by="instance"} > 100000
+ for: 5m
+ labels:
+ severity: "1"
+ annotations:
+ description: '{{ $labels.instance }} has 95% high latency for more than 5 minutes.'
+ summary: Instance {{ $labels.instance }} High Read Latency
+ - alert: HighLatencies
+ expr: rlatencya{by="instance"} >10000
+ for: 5m
+ labels:
+ severity: "1"
+ annotations:
+ description: '{{ $labels.instance }} has average high latency for more than 5 minutes.'
+ summary: Instance {{ $labels.instance }} High Read Latency
+ - alert: BackupFailed
+ expr: (sum(scylla_manager_scheduler_run_total{type=~"backup", status="ERROR"}) or vector(0)) - (sum(scylla_manager_scheduler_run_total{type=~"backup", status="ERROR"} offset 3m) or vector(0)) > 0
+ for: 10s
+ labels:
+ severity: "1"
+ annotations:
+ description: 'Backup failed'
+ summary: Backup task failed
+ - alert: RepairFailed
+ expr: (sum(scylla_manager_scheduler_run_total{type=~"repair", status="ERROR"}) or vector(0)) - (sum(scylla_manager_scheduler_run_total{type=~"repair", status="ERROR"} offset 3m) or vector(0)) > 0
+ for: 10s
+ labels:
+ severity: "1"
+ annotations:
+ description: 'Repair failed'
+ summary: Repair task failed
+ - alert: restart
+ expr: resets(scylla_gossip_heart_beat[1h])>0
+ for: 10s
+ labels:
+ severity: "1"
+ annotations:
+ description: 'Node restarted'
+ summary: Instance {{ $labels.instance }} restarted
+ - alert: oomKill
+ expr: changes(node_vmstat_oom_kill[1h])>0
+ for: 10s
+ labels:
+ severity: "2"
+ annotations:
+ description: 'OOM Kill on {{ $labels.instance }}'
+ summary: A process was terminated on Instance {{ $labels.instance }}
+`}}
diff --git a/assets/monitoring/prometheus/v1/ingress.yaml b/assets/monitoring/prometheus/v1/ingress.yaml
new file mode 100644
index 00000000000..f318487fd94
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/ingress.yaml
@@ -0,0 +1,21 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-prometheus"
+  annotations:
+    {{- .ingressAnnotations | toYAML | nindent 4 }}
+spec:
+  ingressClassName: {{ or .ingressClassName "null" }}  # falls back to a bare YAML null when no class is given
+  rules:
+  {{- range $host := .dnsDomains }}
+  - host: "{{ $host }}"
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: "{{ $.scyllaDBMonitoringName }}-prometheus"
+            port:
+              number: 9090
+  {{- end }}
diff --git a/assets/monitoring/prometheus/v1/prometheus.yaml b/assets/monitoring/prometheus/v1/prometheus.yaml
new file mode 100644
index 00000000000..7765d380f90
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/prometheus.yaml
@@ -0,0 +1,50 @@
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}"
+spec:
+  serviceAccountName: "{{ .scyllaDBMonitoringName }}-prometheus"
+  securityContext:
+    runAsNonRoot: true
+    runAsUser: 65534
+    fsGroup: 65534
+  web:
+    pageTitle: "ScyllaDB Prometheus"
+    httpConfig:
+      http2: true
+    tlsConfig:
+      cert:
+        secret:
+          name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-certs"
+          key: "tls.crt"
+      keySecret:
+        name: "{{ .scyllaDBMonitoringName }}-prometheus-serving-certs"
+        key: "tls.key"
+      client_ca:
+        configMap:
+          name: "{{ .scyllaDBMonitoringName }}-prometheus-client-ca"
+          key: "ca-bundle.crt"
+      # TODO: switch to "RequireAndVerifyClientCert" once the prometheus-operator either stops requiring
+      # certs for /-/readyz or can run exec probes that are able to read the certs.
+      clientAuthType: "RequestClientCert"
+  serviceMonitorSelector:
+    matchLabels: {}  # NOTE(review): an empty selector matches every ServiceMonitor in scope, unlike the labelled ruleSelector below — confirm this is intended
+  affinity:
+    {{- .affinity | toYAML | nindent 4 }}
+  tolerations:
+    {{- .tolerations | toYAML | nindent 4 }}
+  resources:
+    {{- .resources | toYAML | nindent 4 }}
+  alerting:
+    alertmanagers:
+    - namespace: "{{ .namespace }}"
+      name: "{{ .scyllaDBMonitoringName }}"
+      port: web
+  ruleSelector:
+    matchLabels:
+      scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}"
+  {{- if .volumeClaimTemplate }}
+  storage:
+    volumeClaimTemplate:
+      {{- .volumeClaimTemplate | toYAML | nindent 6 }}
+  {{- end }}
diff --git a/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml b/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml
new file mode 100644
index 00000000000..3386f9d730a
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/recording.prometheusrule.yaml
@@ -0,0 +1,229 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-recording"
+  labels:
+    scylla-operator.scylladb.com/scylladbmonitoring-name: "{{ .scyllaDBMonitoringName }}"  # the label the Prometheus ruleSelector matches on
+spec:
+ groups:
+ - name: scylla.rules
+ rules:
+ - record: cql:all_shardrate1m
+ expr: sum(rate(scylla_cql_reads[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_cql_inserts[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_updates[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_deletes[60s])) by (cluster, dc, instance, shard)
+ - record: cql:all_system_shardrate1m
+ expr: sum(rate(scylla_cql_reads_per_ks{ks="system"}[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_cql_inserts_per_ks{ks="system"}[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_updates_per_ks{ks="system"}[60s]) ) by (cluster, dc, instance, shard) + sum( rate(scylla_cql_deletes_per_ks{ks="system"}[60s])) by (cluster, dc, instance, shard)
+ - record: cql:local_shardrate1m
+ expr: sum(rate(scylla_storage_proxy_coordinator_reads_local_node[60s])) by (cluster, dc, instance, shard) + sum(rate(scylla_storage_proxy_coordinator_total_write_attempts_local_node[60s]) ) by (cluster, dc, instance, shard)
+ - record: cql:all_rate1m
+ expr: sum(cql:all_shardrate1m) by (cluster, dc, instance)
+ - record: cql:non_token_aware
+ expr: (sum(cql:all_rate1m) by (cluster) >bool 100) * clamp_min(1-(sum(cql:local_shardrate1m) by (cluster) / sum(cql:all_rate1m) by (cluster)), 0)
+ - record: cql:non_system_prepared1m
+ expr: clamp_min(sum(rate(scylla_query_processor_statements_prepared[1m])) by (cluster, dc, instance, shard) - cql:all_system_shardrate1m, 0)
+ - record: cql:non_prepared
+ expr: (sum(cql:non_system_prepared1m) by (cluster) >bool 100) * (sum(cql:non_system_prepared1m) by (cluster) / clamp_min(sum(cql:all_rate1m) by (cluster)- sum(cql:all_system_shardrate1m) by (cluster), 0.001))
+ - record: cql:non_paged_no_system1m
+ expr: clamp_min(sum(rate(scylla_cql_unpaged_select_queries[60s])) by (cluster, dc, instance) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster, dc, instance), 0)
+    - record: cql:non_paged_no_system  # per-instance counterpart of cql:non_paged; built from the 1m series, not from its own output
+      expr: (sum(cql:non_paged_no_system1m) by (cluster, dc, instance) >bool 100) * sum(cql:non_paged_no_system1m) by (cluster, dc, instance)/clamp_min(sum(rate(scylla_cql_reads[60s]))by (cluster, dc, instance) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster, dc, instance), 0.01)
+ - record: cql:non_paged
+ expr: (sum(cql:non_paged_no_system1m) by (cluster) >bool 100) * sum(cql:non_paged_no_system1m) by (cluster)/clamp_min(sum(rate(scylla_cql_reads[60s]))by (cluster) - sum(rate(scylla_cql_unpaged_select_queries_per_ks{ks="system"}[60s])) by (cluster), 0.01)
+ - record: cql:reverse_queries
+ expr: sum(rate(scylla_cql_reverse_queries[60s])) by (cluster)/ sum(rate(scylla_cql_reads[60s])) by (cluster)
+ - record: cql:allow_filtering
+ expr: sum(rate(scylla_cql_filtered_read_requests[60s])) by (cluster)/ sum(rate(scylla_cql_reads[60s])) by (cluster)
+ - record: cql:any_queries
+ expr: sum(rate(scylla_query_processor_queries{consistency_level="ANY"}[60s])) by (cluster) >bool 0
+ - record: cql:all_queries
+ expr: sum(rate(scylla_query_processor_queries{consistency_level="ALL"}[60s])) by (cluster) >bool 0
+ - record: errors:operation_unavailable
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_unavailable[60s])) by (cluster, dc, instance) + sum(rate(scylla_storage_proxy_coordinator_write_unavailable[60s])) by (cluster, dc, instance) + sum(rate(scylla_storage_proxy_coordinator_range_unavailable[60s])) by (cluster, dc, instance)
+ - record: errors:local_failed
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_errors_local_node[60s])) by (cluster, dc, instance) + sum(rate(scylla_storage_proxy_coordinator_write_errors_local_node[60s])) by (cluster, dc, instance)
+ - record: errors:nodes_total
+ expr: errors:local_failed + errors:operation_unavailable
+ - record: manager:repair_done_ts
+ expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="repair"}[60s])) by (cluster) > 0) or on(cluster) manager:repair_done_ts
+ - record: manager:backup_done_ts
+ expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="backup"}[60s])) by (cluster) > 0) or on(cluster) manager:backup_done_ts
+ - record: manager:repair_fail_ts
+ expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="repair"}[60s])) by (cluster) > 0) or on(cluster) manager:repair_fail_ts
+ - record: manager:backup_fail_ts
+ expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="backup"}[60s])) by (cluster) > 0) or on(cluster) manager:backup_fail_ts
+ - record: manager:repair_progress
+ expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster))
+ - record: scylla_manager_repair_progress
+ expr: sum(manager:repair_progress) by (cluster)
+ labels:
+ level: "1"
+ by: "cluster"
+ - record: manager:backup_progress
+ expr: (max(scylla_manager_scheduler_run_indicator{type="backup"}) by (cluster) >bool 0)*((max(scylla_manager_backup_files_size_bytes) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_backup_files_uploaded_bytes) by (cluster) + sum(scylla_manager_backup_files_skipped_bytes) by (cluster) + sum(scylla_manager_backup_files_failed_bytes)by(cluster))/sum(scylla_manager_backup_files_size_bytes>=0) by (cluster))
+ - record: scylla_manager_backup_progress
+ expr: sum(manager:backup_progress) by (cluster)
+ labels:
+ level: "1"
+ by: "cluster"
+ - record: wlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: wlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: wlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: wlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: rlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: rlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: rlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: rlatencyp99
+ expr: histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: wlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: wlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: wlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: wlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: rlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name, le))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: rlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, instance, scheduling_group_name, le))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: rlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, dc, scheduling_group_name, le))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: rlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{}[60s])) by (cluster, scheduling_group_name, le))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: wlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name, shard)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, dc, instance, scheduling_group_name, shard)
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: wlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, dc, scheduling_group_name, instance)
+ labels:
+ by: "instance"
+ level: "1"
+ - record: wlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster, dc,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, scheduling_group_name, dc)
+ labels:
+ by: "dc"
+ level: "1"
+ - record: wlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{}[60s])) by (cluster,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_write_latency_count{}[60s])) by (cluster, scheduling_group_name)
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: rlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc, instance, shard,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, instance, shard, scheduling_group_name)
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: rlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc, instance,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, instance, scheduling_group_name)
+ labels:
+ by: "instance"
+ level: "1"
+ - record: rlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster, dc,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, dc, scheduling_group_name)
+ labels:
+ by: "dc"
+ level: "1"
+ - record: rlatencya
+ expr: sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{}[60s])) by (cluster,scheduling_group_name)/sum(rate(scylla_storage_proxy_coordinator_read_latency_count{}[60s])) by (cluster, scheduling_group_name)
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: casrlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: casrlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, instance, le, scheduling_group_name))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: casrlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, dc, le, scheduling_group_name))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: casrlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_read_latency_bucket{}[60s])) by (cluster, le, scheduling_group_name))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: caswlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, instance, shard, le, scheduling_group_name))
+ labels:
+ by: "instance,shard"
+ level: "1"
+ - record: caswlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, instance, le, scheduling_group_name))
+ labels:
+ by: "instance"
+ level: "1"
+ - record: caswlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, dc, le, scheduling_group_name))
+ labels:
+ by: "dc"
+ level: "1"
+ - record: caswlatencyp95
+ expr: histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_cas_write_latency_bucket{}[60s])) by (cluster, le, scheduling_group_name))
+ labels:
+ by: "cluster"
+ level: "1"
+ - record: all_scheduling_group
+ expr: sum(scylla_storage_proxy_coordinator_write_latency_count>0) by (cluster, scheduling_group_name)
+
diff --git a/assets/monitoring/prometheus/v1/registry.go b/assets/monitoring/prometheus/v1/registry.go
new file mode 100644
index 00000000000..f8f76077cfc
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/registry.go
@@ -0,0 +1,51 @@
+package v1
+
+import (
+ _ "embed"
+
+ "github.com/scylladb/scylla-operator/pkg/assets"
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ "github.com/scylladb/scylla-operator/pkg/scheme"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ rbacv1 "k8s.io/api/rbac/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+)
+
+func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] { // package-local shorthand used by the var block below
+ return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, scheme.Codecs.UniversalDeserializer()) // pre-binds the shared template funcs and the scheme's universal deserializer; "OrDie" presumably means a bad template aborts at startup — confirm in pkg/assets
+}
+
+var ( // Each asset is a pair: an unexported string filled from the named file by go:embed, and an exported template built from it when the package initializes.
+ //go:embed "prometheus.yaml"
+ prometheusTemplateString string
+ PrometheusTemplate = ParseObjectTemplateOrDie[*monitoringv1.Prometheus]("prometheus", prometheusTemplateString)
+
+ //go:embed "serviceaccount.yaml"
+ prometheusSATemplateString string
+ PrometheusSATemplate = ParseObjectTemplateOrDie[*corev1.ServiceAccount]("prometheus-sa", prometheusSATemplateString)
+
+ //go:embed "rolebinding.yaml"
+ prometheusRoleBindingTemplateString string
+ PrometheusRoleBindingTemplate = ParseObjectTemplateOrDie[*rbacv1.RoleBinding]("prometheus-rolebinding", prometheusRoleBindingTemplateString)
+
+ //go:embed "service.yaml"
+ prometheusServiceTemplateString string
+ PrometheusServiceTemplate = ParseObjectTemplateOrDie[*corev1.Service]("prometheus-service", prometheusServiceTemplateString)
+
+ //go:embed "scylladb.servicemonitor.yaml"
+ scyllaDBServiceMonitorTemplateString string
+ ScyllaDBServiceMonitorTemplate = ParseObjectTemplateOrDie[*monitoringv1.ServiceMonitor]("scylladb-servicemonitor", scyllaDBServiceMonitorTemplateString)
+
+ //go:embed "recording.prometheusrule.yaml"
+ recordingPrometheusRuleTemplateString string
+ RecordingPrometheusRuleTemplate = ParseObjectTemplateOrDie[*monitoringv1.PrometheusRule]("recording-prometheus-rule", recordingPrometheusRuleTemplateString)
+
+ //go:embed "alerts.prometheusrule.yaml"
+ alertsPrometheusRuleTemplateString string
+ AlertsPrometheusRuleTemplate = ParseObjectTemplateOrDie[*monitoringv1.PrometheusRule]("alerts-prometheus-rule", alertsPrometheusRuleTemplateString) // this file wraps its rules in a raw template string so Prometheus's own {{ $labels... }} placeholders are emitted literally
+
+ //go:embed "ingress.yaml"
+ prometheusIngressTemplateString string
+ PrometheusIngressTemplate = ParseObjectTemplateOrDie[*networkingv1.Ingress]("prometheus-ingress", prometheusIngressTemplateString)
+)
diff --git a/assets/monitoring/prometheus/v1/rolebinding.yaml b/assets/monitoring/prometheus/v1/rolebinding.yaml
new file mode 100644
index 00000000000..fdd6fb46119
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/rolebinding.yaml
@@ -0,0 +1,12 @@
+# RoleBinding template: binds the per-monitoring "-prometheus" ServiceAccount
+# to the scylladb:monitoring:prometheus ClusterRole.
+# Templated values are quoted so an unexpected expansion cannot change the YAML type.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-prometheus"
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: scylladb:monitoring:prometheus
+subjects:
+- kind: ServiceAccount
+  name: "{{ .scyllaDBMonitoringName }}-prometheus"
+  namespace: "{{ .namespace }}"
diff --git a/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml b/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml
new file mode 100644
index 00000000000..92daafa20e8
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/scylladb.servicemonitor.yaml
@@ -0,0 +1,74 @@
+# ServiceMonitor template with two scrape endpoints: the "node-exporter" port and
+# the "prometheus" port of the services matched by the rendered endpoints selector.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-scylladb"
+spec:
+  selector:
+    {{- .endpointsSelector | toYAML | nindent 4 }}
+  jobLabel: scylla/cluster
+  endpoints:
+  - port: node-exporter
+    honorLabels: false
+    relabelings:
+    # instance = __address__ without its numeric port.
+    - sourceLabels: [__address__]
+      regex: '(.*):\d+'
+      targetLabel: instance
+      replacement: '${1}'
+    # instance = __address__ when it carries no port at all.
+    - sourceLabels: [__address__]
+      regex: '([^:]+)'
+      targetLabel: instance
+      replacement: '${1}'
+    # Rewrite the scrape address to port 9100 on the instance.
+    - sourceLabels: [instance]
+      regex: '(.*)'
+      targetLabel: __address__
+      replacement: '${1}:9100'
+    - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+      regex: '(.+)'
+      targetLabel: cluster
+      replacement: '${1}'
+    - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+      regex: '(.+)'
+      targetLabel: dc
+      replacement: '${1}'
+  - port: prometheus
+    honorLabels: false
+    metricRelabelings:
+    # Series that carry a non-empty "version" label get the constant labels below.
+    - sourceLabels: [version]
+      regex: '(.+)'
+      targetLabel: CPU
+      replacement: 'cpu'
+    - sourceLabels: [version]
+      regex: '(.+)'
+      targetLabel: CQL
+      replacement: 'cql'
+    - sourceLabels: [version]
+      regex: '(.+)'
+      targetLabel: OS
+      replacement: 'os'
+    - sourceLabels: [version]
+      regex: '(.+)'
+      targetLabel: IO
+      replacement: 'io'
+    - sourceLabels: [version]
+      regex: '(.+)'
+      targetLabel: Errors
+      replacement: 'errors'
+    - regex: 'help|exported_instance'
+      action: labeldrop
+    # svr = the leading "major.minor" (plus optional ".patch") part of version.
+    - sourceLabels: [version]
+      regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*'
+      replacement: '$1$2'
+      targetLabel: svr
+    relabelings:
+    # instance = __address__ without its port.
+    - sourceLabels: [__address__]
+      regex: '(.*):.+'
+      targetLabel: instance
+      replacement: '${1}'
+    - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+      regex: '(.+)'
+      targetLabel: cluster
+      replacement: '${1}'
+    - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+      regex: '(.+)'
+      targetLabel: dc
+      replacement: '${1}'
diff --git a/assets/monitoring/prometheus/v1/service.yaml b/assets/monitoring/prometheus/v1/service.yaml
new file mode 100644
index 00000000000..d6ddf3f15f6
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/service.yaml
@@ -0,0 +1,14 @@
+# Service template exposing port 9090 ("web") of the pods labelled as the
+# Prometheus instance of this monitoring object.
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-prometheus"
+spec:
+  type: ClusterIP
+  ports:
+  - name: web
+    port: 9090
+    protocol: TCP
+    targetPort: web
+  selector:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/instance: "{{ .scyllaDBMonitoringName }}"
diff --git a/assets/monitoring/prometheus/v1/serviceaccount.yaml b/assets/monitoring/prometheus/v1/serviceaccount.yaml
new file mode 100644
index 00000000000..952477d15c5
--- /dev/null
+++ b/assets/monitoring/prometheus/v1/serviceaccount.yaml
@@ -0,0 +1,4 @@
+# ServiceAccount template; the name matches the subject used in rolebinding.yaml.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: "{{ .scyllaDBMonitoringName }}-prometheus"
diff --git a/examples/monitoring/v1alpha1/scylladbmonitoring.yaml b/examples/monitoring/v1alpha1/scylladbmonitoring.yaml
new file mode 100644
index 00000000000..86d8b699f5b
--- /dev/null
+++ b/examples/monitoring/v1alpha1/scylladbmonitoring.yaml
@@ -0,0 +1,28 @@
+# Example ScyllaDBMonitoring.
+# Replace the scylla/cluster label value with the name of your ScyllaCluster.
+apiVersion: scylla.scylladb.com/v1alpha1
+kind: ScyllaDBMonitoring
+metadata:
+  name: example
+spec:
+  endpointsSelector:
+    matchLabels:
+      app.kubernetes.io/name: scylla
+      scylla-operator.scylladb.com/scylla-service-type: identity
+      scylla/cluster: replace-with-your-scyllacluster-name
+  components:
+    prometheus:
+      storage:
+        volumeClaimTemplate:
+          spec:
+            resources:
+              requests:
+                storage: 1Gi
+    grafana:
+      exposeOptions:
+        webInterface:
+          ingress:
+            ingressClassName: haproxy
+            dnsDomains:
+            - test-grafana.test.svc.cluster.local
+            annotations:
+              # Quoted on purpose: annotation values must be strings, not booleans.
+              haproxy-ingress.github.io/ssl-passthrough: "true"
diff --git a/hack/ci-deploy.sh b/hack/ci-deploy.sh
index c0f9cace888..7afbbbbbca9 100755
--- a/hack/ci-deploy.sh
+++ b/hack/ci-deploy.sh
@@ -80,4 +80,5 @@ kubectl -n haproxy-ingress rollout status --timeout=5m deployment.apps/haproxy-i
kubectl wait --for condition=established crd/nodeconfigs.scylla.scylladb.com
kubectl wait --for condition=established crd/scyllaoperatorconfigs.scylla.scylladb.com
+kubectl wait --for condition=established crd/scylladbmonitorings.scylla.scylladb.com
kubectl wait --for condition=established $( find "${deploy_dir}/prometheus-operator/" -name '*.crd.yaml' -printf '-f=%p\n' )
diff --git a/helm/scylla-operator/templates/clusterrole_def.yaml b/helm/scylla-operator/templates/clusterrole_def.yaml
index e766be874d2..d4a0c2c0ed9 100644
--- a/helm/scylla-operator/templates/clusterrole_def.yaml
+++ b/helm/scylla-operator/templates/clusterrole_def.yaml
@@ -17,6 +17,7 @@ rules:
- ""
resources:
- nodes
+ - endpoints
verbs:
- get
- list
@@ -65,6 +66,7 @@ rules:
- list
- watch
- create
+ - delete
- update
- patch
- apiGroups:
@@ -83,14 +85,16 @@ rules:
- apps
resources:
- statefulsets
+ - daemonsets
+ - deployments
verbs:
- create
- - delete
- get
- list
- - patch
- - update
- watch
+ - update
+ - patch
+ - delete
- apiGroups:
- apps
resources:
@@ -101,6 +105,7 @@ rules:
- scylla.scylladb.com
resources:
- scyllaclusters
+ - scylladbmonitorings
verbs:
- create
- delete
@@ -113,6 +118,7 @@ rules:
- scylla.scylladb.com
resources:
- scyllaclusters/status
+ - scylladbmonitorings/status
verbs:
- get
- list
@@ -155,18 +161,6 @@ rules:
- patch
- update
- watch
-- apiGroups:
- - apps
- resources:
- - daemonsets
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
- apiGroups:
- scylla.scylladb.com
resources:
@@ -265,3 +259,17 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - monitoring.coreos.com
+ resources:
+ - prometheuses
+ - prometheusrules
+ - servicemonitors
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - patch
+ - update
+ - delete
diff --git a/helm/scylla-operator/templates/edit_clusterrole.yaml b/helm/scylla-operator/templates/edit_clusterrole.yaml
index b2875778341..8e34147109e 100644
--- a/helm/scylla-operator/templates/edit_clusterrole.yaml
+++ b/helm/scylla-operator/templates/edit_clusterrole.yaml
@@ -10,6 +10,7 @@ rules:
- scylla.scylladb.com
resources:
- scyllaclusters
+ - scylladbmonitorings
verbs:
- create
- patch
diff --git a/helm/scylla-operator/templates/view_clusterrole.yaml b/helm/scylla-operator/templates/view_clusterrole.yaml
index b5cc095aaef..44134b6dabe 100644
--- a/helm/scylla-operator/templates/view_clusterrole.yaml
+++ b/helm/scylla-operator/templates/view_clusterrole.yaml
@@ -11,6 +11,7 @@ rules:
- scylla.scylladb.com
resources:
- scyllaclusters
+ - scylladbmonitorings
verbs:
- get
- list
diff --git a/pkg/assets/decode.go b/pkg/assets/decode.go
new file mode 100644
index 00000000000..8507aa64a72
--- /dev/null
+++ b/pkg/assets/decode.go
@@ -0,0 +1,49 @@
+// Copyright (C) 2023 ScyllaDB
+
+package assets
+
+import (
+ "fmt"
+ "text/template"
+
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/klog/v2"
+)
+
+// Decode decodes data with the given decoder and asserts the decoded object to type T.
+// It returns the zero value of T and an error when decoding fails or when the decoded
+// object is not of type T.
+func Decode[T any](data []byte, decoder runtime.Decoder) (T, error) {
+	obj, _, err := decoder.Decode(data, nil, nil)
+	if err != nil {
+		return *new(T), fmt.Errorf("can't decode object: %w", err)
+	}
+
+	typedObj, ok := obj.(T)
+	if !ok {
+		// err is always nil on this path, so there is nothing to wrap.
+		// Report the actual and the expected type instead (%T, not the boolean verb %t).
+		return *new(T), fmt.Errorf("can't cast decoded object of type %T to type %T", obj, *new(T))
+	}
+
+	return typedObj, nil
+}
+
+// RenderAndDecode executes tmpl with inputs and decodes the rendered bytes into T.
+// It returns the decoded object together with the rendered text. When rendering fails
+// the returned text is empty; when decoding fails the rendered text is still returned
+// next to the error.
+func RenderAndDecode[T runtime.Object](tmpl *template.Template, inputs any, decoder runtime.Decoder) (T, string, error) {
+	var zero T
+
+	rendered, err := RenderTemplate(tmpl, inputs)
+	if err != nil {
+		return zero, "", fmt.Errorf("can't render template: %w", err)
+	}
+	renderedString := string(rendered)
+
+	obj, err := Decode[T](rendered, decoder)
+	if err == nil {
+		return obj, renderedString, nil
+	}
+
+	// A rendered Secret may carry sensitive data, so its content is kept out of the log.
+	loggedTemplate := renderedString
+	if _, isSecret := runtime.Object(zero).(*corev1.Secret); isSecret {
+		loggedTemplate = ""
+	}
+	klog.Errorf("Can't decode rendered template %q: %v. Template:\n%s", tmpl.Name(), err, loggedTemplate)
+
+	return zero, renderedString, fmt.Errorf("can't decode rendered template %q: %w", tmpl.Name(), err)
+}
diff --git a/pkg/assets/object_template.go b/pkg/assets/object_template.go
new file mode 100644
index 00000000000..0232c0917ed
--- /dev/null
+++ b/pkg/assets/object_template.go
@@ -0,0 +1,34 @@
+package assets
+
+import (
+ "fmt"
+ "text/template"
+
+ "github.com/scylladb/scylla-operator/pkg/helpers"
+ "k8s.io/apimachinery/pkg/runtime"
+)
+
+// ObjectTemplate pairs a parsed text template with the decoder used to turn its
+// rendered output into an object of type T.
+type ObjectTemplate[T runtime.Object] struct {
+ // tmpl is the parsed template executed by RenderObject.
+ tmpl *template.Template
+ // decoder decodes the rendered template into T.
+ decoder runtime.Decoder
+}
+
+// ParseObjectTemplate parses tmplString into a template called name, with funcMap
+// registered before parsing, and stores it together with decoder.
+// On a parse failure it returns the zero ObjectTemplate and a wrapped error.
+func ParseObjectTemplate[T runtime.Object](name, tmplString string, funcMap template.FuncMap, decoder runtime.Decoder) (ObjectTemplate[T], error) {
+	var objectTemplate ObjectTemplate[T]
+
+	parsed, err := template.New(name).Funcs(funcMap).Parse(tmplString)
+	if err != nil {
+		return objectTemplate, fmt.Errorf("can't parse template %q: %w", name, err)
+	}
+
+	objectTemplate.tmpl = parsed
+	objectTemplate.decoder = decoder
+
+	return objectTemplate, nil
+}
+
+// ParseObjectTemplateOrDie is ParseObjectTemplate with the result passed through
+// helpers.Must, so no error is returned to the caller.
+func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string, funcMap template.FuncMap, decoder runtime.Decoder) ObjectTemplate[T] {
+	objectTemplate, err := ParseObjectTemplate[T](name, tmplString, funcMap, decoder)
+	return helpers.Must(objectTemplate, err)
+}
+
+// RenderObject renders the template with inputs and decodes the result into T.
+// It returns the object, the rendered text and any error from RenderAndDecode.
+func (t *ObjectTemplate[T]) RenderObject(inputs any) (T, string, error) {
+	obj, rendered, err := RenderAndDecode[T](t.tmpl, inputs, t.decoder)
+	return obj, rendered, err
+}
diff --git a/pkg/assets/template.go b/pkg/assets/template.go
index c6784b2e7af..2c15acbb0ec 100644
--- a/pkg/assets/template.go
+++ b/pkg/assets/template.go
@@ -2,16 +2,50 @@ package assets
import (
"bytes"
+ "encoding/base64"
"fmt"
+ "strings"
"text/template"
+
+ "sigs.k8s.io/yaml"
)
-func RenderTemplate(tmpl *template.Template, data any) ([]byte, error) {
+// TemplateFuncs maps the helper names available inside templates to their
+// implementations in this file.
+var TemplateFuncs template.FuncMap = template.FuncMap{
+ "toYAML": marshalYAML,
+ "indent": indent,
+ "nindent": nindent,
+ "toBytes": toBytes,
+ "toBase64": toBase64,
+}
+
+// marshalYAML serializes v to YAML and trims leading and trailing whitespace from
+// the result. It returns an empty string and a wrapped error when marshalling fails.
+func marshalYAML(v any) (string, error) {
+	// The local is not called "bytes" so that it doesn't shadow the imported bytes package.
+	data, err := yaml.Marshal(v)
+	if err != nil {
+		return "", fmt.Errorf("can't marshal value to YAML: %w", err)
+	}
+
+	return strings.TrimSpace(string(data)), nil
+}
+
+// indent prefixes every line of s, including the first one, with spaceCount spaces.
+func indent(spaceCount int, s string) string {
+	prefix := strings.Repeat(" ", spaceCount)
+	lines := strings.Split(s, "\n")
+
+	return prefix + strings.Join(lines, "\n"+prefix)
+}
+
+// nindent returns s indented by spaceCount spaces, preceded by a newline.
+func nindent(spaceCount int, s string) string {
+	indented := indent(spaceCount, s)
+	return "\n" + indented
+}
+
+// toBytes converts s to a byte slice.
+func toBytes(s string) []byte {
+	converted := []byte(s)
+	return converted
+}
+
+// toBase64 encodes data using the standard base64 encoding.
+func toBase64(data []byte) string {
+	encoding := base64.StdEncoding
+	return encoding.EncodeToString(data)
+}
+
+func RenderTemplate(tmpl *template.Template, inputs any) ([]byte, error) {
// We always want correctness. (Accidentally missing a key might have side effects.)
tmpl.Option("missingkey=error")
var buf bytes.Buffer
- err := tmpl.Execute(&buf, data)
+ err := tmpl.Execute(&buf, inputs)
if err != nil {
return nil, fmt.Errorf("can't execute template %q: %w", tmpl.Name(), err)
}
diff --git a/pkg/cmd/operator/operator.go b/pkg/cmd/operator/operator.go
index 2b9570a9b22..cd3fd391377 100644
--- a/pkg/cmd/operator/operator.go
+++ b/pkg/cmd/operator/operator.go
@@ -12,7 +12,10 @@ import (
"github.com/scylladb/scylla-operator/pkg/controller/nodeconfigpod"
"github.com/scylladb/scylla-operator/pkg/controller/orphanedpv"
"github.com/scylladb/scylla-operator/pkg/controller/scyllacluster"
+ "github.com/scylladb/scylla-operator/pkg/controller/scylladbmonitoring"
"github.com/scylladb/scylla-operator/pkg/controller/scyllaoperatorconfig"
+ monitoringversionedclient "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned"
+ monitoringinformers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/informers/externalversions"
"github.com/scylladb/scylla-operator/pkg/genericclioptions"
"github.com/scylladb/scylla-operator/pkg/leaderelection"
"github.com/scylladb/scylla-operator/pkg/naming"
@@ -34,8 +37,9 @@ type OperatorOptions struct {
genericclioptions.InClusterReflection
genericclioptions.LeaderElection
- kubeClient kubernetes.Interface
- scyllaClient scyllaversionedclient.Interface
+ kubeClient kubernetes.Interface
+ scyllaClient scyllaversionedclient.Interface
+ monitoringClient monitoringversionedclient.Interface
ConcurrentSyncs int
OperatorImage string
@@ -140,6 +144,11 @@ func (o *OperatorOptions) Complete() error {
return fmt.Errorf("can't build scylla clientset: %w", err)
}
+ o.monitoringClient, err = monitoringversionedclient.NewForConfig(o.RestConfig)
+ if err != nil {
+ return fmt.Errorf("can't build monitoring clientset: %w", err)
+ }
+
return nil
}
@@ -183,6 +192,8 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
},
))
+ monitoringInformers := monitoringinformers.NewSharedInformerFactory(o.monitoringClient, resyncPeriod)
+
scc, err := scyllacluster.NewController(
o.kubeClient,
o.scyllaClient.ScyllaV1(),
@@ -200,7 +211,7 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
o.CQLSIngressPort,
)
if err != nil {
- return err
+ return fmt.Errorf("can't create scyllacluster controller: %w", err)
}
opc, err := orphanedpv.NewController(
@@ -211,7 +222,7 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
scyllaInformers.Scylla().V1().ScyllaClusters(),
)
if err != nil {
- return err
+ return fmt.Errorf("can't create orphanpv controller: %w", err)
}
ncc, err := nodeconfig.NewController(
@@ -227,6 +238,9 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
kubeInformers.Core().V1().ServiceAccounts(),
o.OperatorImage,
)
+ if err != nil {
+ return fmt.Errorf("can't create nodeconfig controller: %w", err)
+ }
ncpc, err := nodeconfigpod.NewController(
o.kubeClient,
@@ -236,12 +250,39 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
kubeInformers.Core().V1().Nodes(),
scyllaInformers.Scylla().V1alpha1().NodeConfigs(),
)
+ if err != nil {
+ return fmt.Errorf("can't create nodeconfigpod controller: %w", err)
+ }
socc, err := scyllaoperatorconfig.NewController(
o.kubeClient,
o.scyllaClient.ScyllaV1alpha1(),
scyllaOperatorConfigInformers.Scylla().V1alpha1().ScyllaOperatorConfigs(),
)
+ if err != nil {
+ return fmt.Errorf("can't create scyllaoperatorconfig controller: %w", err)
+ }
+
+ mc, err := scylladbmonitoring.NewController(
+ o.kubeClient,
+ o.scyllaClient.ScyllaV1alpha1(),
+ o.monitoringClient.MonitoringV1(),
+ kubeInformers.Core().V1().ConfigMaps(),
+ kubeInformers.Core().V1().Secrets(),
+ kubeInformers.Core().V1().Services(),
+ kubeInformers.Core().V1().ServiceAccounts(),
+ kubeInformers.Rbac().V1().RoleBindings(),
+ kubeInformers.Policy().V1().PodDisruptionBudgets(),
+ kubeInformers.Apps().V1().Deployments(),
+ kubeInformers.Networking().V1().Ingresses(),
+ scyllaInformers.Scylla().V1alpha1().ScyllaDBMonitorings(),
+ monitoringInformers.Monitoring().V1().Prometheuses(),
+ monitoringInformers.Monitoring().V1().PrometheusRules(),
+ monitoringInformers.Monitoring().V1().ServiceMonitors(),
+ )
+ if err != nil {
+ return fmt.Errorf("can't create scylladbmonitoring controller: %w", err)
+ }
var wg sync.WaitGroup
defer wg.Wait()
@@ -264,6 +305,12 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
scyllaOperatorConfigInformers.Start(ctx.Done())
}()
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ monitoringInformers.Start(ctx.Done())
+ }()
+
wg.Add(1)
go func() {
defer wg.Done()
@@ -294,6 +341,12 @@ func (o *OperatorOptions) run(ctx context.Context, streams genericclioptions.IOS
socc.Run(ctx, o.ConcurrentSyncs)
}()
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ mc.Run(ctx, o.ConcurrentSyncs)
+ }()
+
<-ctx.Done()
return nil
diff --git a/pkg/controller/scylladbmonitoring/conditions.go b/pkg/controller/scylladbmonitoring/conditions.go
new file mode 100644
index 00000000000..c7e6ffb189c
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/conditions.go
@@ -0,0 +1,8 @@
+package scylladbmonitoring
+
+// Condition type names for the Prometheus and Grafana parts of this controller.
+// NOTE(review): presumably set on the ScyllaDBMonitoring status by the sync code,
+// which is not part of this file - confirm.
+const (
+ prometheusControllerProgressingCondition = "PrometheusControllerProgressing"
+ prometheusControllerDegradedCondition = "PrometheusControllerDegraded"
+ grafanaControllerProgressingCondition = "GrafanaControllerProgressing"
+ grafanaControllerDegradedCondition = "GrafanaControllerDegraded"
+)
diff --git a/pkg/controller/scylladbmonitoring/controller.go b/pkg/controller/scylladbmonitoring/controller.go
new file mode 100644
index 00000000000..762061949ee
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/controller.go
@@ -0,0 +1,560 @@
+package scylladbmonitoring
+
+import (
+ "context"
+ "fmt"
+ "sync"
+ "time"
+
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ scyllav1alpha1client "github.com/scylladb/scylla-operator/pkg/client/scylla/clientset/versioned/typed/scylla/v1alpha1"
+ scyllav1alpha1informers "github.com/scylladb/scylla-operator/pkg/client/scylla/informers/externalversions/scylla/v1alpha1"
+ scyllav1alpha1listers "github.com/scylladb/scylla-operator/pkg/client/scylla/listers/scylla/v1alpha1"
+ "github.com/scylladb/scylla-operator/pkg/controllerhelpers"
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ monitoringv1client "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned/typed/monitoring/v1"
+ monitoringv1informers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/informers/externalversions/monitoring/v1"
+ monitoringv1listers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/listers/monitoring/v1"
+ "github.com/scylladb/scylla-operator/pkg/kubeinterfaces"
+ "github.com/scylladb/scylla-operator/pkg/scheme"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ policyv1 "k8s.io/api/policy/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ "k8s.io/apimachinery/pkg/labels"
+ utilerrors "k8s.io/apimachinery/pkg/util/errors"
+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+ "k8s.io/apimachinery/pkg/util/wait"
+ appsv1informers "k8s.io/client-go/informers/apps/v1"
+ corev1informers "k8s.io/client-go/informers/core/v1"
+ networkingv1informers "k8s.io/client-go/informers/networking/v1"
+ policyv1informers "k8s.io/client-go/informers/policy/v1"
+ rbacv1informers "k8s.io/client-go/informers/rbac/v1"
+ "k8s.io/client-go/kubernetes"
+ corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
+ appsv1listers "k8s.io/client-go/listers/apps/v1"
+ corev1listers "k8s.io/client-go/listers/core/v1"
+ networkingv1listers "k8s.io/client-go/listers/networking/v1"
+ policyv1listers "k8s.io/client-go/listers/policy/v1"
+ rbacv1listers "k8s.io/client-go/listers/rbac/v1"
+ "k8s.io/client-go/tools/cache"
+ "k8s.io/client-go/tools/record"
+ "k8s.io/client-go/util/workqueue"
+ "k8s.io/component-base/metrics/prometheus/ratelimiter"
+ "k8s.io/klog/v2"
+)
+
+const (
+ // ControllerName is the name passed to cache.WaitForNamedCacheSync in Run.
+ ControllerName = "ScyllaDBMonitoringController"
+)
+
+var (
+ // keyFunc is the key function handed to controllerhelpers.NewHandlers.
+ keyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
+ // scylladbMonitoringControllerGVK is the ScyllaDBMonitoring group/version/kind handed to controllerhelpers.NewHandlers.
+ scylladbMonitoringControllerGVK = scyllav1alpha1.GroupVersion.WithKind("ScyllaDBMonitoring")
+)
+
+// Controller holds the clients, listers, work queue and event handlers used to
+// process ScyllaDBMonitoring objects.
+type Controller struct {
+ // API clients.
+ kubeClient kubernetes.Interface
+ scyllaV1alpha1Client scyllav1alpha1client.ScyllaV1alpha1Interface
+ monitoringClient monitoringv1client.MonitoringV1Interface
+
+ // Listers for core Kubernetes resources.
+ configMapLister corev1listers.ConfigMapLister
+ secretLister corev1listers.SecretLister
+ serviceLister corev1listers.ServiceLister
+ serviceAccountLister corev1listers.ServiceAccountLister
+ roleBindingLister rbacv1listers.RoleBindingLister
+ pdbLister policyv1listers.PodDisruptionBudgetLister
+ deploymentLister appsv1listers.DeploymentLister
+ ingressLister networkingv1listers.IngressLister
+
+ // Lister for the ScyllaDBMonitoring resource itself.
+ scylladbMonitoringLister scyllav1alpha1listers.ScyllaDBMonitoringLister
+
+ // Listers for the monitoring.coreos.com resources.
+ prometheusLister monitoringv1listers.PrometheusLister
+ prometheusRuleLister monitoringv1listers.PrometheusRuleLister
+ serviceMonitorLister monitoringv1listers.ServiceMonitorLister
+
+ // cachesToSync lists the informer sync checks Run waits for before starting workers.
+ cachesToSync []cache.InformerSynced
+
+ eventRecorder record.EventRecorder
+
+ // queue holds the keys processed by the workers.
+ queue workqueue.RateLimitingInterface
+ // handlers provides the shared add/update/delete handling used by the informer callbacks.
+ handlers *controllerhelpers.Handlers[*scyllav1alpha1.ScyllaDBMonitoring]
+}
+
+// NewController builds the ScyllaDBMonitoring controller.
+// It sets up event recording, registers rate limiter metrics when the core client
+// exposes a rate limiter, wires listers and cache sync checks from the informers,
+// creates the shared handlers and registers event handlers on the informers.
+// It returns an error when the metrics registration or the handlers creation fails.
+func NewController(
+ kubeClient kubernetes.Interface,
+ scyllaV1alpha1Client scyllav1alpha1client.ScyllaV1alpha1Interface,
+ monitoringClient monitoringv1client.MonitoringV1Interface,
+ configMapInformer corev1informers.ConfigMapInformer,
+ secretInformer corev1informers.SecretInformer,
+ serviceInformer corev1informers.ServiceInformer,
+ serviceAccountInformer corev1informers.ServiceAccountInformer,
+ roleBindingInformer rbacv1informers.RoleBindingInformer,
+ pdbInformer policyv1informers.PodDisruptionBudgetInformer,
+ deploymentInformer appsv1informers.DeploymentInformer,
+ ingressInformer networkingv1informers.IngressInformer,
+ scyllaDBMonitoringInformer scyllav1alpha1informers.ScyllaDBMonitoringInformer,
+ prometheusInformer monitoringv1informers.PrometheusInformer,
+ prometheusRuleInformer monitoringv1informers.PrometheusRuleInformer,
+ serviceMonitorInformer monitoringv1informers.ServiceMonitorInformer,
+) (*Controller, error) {
+ // Events are both logged and recorded through the core client.
+ eventBroadcaster := record.NewBroadcaster()
+ eventBroadcaster.StartStructuredLogging(0)
+ eventBroadcaster.StartRecordingToSink(&corev1client.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
+
+ if kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
+ err := ratelimiter.RegisterMetricAndTrackRateLimiterUsage(
+ "scylladbmonitoring_controller",
+ kubeClient.CoreV1().RESTClient().GetRateLimiter(),
+ )
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ smc := &Controller{
+ kubeClient: kubeClient,
+ scyllaV1alpha1Client: scyllaV1alpha1Client,
+ monitoringClient: monitoringClient,
+
+ secretLister: secretInformer.Lister(),
+ configMapLister: configMapInformer.Lister(),
+ serviceLister: serviceInformer.Lister(),
+ serviceAccountLister: serviceAccountInformer.Lister(),
+ roleBindingLister: roleBindingInformer.Lister(),
+ pdbLister: pdbInformer.Lister(),
+ deploymentLister: deploymentInformer.Lister(),
+ ingressLister: ingressInformer.Lister(),
+
+ scylladbMonitoringLister: scyllaDBMonitoringInformer.Lister(),
+
+ prometheusLister: prometheusInformer.Lister(),
+ prometheusRuleLister: prometheusRuleInformer.Lister(),
+ serviceMonitorLister: serviceMonitorInformer.Lister(),
+
+ // Every informer with a lister above also has its sync check listed here.
+ cachesToSync: []cache.InformerSynced{
+ secretInformer.Informer().HasSynced,
+ configMapInformer.Informer().HasSynced,
+ serviceInformer.Informer().HasSynced,
+ serviceAccountInformer.Informer().HasSynced,
+ roleBindingInformer.Informer().HasSynced,
+ pdbInformer.Informer().HasSynced,
+ deploymentInformer.Informer().HasSynced,
+ ingressInformer.Informer().HasSynced,
+
+ scyllaDBMonitoringInformer.Informer().HasSynced,
+
+ prometheusInformer.Informer().HasSynced,
+ prometheusRuleInformer.Informer().HasSynced,
+ serviceMonitorInformer.Informer().HasSynced,
+ },
+
+ eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "scylladbmonitoring-controller"}),
+
+ queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "scylladbmonitoring"),
+ }
+
+ // The handlers are created after smc because their Get/List callbacks read from smc's lister.
+ var err error
+ smc.handlers, err = controllerhelpers.NewHandlers[*scyllav1alpha1.ScyllaDBMonitoring](
+ smc.queue,
+ keyFunc,
+ scheme.Scheme,
+ scylladbMonitoringControllerGVK,
+ kubeinterfaces.NamespacedGetList[*scyllav1alpha1.ScyllaDBMonitoring]{
+ GetFunc: func(namespace, name string) (*scyllav1alpha1.ScyllaDBMonitoring, error) {
+ return smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).Get(name)
+ },
+ ListFunc: func(namespace string, selector labels.Selector) (ret []*scyllav1alpha1.ScyllaDBMonitoring, err error) {
+ return smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).List(selector)
+ },
+ },
+ )
+ if err != nil {
+ return nil, fmt.Errorf("can't create handlers: %w", err)
+ }
+
+ scyllaDBMonitoringInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addScyllaDBMonitoring,
+ UpdateFunc: smc.updateScyllaDBMonitoring,
+ DeleteFunc: smc.deleteScyllaDBMonitoring,
+ })
+
+ configMapInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addConfigMap,
+ UpdateFunc: smc.updateConfigMap,
+ DeleteFunc: smc.deleteConfigMap,
+ })
+
+ secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addSecret,
+ UpdateFunc: smc.updateSecret,
+ DeleteFunc: smc.deleteSecret,
+ })
+
+ serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addService,
+ UpdateFunc: smc.updateService,
+ DeleteFunc: smc.deleteService,
+ })
+
+ serviceAccountInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addServiceAccount,
+ UpdateFunc: smc.updateServiceAccount,
+ DeleteFunc: smc.deleteServiceAccount,
+ })
+
+ // NOTE(review): roleBindingInformer has a lister and a sync check above, but no
+ // event handler is registered for it, unlike every other informer passed in.
+ // Confirm whether RoleBinding changes are meant to trigger a sync.
+
+ pdbInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addPodDisruptionBudget,
+ UpdateFunc: smc.updatePodDisruptionBudget,
+ DeleteFunc: smc.deletePodDisruptionBudget,
+ })
+
+ deploymentInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addDeployment,
+ UpdateFunc: smc.updateDeployment,
+ DeleteFunc: smc.deleteDeployment,
+ })
+
+ ingressInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addIngress,
+ UpdateFunc: smc.updateIngress,
+ DeleteFunc: smc.deleteIngress,
+ })
+
+ prometheusInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addPrometheus,
+ UpdateFunc: smc.updatePrometheus,
+ DeleteFunc: smc.deletePrometheus,
+ })
+
+ prometheusRuleInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addPrometheusRule,
+ UpdateFunc: smc.updatePrometheusRule,
+ DeleteFunc: smc.deletePrometheusRule,
+ })
+
+ serviceMonitorInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: smc.addServiceMonitor,
+ UpdateFunc: smc.updateServiceMonitor,
+ DeleteFunc: smc.deleteServiceMonitor,
+ })
+
+ return smc, nil
+}
+
+// addScyllaDBMonitoring is the informer AddFunc for ScyllaDBMonitorings.
+// It forwards the object to the shared handlers with Enqueue.
+func (smc *Controller) addScyllaDBMonitoring(obj interface{}) {
+	monitoring := obj.(*scyllav1alpha1.ScyllaDBMonitoring)
+	smc.handlers.HandleAdd(monitoring, smc.handlers.Enqueue)
+}
+
+// updateScyllaDBMonitoring is the informer UpdateFunc for ScyllaDBMonitorings.
+// It forwards both versions to the shared handlers with Enqueue and the delete handler.
+func (smc *Controller) updateScyllaDBMonitoring(old, cur interface{}) {
+	oldMonitoring := old.(*scyllav1alpha1.ScyllaDBMonitoring)
+	curMonitoring := cur.(*scyllav1alpha1.ScyllaDBMonitoring)
+	smc.handlers.HandleUpdate(oldMonitoring, curMonitoring, smc.handlers.Enqueue, smc.deleteScyllaDBMonitoring)
+}
+
+// deleteScyllaDBMonitoring is the informer DeleteFunc for ScyllaDBMonitorings.
+// It forwards the untyped object to the shared handlers with Enqueue.
+func (smc *Controller) deleteScyllaDBMonitoring(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.Enqueue)
+}
+
+// addConfigMap is the informer AddFunc for ConfigMaps.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addConfigMap(obj interface{}) {
+	configMap := obj.(*corev1.ConfigMap)
+	smc.handlers.HandleAdd(configMap, smc.handlers.EnqueueOwner)
+}
+
+// updateConfigMap is the informer UpdateFunc for ConfigMaps.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateConfigMap(old, cur interface{}) {
+	oldConfigMap := old.(*corev1.ConfigMap)
+	curConfigMap := cur.(*corev1.ConfigMap)
+	smc.handlers.HandleUpdate(oldConfigMap, curConfigMap, smc.handlers.EnqueueOwner, smc.deleteConfigMap)
+}
+
+// deleteConfigMap is the informer DeleteFunc for ConfigMaps.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteConfigMap(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addSecret is the informer AddFunc for Secrets.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addSecret(obj interface{}) {
+	secret := obj.(*corev1.Secret)
+	smc.handlers.HandleAdd(secret, smc.handlers.EnqueueOwner)
+}
+
+// updateSecret is the informer UpdateFunc for Secrets.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateSecret(old, cur interface{}) {
+	oldSecret := old.(*corev1.Secret)
+	curSecret := cur.(*corev1.Secret)
+	smc.handlers.HandleUpdate(oldSecret, curSecret, smc.handlers.EnqueueOwner, smc.deleteSecret)
+}
+
+// deleteSecret is the informer DeleteFunc for Secrets.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteSecret(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addService is the informer AddFunc for Services.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addService(obj interface{}) {
+	service := obj.(*corev1.Service)
+	smc.handlers.HandleAdd(service, smc.handlers.EnqueueOwner)
+}
+
+// updateService is the informer UpdateFunc for Services.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateService(old, cur interface{}) {
+	oldService := old.(*corev1.Service)
+	curService := cur.(*corev1.Service)
+	smc.handlers.HandleUpdate(oldService, curService, smc.handlers.EnqueueOwner, smc.deleteService)
+}
+
+// deleteService is the informer DeleteFunc for Services.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteService(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addServiceAccount is the informer AddFunc for ServiceAccounts.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addServiceAccount(obj interface{}) {
+	serviceAccount := obj.(*corev1.ServiceAccount)
+	smc.handlers.HandleAdd(serviceAccount, smc.handlers.EnqueueOwner)
+}
+
+// updateServiceAccount is the informer UpdateFunc for ServiceAccounts.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateServiceAccount(old, cur interface{}) {
+	oldServiceAccount := old.(*corev1.ServiceAccount)
+	curServiceAccount := cur.(*corev1.ServiceAccount)
+	smc.handlers.HandleUpdate(oldServiceAccount, curServiceAccount, smc.handlers.EnqueueOwner, smc.deleteServiceAccount)
+}
+
+// deleteServiceAccount is the informer DeleteFunc for ServiceAccounts.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteServiceAccount(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addPodDisruptionBudget is the informer AddFunc for PodDisruptionBudgets.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addPodDisruptionBudget(obj interface{}) {
+	pdb := obj.(*policyv1.PodDisruptionBudget)
+	smc.handlers.HandleAdd(pdb, smc.handlers.EnqueueOwner)
+}
+
+// updatePodDisruptionBudget is the informer UpdateFunc for PodDisruptionBudgets.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updatePodDisruptionBudget(old, cur interface{}) {
+	oldPDB := old.(*policyv1.PodDisruptionBudget)
+	curPDB := cur.(*policyv1.PodDisruptionBudget)
+	smc.handlers.HandleUpdate(oldPDB, curPDB, smc.handlers.EnqueueOwner, smc.deletePodDisruptionBudget)
+}
+
+// deletePodDisruptionBudget is the informer DeleteFunc for PodDisruptionBudgets.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deletePodDisruptionBudget(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addDeployment is the informer AddFunc for Deployments.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addDeployment(obj interface{}) {
+	deployment := obj.(*appsv1.Deployment)
+	smc.handlers.HandleAdd(deployment, smc.handlers.EnqueueOwner)
+}
+
+// updateDeployment is the informer UpdateFunc for Deployments.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateDeployment(old, cur interface{}) {
+	oldDeployment := old.(*appsv1.Deployment)
+	curDeployment := cur.(*appsv1.Deployment)
+	smc.handlers.HandleUpdate(oldDeployment, curDeployment, smc.handlers.EnqueueOwner, smc.deleteDeployment)
+}
+
+// deleteDeployment is the informer DeleteFunc for Deployments.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteDeployment(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addIngress is the informer AddFunc for Ingresses.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addIngress(obj interface{}) {
+	ingress := obj.(*networkingv1.Ingress)
+	smc.handlers.HandleAdd(ingress, smc.handlers.EnqueueOwner)
+}
+
+// updateIngress is the informer UpdateFunc for Ingresses.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateIngress(old, cur interface{}) {
+	oldIngress := old.(*networkingv1.Ingress)
+	curIngress := cur.(*networkingv1.Ingress)
+	smc.handlers.HandleUpdate(oldIngress, curIngress, smc.handlers.EnqueueOwner, smc.deleteIngress)
+}
+
+// deleteIngress is the informer DeleteFunc for Ingresses.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteIngress(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addPrometheus is the informer AddFunc for Prometheuses.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addPrometheus(obj interface{}) {
+	prometheus := obj.(*monitoringv1.Prometheus)
+	smc.handlers.HandleAdd(prometheus, smc.handlers.EnqueueOwner)
+}
+
+// updatePrometheus is the informer UpdateFunc for Prometheuses.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updatePrometheus(old, cur interface{}) {
+	oldPrometheus := old.(*monitoringv1.Prometheus)
+	curPrometheus := cur.(*monitoringv1.Prometheus)
+	smc.handlers.HandleUpdate(oldPrometheus, curPrometheus, smc.handlers.EnqueueOwner, smc.deletePrometheus)
+}
+
+// deletePrometheus is the informer DeleteFunc for Prometheuses.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deletePrometheus(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addPrometheusRule is the informer AddFunc for PrometheusRules.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addPrometheusRule(obj interface{}) {
+	rule := obj.(*monitoringv1.PrometheusRule)
+	smc.handlers.HandleAdd(rule, smc.handlers.EnqueueOwner)
+}
+
+// updatePrometheusRule is the informer UpdateFunc for PrometheusRules.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updatePrometheusRule(old, cur interface{}) {
+	oldRule := old.(*monitoringv1.PrometheusRule)
+	curRule := cur.(*monitoringv1.PrometheusRule)
+	smc.handlers.HandleUpdate(oldRule, curRule, smc.handlers.EnqueueOwner, smc.deletePrometheusRule)
+}
+
+// deletePrometheusRule is the informer DeleteFunc for PrometheusRules.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deletePrometheusRule(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// addServiceMonitor is the informer AddFunc for ServiceMonitors.
+// It forwards the object to the shared handlers with EnqueueOwner.
+func (smc *Controller) addServiceMonitor(obj interface{}) {
+	serviceMonitor := obj.(*monitoringv1.ServiceMonitor)
+	smc.handlers.HandleAdd(serviceMonitor, smc.handlers.EnqueueOwner)
+}
+
+// updateServiceMonitor is the informer UpdateFunc for ServiceMonitors.
+// It forwards both versions to the shared handlers with EnqueueOwner and the delete handler.
+func (smc *Controller) updateServiceMonitor(old, cur interface{}) {
+	oldServiceMonitor := old.(*monitoringv1.ServiceMonitor)
+	curServiceMonitor := cur.(*monitoringv1.ServiceMonitor)
+	smc.handlers.HandleUpdate(oldServiceMonitor, curServiceMonitor, smc.handlers.EnqueueOwner, smc.deleteServiceMonitor)
+}
+
+// deleteServiceMonitor is the informer DeleteFunc for ServiceMonitors.
+// It forwards the untyped object to the shared handlers with EnqueueOwner.
+func (smc *Controller) deleteServiceMonitor(obj interface{}) {
+	smc.handlers.HandleDelete(obj, smc.handlers.EnqueueOwner)
+}
+
+// processNextItem takes one key from the queue and syncs it.
+// It returns false only when the queue reports shutdown. A successful sync forgets
+// the key; any failure is logged or reported and the key is re-added rate limited.
+func (smc *Controller) processNextItem(ctx context.Context) bool {
+	item, shutdown := smc.queue.Get()
+	if shutdown {
+		return false
+	}
+	defer smc.queue.Done(item)
+
+	// TODO: Do smarter filtering than just Reduce to handle cases like 2 conflict errors.
+	syncErr := utilerrors.Reduce(smc.sync(ctx, item.(string)))
+	if syncErr == nil {
+		smc.queue.Forget(item)
+		return true
+	}
+
+	// Conflicts and "already exists" are only logged; everything else is reported as an error.
+	if apierrors.IsConflict(syncErr) {
+		klog.V(2).InfoS("Hit conflict, will retry in a bit", "Key", item, "Error", syncErr)
+	} else if apierrors.IsAlreadyExists(syncErr) {
+		klog.V(2).InfoS("Hit already exists, will retry in a bit", "Key", item, "Error", syncErr)
+	} else {
+		utilruntime.HandleError(fmt.Errorf("syncing key '%v' failed: %v", item, syncErr))
+	}
+
+	smc.queue.AddRateLimited(item)
+
+	return true
+}
+
+// runWorker processes queue items one after another until processNextItem returns false.
+func (smc *Controller) runWorker(ctx context.Context) {
+	for {
+		if !smc.processNextItem(ctx) {
+			return
+		}
+	}
+}
+
+// Run waits for the informer caches to sync, starts the requested number of workers
+// and blocks until ctx is done. On return it shuts the queue down and waits for the
+// workers to finish. It returns early, without starting workers, when the caches
+// don't sync.
+func (smc *Controller) Run(ctx context.Context, workers int) {
+	defer utilruntime.HandleCrash()
+
+	klog.InfoS("Starting controller", "controller", "ScyllaDBMonitoring")
+
+	var workersWG sync.WaitGroup
+	shutdown := func() {
+		klog.InfoS("Shutting down controller", "controller", "ScyllaDBMonitoring")
+		smc.queue.ShutDown()
+		workersWG.Wait()
+		klog.InfoS("Shut down controller", "controller", "ScyllaDBMonitoring")
+	}
+	defer shutdown()
+
+	synced := cache.WaitForNamedCacheSync(ControllerName, ctx.Done(), smc.cachesToSync...)
+	if !synced {
+		return
+	}
+
+	worker := func() {
+		defer workersWG.Done()
+		wait.UntilWithContext(ctx, smc.runWorker, time.Second)
+	}
+	for started := 0; started < workers; started++ {
+		workersWG.Add(1)
+		go worker()
+	}
+
+	<-ctx.Done()
+}
diff --git a/pkg/controller/scylladbmonitoring/status.go b/pkg/controller/scylladbmonitoring/status.go
new file mode 100644
index 00000000000..565799c2152
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/status.go
@@ -0,0 +1,41 @@
+package scylladbmonitoring
+
+import (
+ "context"
+
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ apiequality "k8s.io/apimachinery/pkg/api/equality"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/klog/v2"
+ "k8s.io/utils/pointer"
+)
+
+// calculateStatus derives the status to be written for the given ScyllaDBMonitoring.
+// It starts from a deep copy of the current status and records the object's
+// generation as the observed generation.
+// This function should always succeed. Do not return an error.
+// If a particular object can be missing, it should be reflected in the value itself, like "Unknown" or "".
+func (smc *Controller) calculateStatus(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.ScyllaDBMonitoringStatus {
+	newStatus := sm.Status.DeepCopy()
+	newStatus.ObservedGeneration = pointer.Int64(sm.Generation)
+	return newStatus
+}
+
+func (smc *Controller) updateStatus(ctx context.Context, currentSM *scyllav1alpha1.ScyllaDBMonitoring, status *scyllav1alpha1.ScyllaDBMonitoringStatus) error {
+ if apiequality.Semantic.DeepEqual(¤tSM.Status, status) {
+ return nil
+ }
+
+ sm := currentSM.DeepCopy()
+ sm.Status = *status
+
+ klog.V(2).InfoS("Updating status", "ScyllaDBMonitoring", klog.KObj(sm))
+
+ _, err := smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).UpdateStatus(ctx, sm, metav1.UpdateOptions{})
+ if err != nil {
+ return err
+ }
+
+ klog.V(2).InfoS("Status updated", "ScyllaDBMonitoring", klog.KObj(sm))
+
+ return nil
+}
diff --git a/pkg/controller/scylladbmonitoring/sync.go b/pkg/controller/scylladbmonitoring/sync.go
new file mode 100644
index 00000000000..03b5c1ca9bc
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync.go
@@ -0,0 +1,283 @@
+package scylladbmonitoring
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ "github.com/scylladb/scylla-operator/pkg/controllerhelpers"
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ "github.com/scylladb/scylla-operator/pkg/naming"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ rbacv1 "k8s.io/api/rbac/v1"
+ "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/labels"
+ utilerrors "k8s.io/apimachinery/pkg/util/errors"
+ "k8s.io/client-go/tools/cache"
+ "k8s.io/klog/v2"
+)
+
+// getLabels returns the label set shared by objects belonging to the given
+// ScyllaDBMonitoring: a single label carrying the ScyllaDBMonitoring name.
+func getLabels(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Set {
+	smLabels := labels.Set{}
+	smLabels[naming.ScyllaDBMonitoringNameLabel] = sm.Name
+	return smLabels
+}
+
+// getSelector returns a selector that matches the labels produced by getLabels.
+func getSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector {
+	return getLabels(sm).AsSelector()
+}
+
+// sync reconciles the ScyllaDBMonitoring identified by the "namespace/name" cache key.
+//
+// Steps, in order:
+//  1. Split the key and read the object from the lister; a missing object is
+//     treated as deleted and is not an error.
+//  2. Collect the objects of every managed kind through controllerhelpers.GetObjects.
+//     All collection errors are gathered and returned together before anything is synced.
+//  3. Calculate the new status. If the object has a deletion timestamp, only the
+//     status is updated and the function returns.
+//  4. Run the Prometheus and Grafana syncs through controllerhelpers.RunSync,
+//     each receiving only the objects matching its own selector.
+//  5. Aggregate the workload conditions and, if that succeeds, update the status.
+//
+// The returned error is the aggregate of everything that failed in steps 4 and 5.
+func (smc *Controller) sync(ctx context.Context, key string) error {
+	namespace, name, err := cache.SplitMetaNamespaceKey(key)
+	if err != nil {
+		return fmt.Errorf("can't split meta namespace cache key %q: %w", key, err)
+	}
+
+	startTime := time.Now()
+	klog.V(4).InfoS("Started syncing ScyllaDBMonitoring", "ScyllaDBMonitoring", klog.KRef(namespace, name), "startTime", startTime)
+	defer func() {
+		klog.V(4).InfoS("Finished syncing ScyllaDBMonitoring", "ScyllaDBMonitoring", klog.KRef(namespace, name), "duration", time.Since(startTime))
+	}()
+
+	sm, err := smc.scylladbMonitoringLister.ScyllaDBMonitorings(namespace).Get(name)
+	if errors.IsNotFound(err) {
+		// The lookup failed, so sm can't be used to identify the object here;
+		// build the reference from the key parts instead.
+		klog.V(2).InfoS("ScyllaDBMonitoring has been deleted", "ScyllaDBMonitoring", klog.KRef(namespace, name))
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("can't get object %q from cache: %w", naming.ManualRef(namespace, name), err)
+	}
+
+	smSelector := getSelector(sm)
+
+	type CT = *scyllav1alpha1.ScyllaDBMonitoring
+	// Collection errors are accumulated so that a single pass reports all of them.
+	var objectErrs []error
+
+	configMaps, err := controllerhelpers.GetObjects[CT, *corev1.ConfigMap](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.ConfigMap]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.configMapLister.ConfigMaps(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get config maps: %w", err))
+	}
+
+	secrets, err := controllerhelpers.GetObjects[CT, *corev1.Secret](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.Secret]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.secretLister.Secrets(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().Secrets(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get secrets: %w", err))
+	}
+
+	services, err := controllerhelpers.GetObjects[CT, *corev1.Service](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.Service]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceLister.Services(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().Services(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get services: %w", err))
+	}
+
+	serviceAccounts, err := controllerhelpers.GetObjects[CT, *corev1.ServiceAccount](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *corev1.ServiceAccount]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceAccountLister.ServiceAccounts(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get service accounts: %w", err))
+	}
+
+	roleBindings, err := controllerhelpers.GetObjects[CT, *rbacv1.RoleBinding](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *rbacv1.RoleBinding]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.roleBindingLister.RoleBindings(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get role bindings: %w", err))
+	}
+
+	deployments, err := controllerhelpers.GetObjects[CT, *appsv1.Deployment](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *appsv1.Deployment]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.deploymentLister.Deployments(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.AppsV1().Deployments(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get deployments: %w", err))
+	}
+
+	ingresses, err := controllerhelpers.GetObjects[CT, *networkingv1.Ingress](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *networkingv1.Ingress]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.ingressLister.Ingresses(sm.Namespace).List,
+			PatchObjectFunc:           smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get ingresses: %w", err))
+	}
+
+	prometheuses, err := controllerhelpers.GetObjects[CT, *monitoringv1.Prometheus](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.Prometheus]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.prometheusLister.Prometheuses(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.Prometheuses(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get prometheuses: %w", err))
+	}
+
+	prometheusRules, err := controllerhelpers.GetObjects[CT, *monitoringv1.PrometheusRule](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.PrometheusRule]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.prometheusRuleLister.PrometheusRules(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.PrometheusRules(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get prometheus rules: %w", err))
+	}
+
+	serviceMonitors, err := controllerhelpers.GetObjects[CT, *monitoringv1.ServiceMonitor](
+		ctx,
+		sm,
+		scylladbMonitoringControllerGVK,
+		smSelector,
+		controllerhelpers.ControlleeManagerGetObjectsFuncs[CT, *monitoringv1.ServiceMonitor]{
+			GetControllerUncachedFunc: smc.scyllaV1alpha1Client.ScyllaDBMonitorings(sm.Namespace).Get,
+			ListObjectsFunc:           smc.serviceMonitorLister.ServiceMonitors(sm.Namespace).List,
+			PatchObjectFunc:           smc.monitoringClient.ServiceMonitors(sm.Namespace).Patch,
+		},
+	)
+	if err != nil {
+		objectErrs = append(objectErrs, fmt.Errorf("can't get service monitors: %w", err))
+	}
+
+	// Don't sync anything unless every kind of object could be collected.
+	objectErr := utilerrors.NewAggregate(objectErrs)
+	if objectErr != nil {
+		return objectErr
+	}
+
+	prometheusSelector := getPrometheusSelector(sm)
+	grafanaSelector := getGrafanaSelector(sm)
+
+	status := smc.calculateStatus(sm)
+
+	// An object that is being deleted only gets its status refreshed.
+	if sm.DeletionTimestamp != nil {
+		return smc.updateStatus(ctx, sm, status)
+	}
+
+	var errs []error
+
+	err = controllerhelpers.RunSync(
+		&status.Conditions,
+		prometheusControllerProgressingCondition,
+		prometheusControllerDegradedCondition,
+		sm.Generation,
+		func() ([]metav1.Condition, error) {
+			return smc.syncPrometheus(
+				ctx,
+				sm,
+				controllerhelpers.FilterObjectMapByLabel(configMaps, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(secrets, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(services, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceAccounts, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(roleBindings, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(ingresses, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(prometheuses, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(prometheusRules, prometheusSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceMonitors, prometheusSelector),
+			)
+		},
+	)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't sync prometheus: %w", err))
+	}
+
+	err = controllerhelpers.RunSync(
+		&status.Conditions,
+		grafanaControllerProgressingCondition,
+		grafanaControllerDegradedCondition,
+		sm.Generation,
+		func() ([]metav1.Condition, error) {
+			return smc.syncGrafana(
+				ctx,
+				sm,
+				controllerhelpers.FilterObjectMapByLabel(configMaps, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(secrets, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(services, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(serviceAccounts, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(deployments, grafanaSelector),
+				controllerhelpers.FilterObjectMapByLabel(ingresses, grafanaSelector),
+			)
+		},
+	)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't sync grafana: %w", err))
+	}
+
+	// Aggregate conditions.
+	err = controllerhelpers.SetAggregatedWorkloadConditions(&status.Conditions, sm.Generation)
+	if err != nil {
+		errs = append(errs, fmt.Errorf("can't aggregate workload conditions: %w", err))
+	} else {
+		// The status is only written when the conditions could be aggregated.
+		// A nil error from updateStatus is dropped by NewAggregate below.
+		err = smc.updateStatus(ctx, sm, status)
+		errs = append(errs, err)
+	}
+
+	return utilerrors.NewAggregate(errs)
+}
diff --git a/pkg/controller/scylladbmonitoring/sync_grafana.go b/pkg/controller/scylladbmonitoring/sync_grafana.go
new file mode 100644
index 00000000000..828b577259c
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync_grafana.go
@@ -0,0 +1,509 @@
+package scylladbmonitoring
+
+import (
+ "context"
+ "crypto/x509/pkix"
+ "fmt"
+ "time"
+
+ grafanav1alpha1assets "github.com/scylladb/scylla-operator/assets/monitoring/grafana/v1alpha1"
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ "github.com/scylladb/scylla-operator/pkg/controllerhelpers"
+ ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
+ "github.com/scylladb/scylla-operator/pkg/helpers"
+ okubecrypto "github.com/scylladb/scylla-operator/pkg/kubecrypto"
+ "github.com/scylladb/scylla-operator/pkg/naming"
+ "github.com/scylladb/scylla-operator/pkg/resource"
+ "github.com/scylladb/scylla-operator/pkg/resourceapply"
+ "github.com/scylladb/scylla-operator/pkg/resourcemerge"
+ "github.com/scylladb/scylla-operator/pkg/util/hash"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/labels"
+ kutilerrors "k8s.io/apimachinery/pkg/util/errors"
+ "k8s.io/apimachinery/pkg/util/rand"
+ "k8s.io/utils/pointer"
+)
+
+const (
+ // grafanaPasswordLength is the length passed to rand.String when a new
+ // Grafana admin password has to be generated.
+ grafanaPasswordLength = 20
+)
+
+// getGrafanaLabels returns the common ScyllaDBMonitoring labels merged with the
+// controller name label set to "grafana".
+func getGrafanaLabels(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Set {
+	controllerLabels := labels.Set{
+		naming.ControllerNameLabel: "grafana",
+	}
+
+	return helpers.MergeMaps(getLabels(sm), controllerLabels)
+}
+
+// getGrafanaSelector returns a selector that matches the labels produced by getGrafanaLabels.
+func getGrafanaSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector {
+	return getGrafanaLabels(sm).AsSelector()
+}
+
+// getGrafanaSpec returns the Grafana component spec, or nil when the
+// ScyllaDBMonitoring has no components section.
+func getGrafanaSpec(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.GrafanaSpec {
+	components := sm.Spec.Components
+	if components == nil {
+		return nil
+	}
+
+	return components.Grafana
+}
+
+// getGrafanaIngressOptions returns the ingress options of the Grafana web
+// interface, or nil when any level on the path to them is unset.
+func getGrafanaIngressOptions(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.IngressOptions {
+	spec := getGrafanaSpec(sm)
+	if spec == nil || spec.ExposeOptions == nil || spec.ExposeOptions.WebInterface == nil {
+		return nil
+	}
+
+	return spec.ExposeOptions.WebInterface.Ingress
+}
+
+// getGrafanaIngressDomains returns the DNS domains configured for the Grafana
+// ingress, or nil when no ingress options are set.
+func getGrafanaIngressDomains(sm *scyllav1alpha1.ScyllaDBMonitoring) []string {
+	opts := getGrafanaIngressOptions(sm)
+	if opts == nil {
+		return nil
+	}
+
+	return opts.DNSDomains
+}
+
+// makeGrafanaDeployment renders the Grafana Deployment from its template.
+//
+// Affinity, tolerations and resources are taken from the Grafana spec when it
+// is present and are left at their zero values otherwise. The serving cert
+// secret name and the restart trigger hash are passed to the template as given.
+// It returns whatever the template's RenderObject returns.
+func makeGrafanaDeployment(sm *scyllav1alpha1.ScyllaDBMonitoring, grafanaServingCertSecretName string, restartTriggerHash string) (*appsv1.Deployment, string, error) {
+	var (
+		affinity    corev1.Affinity
+		tolerations []corev1.Toleration
+		resources   corev1.ResourceRequirements
+	)
+
+	if spec := getGrafanaSpec(sm); spec != nil {
+		resources = spec.Resources
+
+		if placement := spec.Placement; placement != nil {
+			affinity.NodeAffinity = placement.NodeAffinity
+			affinity.PodAffinity = placement.PodAffinity
+			affinity.PodAntiAffinity = placement.PodAntiAffinity
+
+			tolerations = placement.Tolerations
+		}
+	}
+
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"servingCertSecretName":  grafanaServingCertSecretName,
+		"affinity":               affinity,
+		"tolerations":            tolerations,
+		"resources":              resources,
+		"restartTriggerHash":     restartTriggerHash,
+	}
+
+	return grafanav1alpha1assets.GrafanaDeploymentTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaAdminCredentials renders the Grafana admin credentials Secret.
+//
+// The password stored under the "password" key of an already existing secret
+// with the same name is reused; a new random one is generated only when no
+// such secret exists or its password is empty.
+func makeGrafanaAdminCredentials(sm *scyllav1alpha1.ScyllaDBMonitoring, secrets map[string]*corev1.Secret) (*corev1.Secret, string, error) {
+	secretName := sm.Name + "-grafana-admin-credentials"
+
+	var password []byte
+	if current, ok := secrets[secretName]; ok {
+		password = current.Data["password"]
+	}
+
+	if len(password) == 0 {
+		// NOTE(review): confirm that this rand package is suitable for generating credentials.
+		password = []byte(rand.String(grafanaPasswordLength))
+	}
+
+	templateInputs := map[string]any{
+		"name":     secretName,
+		"password": password,
+	}
+
+	return grafanav1alpha1assets.GrafanaAdminCredentialsSecretTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaSA renders the Grafana ServiceAccount from its template, using
+// the namespace and name of the given ScyllaDBMonitoring.
+func makeGrafanaSA(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ServiceAccount, string, error) {
+	templateInputs := map[string]any{
+		"namespace":              sm.Namespace,
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return grafanav1alpha1assets.GrafanaSATemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaConfigs renders the Grafana configs ConfigMap from its template.
+// Anonymous access is enabled only when a Grafana spec is present and its
+// authentication settings ask for it.
+func makeGrafanaConfigs(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) {
+	spec := getGrafanaSpec(sm)
+	enableAnonymousAccess := spec != nil && spec.Authentication.InsecureEnableAnonymousAccess
+
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"enableAnonymousAccess":  enableAnonymousAccess,
+	}
+
+	return grafanav1alpha1assets.GrafanaConfigsTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaDashboards renders the Grafana dashboards ConfigMap from its
+// template, parameterized by the ScyllaDBMonitoring name.
+func makeGrafanaDashboards(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) {
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return grafanav1alpha1assets.GrafanaDashboardsConfigMapTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaProvisionings renders the Grafana provisioning ConfigMap from its
+// template, parameterized by the ScyllaDBMonitoring name.
+func makeGrafanaProvisionings(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ConfigMap, string, error) {
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return grafanav1alpha1assets.GrafanaProvisioningConfigMapTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaService renders the Grafana Service from its template,
+// parameterized by the ScyllaDBMonitoring name.
+func makeGrafanaService(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.Service, string, error) {
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return grafanav1alpha1assets.GrafanaServiceTemplate.RenderObject(templateInputs)
+}
+
+// makeGrafanaIngress renders the Grafana Ingress from its template.
+//
+// It returns a nil object, an empty string and a nil error when no ingress
+// should exist: the ingress options are unset, the ingress is explicitly
+// disabled, or no DNS domains are configured.
+func makeGrafanaIngress(sm *scyllav1alpha1.ScyllaDBMonitoring) (*networkingv1.Ingress, string, error) {
+	opts := getGrafanaIngressOptions(sm)
+
+	// The nil check comes first so the later operands never dereference a nil pointer.
+	noIngress := opts == nil ||
+		(opts.Disabled != nil && *opts.Disabled) ||
+		len(opts.DNSDomains) == 0
+	if noIngress {
+		return nil, "", nil
+	}
+
+	templateInputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"dnsDomains":             opts.DNSDomains,
+		"ingressAnnotations":     opts.Annotations,
+		"ingressClassName":       opts.IngressClassName,
+	}
+
+	return grafanav1alpha1assets.GrafanaIngressTemplate.RenderObject(templateInputs)
+}
+
+// syncGrafana reconciles the Grafana part of a ScyllaDBMonitoring.
+//
+// The maps passed in hold the existing objects of each kind, keyed by name.
+// The function works in these phases:
+// 1. Build the serving certificate chain config; it is only managed when the
+// spec does not name a serving cert secret of its own.
+// 2. Render all required manifests; any render error aborts the sync.
+// 3. Prune existing objects against the required ones; any prune error aborts the sync.
+// 4. Apply the required objects in a fixed order and manage the certificate chains.
+//
+// It returns the progressing conditions collected for changed objects together
+// with the aggregated error of the phase that failed, if any.
+func (smc *Controller) syncGrafana(
+ ctx context.Context,
+ sm *scyllav1alpha1.ScyllaDBMonitoring,
+ configMaps map[string]*corev1.ConfigMap,
+ secrets map[string]*corev1.Secret,
+ services map[string]*corev1.Service,
+ serviceAccounts map[string]*corev1.ServiceAccount,
+ deployments map[string]*appsv1.Deployment,
+ ingresses map[string]*networkingv1.Ingress,
+) ([]metav1.Condition, error) {
+ var progressingConditions []metav1.Condition
+
+ // Serving certificate chain used when the user does not supply a serving cert secret.
+ // The CA and the CA bundle share the same name.
+ grafanaServingCertChainConfig := &okubecrypto.CertChainConfig{
+ CAConfig: &okubecrypto.CAConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-grafana-serving-ca", sm.Name),
+ Labels: getGrafanaLabels(sm),
+ },
+ Validity: 10 * 365 * 24 * time.Hour,
+ Refresh: 8 * 365 * 24 * time.Hour,
+ },
+ CABundleConfig: &okubecrypto.CABundleConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-grafana-serving-ca", sm.Name),
+ Labels: getGrafanaLabels(sm),
+ },
+ },
+ CertConfigs: []*okubecrypto.CertificateConfig{
+ {
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-grafana-serving-certs", sm.Name),
+ Labels: getGrafanaLabels(sm),
+ },
+ Validity: 30 * 24 * time.Hour,
+ Refresh: 20 * 24 * time.Hour,
+ CertCreator: (&ocrypto.ServingCertCreatorConfig{
+ Subject: pkix.Name{
+ CommonName: "",
+ },
+ IPAddresses: nil,
+ // The certificate covers the "<name>-grafana" name and every configured ingress domain.
+ DNSNames: append(
+ []string{
+ sm.Name + "-grafana",
+ },
+ getGrafanaIngressDomains(sm)...,
+ ),
+ }).ToCreator(),
+ },
+ },
+ }
+
+ // Stays empty when the spec names its own serving cert secret.
+ var certChainConfigs okubecrypto.CertChainConfigs
+
+ spec := getGrafanaSpec(sm)
+
+ var grafanaServingCertSecretName string
+ if spec != nil {
+ grafanaServingCertSecretName = spec.ServingCertSecretName
+ }
+
+ // Fall back to the managed certificate when no secret name was supplied.
+ if len(grafanaServingCertSecretName) == 0 {
+ grafanaServingCertSecretName = grafanaServingCertChainConfig.CertConfigs[0].Name
+ certChainConfigs = append(certChainConfigs, grafanaServingCertChainConfig)
+ }
+
+ // Render manifests.
+ // Errors are appended unconditionally; the aggregate below is checked for nil.
+ var renderErrors []error
+
+ requiredGrafanaSA, _, err := makeGrafanaSA(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredConfigsCM, _, err := makeGrafanaConfigs(sm)
+ renderErrors = append(renderErrors, err)
+
+ // NOTE(review): "Dahsboards" in this local name is a typo for "Dashboards".
+ requiredDahsboardsCM, _, err := makeGrafanaDashboards(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredProvisioningsCM, _, err := makeGrafanaProvisionings(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredAdminCredentialsSecret, _, err := makeGrafanaAdminCredentials(sm, secrets)
+ renderErrors = append(renderErrors, err)
+
+ var requiredDeployment *appsv1.Deployment
+ // Trigger restart for inputs that are not live reloaded.
+ grafanaRestartHash, hashErr := hash.HashObjects(requiredConfigsCM, requiredProvisioningsCM)
+ if hashErr != nil {
+ renderErrors = append(renderErrors, hashErr)
+ } else {
+ requiredDeployment, _, err = makeGrafanaDeployment(sm, grafanaServingCertSecretName, grafanaRestartHash)
+ renderErrors = append(renderErrors, err)
+ }
+
+ requiredService, _, err := makeGrafanaService(sm)
+ renderErrors = append(renderErrors, err)
+
+ // requiredIngress is nil when no ingress should exist.
+ requiredIngress, _, err := makeGrafanaIngress(sm)
+ renderErrors = append(renderErrors, err)
+
+ renderError := kutilerrors.NewAggregate(renderErrors)
+ if renderError != nil {
+ return progressingConditions, renderError
+ }
+
+ // Prune objects.
+ // Each call receives the required objects of one kind and the existing ones;
+ // presumably existing objects that are not required get deleted - confirm in controllerhelpers.Prune.
+ var pruneErrors []error
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredGrafanaSA),
+ serviceAccounts,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ // Config maps belonging to the managed certificate chains count as required too.
+ err = controllerhelpers.Prune(
+ ctx,
+ append(
+ []*corev1.ConfigMap{
+ requiredConfigsCM,
+ requiredDahsboardsCM,
+ requiredProvisioningsCM,
+ },
+ certChainConfigs.GetMetaConfigMaps()...,
+ ),
+ configMaps,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ // Secrets belonging to the managed certificate chains count as required too.
+ err = controllerhelpers.Prune(
+ ctx,
+ append([]*corev1.Secret{requiredAdminCredentialsSecret}, certChainConfigs.GetMetaSecrets()...),
+ secrets,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredService),
+ services,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredDeployment),
+ deployments,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ // The ingress is optional, so a nil entry is filtered out of the required list.
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.FilterOutNil(helpers.ToArray(requiredIngress)),
+ ingresses,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ pruneError := kutilerrors.NewAggregate(pruneErrors)
+ if pruneError != nil {
+ return progressingConditions, pruneError
+ }
+
+ // Apply required objects.
+ // The slice order is the order in which the objects are applied below.
+ var applyErrors []error
+ applyConfigurations := []resourceapply.ApplyConfigUntyped{
+ resourceapply.ApplyConfig[*corev1.ServiceAccount]{
+ Required: requiredGrafanaSA,
+ Control: resourceapply.ApplyControlFuncs[*corev1.ServiceAccount]{
+ GetCachedFunc: smc.serviceAccountLister.ServiceAccounts(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*corev1.ConfigMap]{
+ Required: requiredConfigsCM,
+ Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{
+ GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*corev1.ConfigMap]{
+ Required: requiredDahsboardsCM,
+ Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{
+ GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*corev1.ConfigMap]{
+ Required: requiredProvisioningsCM,
+ Control: resourceapply.ApplyControlFuncs[*corev1.ConfigMap]{
+ GetCachedFunc: smc.configMapLister.ConfigMaps(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*corev1.Secret]{
+ Required: requiredAdminCredentialsSecret,
+ Control: resourceapply.ApplyControlFuncs[*corev1.Secret]{
+ GetCachedFunc: smc.secretLister.Secrets(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*appsv1.Deployment]{
+ Required: requiredDeployment,
+ Control: resourceapply.ApplyControlFuncs[*appsv1.Deployment]{
+ GetCachedFunc: smc.deploymentLister.Deployments(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.AppsV1().Deployments(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*corev1.Service]{
+ Required: requiredService,
+ Control: resourceapply.ApplyControlFuncs[*corev1.Service]{
+ GetCachedFunc: smc.serviceLister.Services(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ }
+
+ // The ingress is only applied when one was rendered.
+ if requiredIngress != nil {
+ applyConfigurations = append(applyConfigurations, resourceapply.ApplyConfig[*networkingv1.Ingress]{
+ Required: requiredIngress,
+ Control: resourceapply.ApplyControlFuncs[*networkingv1.Ingress]{
+ GetCachedFunc: smc.ingressLister.Ingresses(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete,
+ },
+ }.ToUntyped())
+ }
+
+ // A failure to apply one object is recorded and the loop continues with the next one.
+ for _, cfg := range applyConfigurations {
+ // Enforce namespace.
+ cfg.Required.SetNamespace(sm.Namespace)
+
+ // Enforce labels for selection.
+ if cfg.Required.GetLabels() == nil {
+ cfg.Required.SetLabels(getGrafanaLabels(sm))
+ } else {
+ resourcemerge.MergeMapInPlaceWithoutRemovalKeys(cfg.Required.GetLabels(), getGrafanaLabels(sm))
+ }
+
+ // Set ControllerRef.
+ cfg.Required.SetOwnerReferences([]metav1.OwnerReference{
+ {
+ APIVersion: scylladbMonitoringControllerGVK.GroupVersion().String(),
+ Kind: scylladbMonitoringControllerGVK.Kind,
+ Name: sm.Name,
+ UID: sm.UID,
+ Controller: pointer.Bool(true),
+ BlockOwnerDeletion: pointer.Bool(true),
+ },
+ })
+
+ // Apply required object.
+ _, changed, err := resourceapply.ApplyFromConfig(ctx, cfg, smc.eventRecorder)
+ // A change is reported as a progressing condition, checked before the error.
+ if changed {
+ controllerhelpers.AddGenericProgressingStatusCondition(&progressingConditions, grafanaControllerProgressingCondition, cfg.Required, "apply", sm.Generation)
+ }
+ if err != nil {
+ gvk := resource.GetObjectGVKOrUnknown(cfg.Required)
+ applyErrors = append(applyErrors, fmt.Errorf("can't apply %s: %w", gvk, err))
+ }
+ }
+
+ // Certificate chains are managed after the regular objects; the loop body
+ // does not run when the spec names its own serving cert secret.
+ cm := okubecrypto.NewCertificateManager(
+ smc.kubeClient.CoreV1(),
+ smc.secretLister,
+ smc.kubeClient.CoreV1(),
+ smc.configMapLister,
+ smc.eventRecorder,
+ )
+ for _, ccc := range certChainConfigs {
+ err := cm.ManageCertificateChain(
+ ctx,
+ time.Now,
+ &sm.ObjectMeta,
+ scylladbMonitoringControllerGVK,
+ ccc,
+ secrets,
+ configMaps,
+ )
+ if err != nil {
+ applyErrors = append(applyErrors, err)
+ }
+ }
+
+ applyError := kutilerrors.NewAggregate(applyErrors)
+ if applyError != nil {
+ return progressingConditions, applyError
+ }
+
+ return progressingConditions, nil
+}
diff --git a/pkg/controller/scylladbmonitoring/sync_grafana_test.go b/pkg/controller/scylladbmonitoring/sync_grafana_test.go
new file mode 100644
index 00000000000..d52160bca15
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync_grafana_test.go
@@ -0,0 +1,127 @@
+package scylladbmonitoring
+
+import (
+ "reflect"
+ "strings"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// Test_makeGrafanaIngress is a table-driven test that renders the Grafana
+// Ingress for a ScyllaDBMonitoring and compares the rendered manifest string
+// and the returned error with the expected values.
+func Test_makeGrafanaIngress(t *testing.T) {
+ tt := []struct {
+ name string
+ sm *scyllav1alpha1.ScyllaDBMonitoring
+ expectedString string
+ expectedErr error
+ }{
+ {
+ // No annotations and no ingress class are set; both render as null.
+ name: "empty annotations",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{
+ Components: &scyllav1alpha1.Components{
+ Grafana: &scyllav1alpha1.GrafanaSpec{
+ ExposeOptions: &scyllav1alpha1.GrafanaExposeOptions{
+ WebInterface: &scyllav1alpha1.HTTPSExposeOptions{
+ Ingress: &scyllav1alpha1.IngressOptions{
+ DNSDomains: []string{"grafana.localhost"},
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: "sm-name-grafana"
+ annotations:
+ null
+spec:
+ ingressClassName: null
+ rules:
+ - host: "grafana.localhost"
+ http:
+ paths:
+ - backend:
+ service:
+ name: "sm-name-grafana"
+ port:
+ number: 3000
+ path: /
+ pathType: Prefix
+`, "\n"),
+ expectedErr: nil,
+ },
+ {
+ // The supplied annotations are expected to appear in the rendered metadata.
+ name: "supplied annotations",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{
+ Components: &scyllav1alpha1.Components{
+ Grafana: &scyllav1alpha1.GrafanaSpec{
+ ExposeOptions: &scyllav1alpha1.GrafanaExposeOptions{
+ WebInterface: &scyllav1alpha1.HTTPSExposeOptions{
+ Ingress: &scyllav1alpha1.IngressOptions{
+ Annotations: map[string]string{
+ "ann1": "ann1val",
+ "ann2": "ann2val",
+ },
+ DNSDomains: []string{"grafana.localhost"},
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: "sm-name-grafana"
+ annotations:
+ ann1: ann1val
+ ann2: ann2val
+spec:
+ ingressClassName: null
+ rules:
+ - host: "grafana.localhost"
+ http:
+ paths:
+ - backend:
+ service:
+ name: "sm-name-grafana"
+ port:
+ number: 3000
+ path: /
+ pathType: Prefix
+`, "\n"),
+ expectedErr: nil,
+ },
+ }
+ for _, tc := range tt {
+ t.Run(tc.name, func(t *testing.T) {
+ // Only the rendered string and the error are checked; the decoded object is ignored.
+ _, objString, err := makeGrafanaIngress(tc.sm)
+ if !reflect.DeepEqual(err, tc.expectedErr) {
+ t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString)
+ }
+
+ // The diff is computed per line to keep the failure output readable.
+ if objString != tc.expectedString {
+ t.Errorf("expected and got strings differ:\n%s", cmp.Diff(
+ strings.Split(tc.expectedString, "\n"),
+ strings.Split(objString, "\n"),
+ ))
+ }
+ })
+ }
+}
diff --git a/pkg/controller/scylladbmonitoring/sync_prometheus.go b/pkg/controller/scylladbmonitoring/sync_prometheus.go
new file mode 100644
index 00000000000..6a6f2921da4
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync_prometheus.go
@@ -0,0 +1,543 @@
+package scylladbmonitoring
+
+import (
+ "context"
+ "crypto/x509/pkix"
+ "fmt"
+ "time"
+
+ prometheusv1assets "github.com/scylladb/scylla-operator/assets/monitoring/prometheus/v1"
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ "github.com/scylladb/scylla-operator/pkg/controllerhelpers"
+ ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ "github.com/scylladb/scylla-operator/pkg/helpers"
+ okubecrypto "github.com/scylladb/scylla-operator/pkg/kubecrypto"
+ "github.com/scylladb/scylla-operator/pkg/naming"
+ "github.com/scylladb/scylla-operator/pkg/resource"
+ "github.com/scylladb/scylla-operator/pkg/resourceapply"
+ "github.com/scylladb/scylla-operator/pkg/resourcemerge"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ rbacv1 "k8s.io/api/rbac/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/labels"
+ kutilerrors "k8s.io/apimachinery/pkg/util/errors"
+ "k8s.io/utils/pointer"
+)
+
+// getPrometheusLabels returns the label set used for objects belonging to the
+// Prometheus part of the given ScyllaDBMonitoring: the result of getLabels
+// merged with the controller-name label set to "prometheus".
+func getPrometheusLabels(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Set {
+	baseLabels := getLabels(sm)
+	prometheusLabels := labels.Set{
+		naming.ControllerNameLabel: "prometheus",
+	}
+
+	return helpers.MergeMaps(baseLabels, prometheusLabels)
+}
+
+// getPrometheusSelector builds a label selector out of the label set returned
+// by getPrometheusLabels for the given ScyllaDBMonitoring.
+func getPrometheusSelector(sm *scyllav1alpha1.ScyllaDBMonitoring) labels.Selector {
+	prometheusLabels := getPrometheusLabels(sm)
+
+	return labels.SelectorFromSet(prometheusLabels)
+}
+
+// getPrometheusSpec returns the Prometheus component spec of the given
+// ScyllaDBMonitoring, or nil when no components are specified.
+func getPrometheusSpec(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.PrometheusSpec {
+	components := sm.Spec.Components
+	if components == nil {
+		return nil
+	}
+
+	return components.Prometheus
+}
+
+// getPrometheusIngressOptions returns the ingress options of the Prometheus web
+// interface, or nil when any level of the path
+// spec -> ExposeOptions -> WebInterface is unset.
+func getPrometheusIngressOptions(sm *scyllav1alpha1.ScyllaDBMonitoring) *scyllav1alpha1.IngressOptions {
+	prometheusSpec := getPrometheusSpec(sm)
+	if prometheusSpec == nil {
+		return nil
+	}
+
+	exposeOptions := prometheusSpec.ExposeOptions
+	if exposeOptions == nil || exposeOptions.WebInterface == nil {
+		return nil
+	}
+
+	return exposeOptions.WebInterface.Ingress
+}
+
+// getPrometheusIngressDomains returns the DNS domains from the Prometheus
+// ingress options, or nil when no ingress options are set.
+func getPrometheusIngressDomains(sm *scyllav1alpha1.ScyllaDBMonitoring) []string {
+	opts := getPrometheusIngressOptions(sm)
+	if opts == nil {
+		return nil
+	}
+
+	return opts.DNSDomains
+}
+
+// makePrometheusSA renders PrometheusSATemplate for the given ScyllaDBMonitoring
+// and returns whatever RenderObject yields: the ServiceAccount, a string and an
+// error.
+func makePrometheusSA(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.ServiceAccount, string, error) {
+	inputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"namespace":              sm.Namespace,
+	}
+
+	return prometheusv1assets.PrometheusSATemplate.RenderObject(inputs)
+}
+
+// makePrometheusRoleBinding renders PrometheusRoleBindingTemplate for the given
+// ScyllaDBMonitoring and returns whatever RenderObject yields: the RoleBinding,
+// a string and an error.
+func makePrometheusRoleBinding(sm *scyllav1alpha1.ScyllaDBMonitoring) (*rbacv1.RoleBinding, string, error) {
+	inputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"namespace":              sm.Namespace,
+	}
+
+	return prometheusv1assets.PrometheusRoleBindingTemplate.RenderObject(inputs)
+}
+
+// makePrometheusService renders PrometheusServiceTemplate for the given
+// ScyllaDBMonitoring and returns whatever RenderObject yields: the Service, a
+// string and an error.
+func makePrometheusService(sm *scyllav1alpha1.ScyllaDBMonitoring) (*corev1.Service, string, error) {
+	inputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"namespace":              sm.Namespace,
+	}
+
+	return prometheusv1assets.PrometheusServiceTemplate.RenderObject(inputs)
+}
+
+// makeScyllaDBServiceMonitor renders ScyllaDBServiceMonitorTemplate using the
+// monitoring name and its endpoints selector, and returns whatever RenderObject
+// yields: the ServiceMonitor, a string and an error.
+func makeScyllaDBServiceMonitor(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.ServiceMonitor, string, error) {
+	inputs := map[string]any{
+		"endpointsSelector":      sm.Spec.EndpointsSelector,
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return prometheusv1assets.ScyllaDBServiceMonitorTemplate.RenderObject(inputs)
+}
+
+// makeRecodingPrometheusRule renders RecordingPrometheusRuleTemplate for the
+// given ScyllaDBMonitoring and returns whatever RenderObject yields: the
+// PrometheusRule, a string and an error.
+func makeRecodingPrometheusRule(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.PrometheusRule, string, error) {
+	inputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return prometheusv1assets.RecordingPrometheusRuleTemplate.RenderObject(inputs)
+}
+
+// makeAlertsPrometheusRule renders AlertsPrometheusRuleTemplate for the given
+// ScyllaDBMonitoring and returns whatever RenderObject yields: the
+// PrometheusRule, a string and an error.
+func makeAlertsPrometheusRule(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.PrometheusRule, string, error) {
+	inputs := map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+	}
+
+	return prometheusv1assets.AlertsPrometheusRuleTemplate.RenderObject(inputs)
+}
+
+// makePrometheus renders PrometheusTemplate for the given ScyllaDBMonitoring.
+//
+// Template inputs derived from the optional Prometheus spec:
+//   - volumeClaimTemplate: set only when storage is specified; it is named
+//     "<monitoring name>-prometheus" and carries the labels, annotations and
+//     spec of the user-supplied volume claim template.
+//   - affinity / tolerations: taken from the placement, when specified.
+//   - resources: taken from the spec, when specified.
+//
+// Without a spec all of these keep their zero values. The function returns
+// whatever RenderObject yields: the Prometheus object, a string and an error.
+func makePrometheus(sm *scyllav1alpha1.ScyllaDBMonitoring) (*monitoringv1.Prometheus, string, error) {
+	var (
+		volumeClaimTemplate *monitoringv1.EmbeddedPersistentVolumeClaim
+		affinity            corev1.Affinity
+		tolerations         []corev1.Toleration
+		resources           corev1.ResourceRequirements
+	)
+
+	if prometheusSpec := getPrometheusSpec(sm); prometheusSpec != nil {
+		resources = prometheusSpec.Resources
+
+		if storage := prometheusSpec.Storage; storage != nil {
+			userTemplate := &storage.VolumeClaimTemplate
+			volumeClaimTemplate = &monitoringv1.EmbeddedPersistentVolumeClaim{
+				EmbeddedObjectMetadata: monitoringv1.EmbeddedObjectMetadata{
+					Name:        fmt.Sprintf("%s-prometheus", sm.Name),
+					Labels:      userTemplate.Labels,
+					Annotations: userTemplate.Annotations,
+				},
+				Spec: userTemplate.Spec,
+			}
+		}
+
+		if placement := prometheusSpec.Placement; placement != nil {
+			affinity = corev1.Affinity{
+				NodeAffinity:    placement.NodeAffinity,
+				PodAffinity:     placement.PodAffinity,
+				PodAntiAffinity: placement.PodAntiAffinity,
+			}
+			tolerations = placement.Tolerations
+		}
+	}
+
+	inputs := map[string]any{
+		"namespace":              sm.Namespace,
+		"scyllaDBMonitoringName": sm.Name,
+		"volumeClaimTemplate":    volumeClaimTemplate,
+		"affinity":               affinity,
+		"tolerations":            tolerations,
+		"resources":              resources,
+	}
+
+	return prometheusv1assets.PrometheusTemplate.RenderObject(inputs)
+}
+
+// makePrometheusIngress renders PrometheusIngressTemplate for the given
+// ScyllaDBMonitoring.
+//
+// It returns (nil, "", nil), meaning no ingress is required, when:
+//   - no ingress options are specified,
+//   - the ingress is explicitly disabled, or
+//   - no DNS domains are configured.
+//
+// Otherwise it returns whatever RenderObject yields: the Ingress, a string and
+// an error.
+func makePrometheusIngress(sm *scyllav1alpha1.ScyllaDBMonitoring) (*networkingv1.Ingress, string, error) {
+	ingressOptions := getPrometheusIngressOptions(sm)
+	if ingressOptions == nil {
+		return nil, "", nil
+	}
+
+	// Disabled is an optional flag; an unset (nil) value means "not disabled".
+	if ingressOptions.Disabled != nil && *ingressOptions.Disabled {
+		return nil, "", nil
+	}
+
+	if len(ingressOptions.DNSDomains) == 0 {
+		return nil, "", nil
+	}
+
+	return prometheusv1assets.PrometheusIngressTemplate.RenderObject(map[string]any{
+		"scyllaDBMonitoringName": sm.Name,
+		"dnsDomains":             ingressOptions.DNSDomains,
+		"ingressAnnotations":     ingressOptions.Annotations,
+		"ingressClassName":       ingressOptions.IngressClassName,
+	})
+}
+
+// syncPrometheus reconciles the Prometheus-related objects of a ScyllaDBMonitoring.
+//
+// It works in three phases and returns early with an aggregated error when a
+// phase fails:
+//  1. render the required manifests from templates,
+//  2. prune existing objects that are not among the required ones,
+//  3. apply the required objects and manage the serving and client
+//     certificate chains.
+//
+// The map parameters hold the existing objects of each kind (presumably keyed
+// by object name and already filtered to this component - confirm against the
+// caller).
+//
+// It returns the progressing conditions collected for objects changed during
+// the apply phase, together with the aggregated error of the failing phase.
+func (smc *Controller) syncPrometheus(
+ ctx context.Context,
+ sm *scyllav1alpha1.ScyllaDBMonitoring,
+ configMaps map[string]*corev1.ConfigMap,
+ secrets map[string]*corev1.Secret,
+ services map[string]*corev1.Service,
+ serviceAccounts map[string]*corev1.ServiceAccount,
+ roleBindings map[string]*rbacv1.RoleBinding,
+ ingresses map[string]*networkingv1.Ingress,
+ prometheuses map[string]*monitoringv1.Prometheus,
+ prometheusRules map[string]*monitoringv1.PrometheusRule,
+ serviceMonitors map[string]*monitoringv1.ServiceMonitor,
+) ([]metav1.Condition, error) {
+ var progressingConditions []metav1.Condition
+
+ // Serving certificate chain: the CA uses a 10-year validity with an 8-year
+ // refresh setting; the serving certificate uses 30 days / 20 days and is
+ // issued for "<name>-prometheus", "<name>-prometheus.<namespace>.svc" and
+ // any configured ingress domains.
+ prometheusServingCertChainConfig := &okubecrypto.CertChainConfig{
+ CAConfig: &okubecrypto.CAConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-serving-ca", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ Validity: 10 * 365 * 24 * time.Hour,
+ Refresh: 8 * 365 * 24 * time.Hour,
+ },
+ CABundleConfig: &okubecrypto.CABundleConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-serving-ca", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ },
+ CertConfigs: []*okubecrypto.CertificateConfig{
+ {
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-serving-certs", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ Validity: 30 * 24 * time.Hour,
+ Refresh: 20 * 24 * time.Hour,
+ CertCreator: (&ocrypto.ServingCertCreatorConfig{
+ Subject: pkix.Name{
+ CommonName: "",
+ },
+ IPAddresses: nil,
+ DNSNames: append(
+ []string{
+ fmt.Sprintf("%s-prometheus", sm.Name),
+ fmt.Sprintf("%s-prometheus.%s.svc", sm.Name, sm.Namespace),
+ },
+ getPrometheusIngressDomains(sm)...,
+ ),
+ }).ToCreator(),
+ },
+ },
+ }
+
+ // Client certificate chain: both the CA and the single client certificate
+ // ("<name>-prometheus-client-grafana", DNS name "grafana") use a 10-year
+ // validity with an 8-year refresh setting.
+ prometheusClientCertChainConfig := &okubecrypto.CertChainConfig{
+ CAConfig: &okubecrypto.CAConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-client-ca", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ Validity: 10 * 365 * 24 * time.Hour,
+ Refresh: 8 * 365 * 24 * time.Hour,
+ },
+ CABundleConfig: &okubecrypto.CABundleConfig{
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-client-ca", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ },
+ CertConfigs: []*okubecrypto.CertificateConfig{
+ {
+ MetaConfig: okubecrypto.MetaConfig{
+ Name: fmt.Sprintf("%s-prometheus-client-grafana", sm.Name),
+ Labels: getPrometheusLabels(sm),
+ },
+ Validity: 10 * 365 * 24 * time.Hour,
+ Refresh: 8 * 365 * 24 * time.Hour,
+ CertCreator: (&ocrypto.ClientCertCreatorConfig{
+ Subject: pkix.Name{
+ CommonName: "",
+ },
+ DNSNames: []string{"grafana"},
+ }).ToCreator(),
+ },
+ },
+ }
+
+ certChainConfigs := okubecrypto.CertChainConfigs{
+ prometheusServingCertChainConfig,
+ prometheusClientCertChainConfig,
+ }
+
+ // Render manifests.
+ // Every render error (including nil ones) is collected so that all
+ // templates are attempted before the aggregate is checked.
+ var renderErrors []error
+
+ requiredPrometheusSA, _, err := makePrometheusSA(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredPrometheusRoleBinding, _, err := makePrometheusRoleBinding(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredPrometheusService, _, err := makePrometheusService(sm)
+ renderErrors = append(renderErrors, err)
+
+ // requiredIngress is nil when no ingress should exist.
+ requiredIngress, _, err := makePrometheusIngress(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredPrometheus, _, err := makePrometheus(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredRecodingPrometheusRule, _, err := makeRecodingPrometheusRule(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredAlertsPrometheusRule, _, err := makeAlertsPrometheusRule(sm)
+ renderErrors = append(renderErrors, err)
+
+ requiredScyllaDBServiceMonitor, _, err := makeScyllaDBServiceMonitor(sm)
+ renderErrors = append(renderErrors, err)
+
+ renderError := kutilerrors.NewAggregate(renderErrors)
+ if renderError != nil {
+ return progressingConditions, renderError
+ }
+
+ // Prune objects.
+ // Each call deletes the existing objects of one kind whose name does not
+ // match any of the required objects of that kind.
+ var pruneErrors []error
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredPrometheusSA),
+ serviceAccounts,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredPrometheusService),
+ services,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredPrometheusRoleBinding),
+ roleBindings,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredPrometheus),
+ prometheuses,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ // A nil requiredIngress is filtered out here, leaving no required
+ // ingresses, so existing ingresses in the map are pruned.
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.FilterOutNil(helpers.ToArray(requiredIngress)),
+ ingresses,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredRecodingPrometheusRule, requiredAlertsPrometheusRule),
+ prometheusRules,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ helpers.ToArray(requiredScyllaDBServiceMonitor),
+ serviceMonitors,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ // Secrets and config maps are pruned against the names declared by the
+ // certificate chain configurations above.
+ err = controllerhelpers.Prune(
+ ctx,
+ certChainConfigs.GetMetaSecrets(),
+ secrets,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().Secrets(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ err = controllerhelpers.Prune(
+ ctx,
+ certChainConfigs.GetMetaConfigMaps(),
+ configMaps,
+ &controllerhelpers.PruneControlFuncs{
+ DeleteFunc: smc.kubeClient.CoreV1().ConfigMaps(sm.Namespace).Delete,
+ },
+ smc.eventRecorder,
+ )
+ pruneErrors = append(pruneErrors, err)
+
+ pruneError := kutilerrors.NewAggregate(pruneErrors)
+ if pruneError != nil {
+ return progressingConditions, pruneError
+ }
+
+ // Apply required objects.
+ var applyErrors []error
+ applyConfigurations := []resourceapply.ApplyConfigUntyped{
+ resourceapply.ApplyConfig[*corev1.ServiceAccount]{
+ Required: requiredPrometheusSA,
+ Control: resourceapply.ApplyControlFuncs[*corev1.ServiceAccount]{
+ GetCachedFunc: smc.serviceAccountLister.ServiceAccounts(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.CoreV1().ServiceAccounts(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ // NOTE(review): unlike the other apply configs, no DeleteFunc is set
+ // for the Service - confirm this is intentional.
+ resourceapply.ApplyConfig[*corev1.Service]{
+ Required: requiredPrometheusService,
+ Control: resourceapply.ApplyControlFuncs[*corev1.Service]{
+ GetCachedFunc: smc.serviceLister.Services(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.CoreV1().Services(sm.Namespace).Update,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*rbacv1.RoleBinding]{
+ Required: requiredPrometheusRoleBinding,
+ Control: resourceapply.ApplyControlFuncs[*rbacv1.RoleBinding]{
+ GetCachedFunc: smc.roleBindingLister.RoleBindings(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.RbacV1().RoleBindings(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*monitoringv1.Prometheus]{
+ Required: requiredPrometheus,
+ Control: resourceapply.ApplyControlFuncs[*monitoringv1.Prometheus]{
+ GetCachedFunc: smc.prometheusLister.Prometheuses(sm.Namespace).Get,
+ CreateFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Create,
+ UpdateFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Update,
+ DeleteFunc: smc.monitoringClient.Prometheuses(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*monitoringv1.ServiceMonitor]{
+ Required: requiredScyllaDBServiceMonitor,
+ Control: resourceapply.ApplyControlFuncs[*monitoringv1.ServiceMonitor]{
+ GetCachedFunc: smc.serviceMonitorLister.ServiceMonitors(sm.Namespace).Get,
+ CreateFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Create,
+ UpdateFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Update,
+ DeleteFunc: smc.monitoringClient.ServiceMonitors(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*monitoringv1.PrometheusRule]{
+ Required: requiredRecodingPrometheusRule,
+ Control: resourceapply.ApplyControlFuncs[*monitoringv1.PrometheusRule]{
+ GetCachedFunc: smc.prometheusRuleLister.PrometheusRules(sm.Namespace).Get,
+ CreateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Create,
+ UpdateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Update,
+ DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ resourceapply.ApplyConfig[*monitoringv1.PrometheusRule]{
+ Required: requiredAlertsPrometheusRule,
+ Control: resourceapply.ApplyControlFuncs[*monitoringv1.PrometheusRule]{
+ GetCachedFunc: smc.prometheusRuleLister.PrometheusRules(sm.Namespace).Get,
+ CreateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Create,
+ UpdateFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Update,
+ DeleteFunc: smc.monitoringClient.PrometheusRules(sm.Namespace).Delete,
+ },
+ }.ToUntyped(),
+ }
+
+ // The ingress is optional and is only applied when one was rendered.
+ if requiredIngress != nil {
+ applyConfigurations = append(applyConfigurations, resourceapply.ApplyConfig[*networkingv1.Ingress]{
+ Required: requiredIngress,
+ Control: resourceapply.ApplyControlFuncs[*networkingv1.Ingress]{
+ GetCachedFunc: smc.ingressLister.Ingresses(sm.Namespace).Get,
+ CreateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Create,
+ UpdateFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Update,
+ DeleteFunc: smc.kubeClient.NetworkingV1().Ingresses(sm.Namespace).Delete,
+ },
+ }.ToUntyped())
+ }
+
+ for _, cfg := range applyConfigurations {
+ // Enforce namespace.
+ cfg.Required.SetNamespace(sm.Namespace)
+
+ // Enforce labels for selection.
+ if cfg.Required.GetLabels() == nil {
+ cfg.Required.SetLabels(getPrometheusLabels(sm))
+ } else {
+ resourcemerge.MergeMapInPlaceWithoutRemovalKeys(cfg.Required.GetLabels(), getPrometheusLabels(sm))
+ }
+
+ // Set ControllerRef.
+ cfg.Required.SetOwnerReferences([]metav1.OwnerReference{
+ {
+ APIVersion: scylladbMonitoringControllerGVK.GroupVersion().String(),
+ Kind: scylladbMonitoringControllerGVK.Kind,
+ Name: sm.Name,
+ UID: sm.UID,
+ Controller: pointer.Bool(true),
+ BlockOwnerDeletion: pointer.Bool(true),
+ },
+ })
+
+ // Apply required object.
+ // A failing apply does not stop the loop; errors are collected and the
+ // remaining objects are still applied.
+ _, changed, err := resourceapply.ApplyFromConfig(ctx, cfg, smc.eventRecorder)
+ if changed {
+ controllerhelpers.AddGenericProgressingStatusCondition(&progressingConditions, prometheusControllerProgressingCondition, cfg.Required, "apply", sm.Generation)
+ }
+ if err != nil {
+ gvk := resource.GetObjectGVKOrUnknown(cfg.Required)
+ applyErrors = append(applyErrors, fmt.Errorf("can't apply %s: %w", gvk, err))
+ }
+ }
+
+ // Manage the certificate chains; their errors are collected together with
+ // the apply errors.
+ cm := okubecrypto.NewCertificateManager(
+ smc.kubeClient.CoreV1(),
+ smc.secretLister,
+ smc.kubeClient.CoreV1(),
+ smc.configMapLister,
+ smc.eventRecorder,
+ )
+ for _, ccc := range certChainConfigs {
+ applyErrors = append(applyErrors, cm.ManageCertificateChain(
+ ctx,
+ time.Now,
+ &sm.ObjectMeta,
+ scylladbMonitoringControllerGVK,
+ ccc,
+ secrets,
+ configMaps,
+ ))
+ }
+
+ applyError := kutilerrors.NewAggregate(applyErrors)
+ if applyError != nil {
+ return progressingConditions, applyError
+ }
+
+ return progressingConditions, nil
+}
diff --git a/pkg/controller/scylladbmonitoring/sync_prometheus_test.go b/pkg/controller/scylladbmonitoring/sync_prometheus_test.go
new file mode 100644
index 00000000000..a921d1dfe9d
--- /dev/null
+++ b/pkg/controller/scylladbmonitoring/sync_prometheus_test.go
@@ -0,0 +1,395 @@
+package scylladbmonitoring
+
+import (
+ "reflect"
+ "strings"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/utils/pointer"
+)
+
+// Test_makeScyllaDBServiceMonitor checks the manifest string rendered by
+// makeScyllaDBServiceMonitor, once with an empty endpoints selector and once
+// with a selector that uses both matchLabels and matchExpressions.
+func Test_makeScyllaDBServiceMonitor(t *testing.T) {
+ tt := []struct {
+ name string
+ sm *scyllav1alpha1.ScyllaDBMonitoring
+ expectedString string
+ expectedErr error
+ }{
+ {
+ name: "empty selector",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: "sm-name-scylladb"
+spec:
+ selector:
+ {}
+ jobLabel: scylla/cluster
+ endpoints:
+ - port: node-exporter
+ honorLabels: false
+ relabelings:
+ - sourceLabels: [__address__]
+ regex: '(.*):\d+'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [__address__]
+ regex: '([^:]+)'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [instance]
+ regex: '(.*)'
+ targetLabel: __address__
+ replacement: '${1}:9100'
+ - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+ regex: '(.+)'
+ targetLabel: cluster
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+ regex: '(.+)'
+ targetLabel: dc
+ replacement: '${1}'
+ - port: prometheus
+ honorLabels: false
+ metricRelabelings:
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: CPU
+ replacement: 'cpu'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: CQL
+ replacement: 'cql'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: OS
+ replacement: 'os'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: IO
+ replacement: 'io'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: Errors
+ replacement: 'errors'
+ - regex: 'help|exported_instance'
+ action: labeldrop
+ - sourceLabels: [version]
+ regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*'
+ replacement: '$1$2'
+ targetLabel: svr
+ relabelings:
+ - sourceLabels: [__address__]
+ regex: '(.*):.+'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+ regex: '(.+)'
+ targetLabel: cluster
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+ regex: '(.+)'
+ targetLabel: dc
+ replacement: '${1}'
+`, "\n"),
+ expectedErr: nil,
+ },
+ {
+ name: "specific selector",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{
+ EndpointsSelector: metav1.LabelSelector{
+ MatchLabels: map[string]string{
+ "foo": "bar",
+ },
+ MatchExpressions: []metav1.LabelSelectorRequirement{
+ {
+ Key: "alpha",
+ Operator: metav1.LabelSelectorOpExists,
+ Values: []string{"beta"},
+ },
+ },
+ },
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: "sm-name-scylladb"
+spec:
+ selector:
+ matchExpressions:
+ - key: alpha
+ operator: Exists
+ values:
+ - beta
+ matchLabels:
+ foo: bar
+ jobLabel: scylla/cluster
+ endpoints:
+ - port: node-exporter
+ honorLabels: false
+ relabelings:
+ - sourceLabels: [__address__]
+ regex: '(.*):\d+'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [__address__]
+ regex: '([^:]+)'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [instance]
+ regex: '(.*)'
+ targetLabel: __address__
+ replacement: '${1}:9100'
+ - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+ regex: '(.+)'
+ targetLabel: cluster
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+ regex: '(.+)'
+ targetLabel: dc
+ replacement: '${1}'
+ - port: prometheus
+ honorLabels: false
+ metricRelabelings:
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: CPU
+ replacement: 'cpu'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: CQL
+ replacement: 'cql'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: OS
+ replacement: 'os'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: IO
+ replacement: 'io'
+ - sourceLabels: [version]
+ regex: '(.+)'
+ targetLabel: Errors
+ replacement: 'errors'
+ - regex: 'help|exported_instance'
+ action: labeldrop
+ - sourceLabels: [version]
+ regex: '([0-9]+\.[0-9]+)(\.?[0-9]*).*'
+ replacement: '$1$2'
+ targetLabel: svr
+ relabelings:
+ - sourceLabels: [__address__]
+ regex: '(.*):.+'
+ targetLabel: instance
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_service_label_scylla_cluster]
+ regex: '(.+)'
+ targetLabel: cluster
+ replacement: '${1}'
+ - sourceLabels: [__meta_kubernetes_pod_label_scylla_datacenter]
+ regex: '(.+)'
+ targetLabel: dc
+ replacement: '${1}'
+`, "\n"),
+ expectedErr: nil,
+ },
+ }
+ for _, tc := range tt {
+ t.Run(tc.name, func(t *testing.T) {
+ // Only the rendered string and the error are checked; the returned
+ // object is ignored.
+ _, objString, err := makeScyllaDBServiceMonitor(tc.sm)
+ if !reflect.DeepEqual(err, tc.expectedErr) {
+ t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString)
+ }
+
+ // On mismatch both strings are split into lines before diffing.
+ if objString != tc.expectedString {
+ t.Errorf("expected and got strings differ:\n%s", cmp.Diff(
+ strings.Split(tc.expectedString, "\n"),
+ strings.Split(objString, "\n"),
+ ))
+ }
+ })
+ }
+}
+
+// Test_makePrometheus checks the manifest string rendered by makePrometheus,
+// once without any storage configuration and once with a Prometheus volume
+// claim template (storage class "pv-class", 5Gi storage request).
+func Test_makePrometheus(t *testing.T) {
+ tt := []struct {
+ name string
+ sm *scyllav1alpha1.ScyllaDBMonitoring
+ expectedString string
+ expectedErr error
+ }{
+ {
+ name: "no storage",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+ name: "sm-name"
+spec:
+ serviceAccountName: "sm-name-prometheus"
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ fsGroup: 65534
+ web:
+ pageTitle: "ScyllaDB Prometheus"
+ tlsConfig:
+ cert:
+ secret:
+ name: "sm-name-prometheus-serving-certs"
+ key: "tls.crt"
+ keySecret:
+ name: "sm-name-prometheus-serving-certs"
+ key: "tls.key"
+# clientAuthType: "RequireAndVerifyClientCert"
+# TODO: we need the prometheus-operator not to require certs only for /-/readyz or to do exec probes that can read certs
+ clientAuthType: "RequestClientCert"
+ client_ca:
+ configMap:
+ name: "sm-name-prometheus-client-ca"
+ key: "ca-bundle.crt"
+ httpConfig:
+ http2: true
+ serviceMonitorSelector:
+ matchLabels: {}
+ affinity:
+ {}
+ tolerations:
+ null
+ resources:
+ {}
+ alerting:
+ alertmanagers:
+ - namespace: ""
+ name: "sm-name"
+ port: web
+ ruleSelector:
+ matchLabels:
+ scylla-operator.scylladb.com/scylladbmonitoring-name: "sm-name"
+`, "\n"),
+ expectedErr: nil,
+ },
+ {
+ name: "with prometheus pvc template",
+ sm: &scyllav1alpha1.ScyllaDBMonitoring{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "sm-name",
+ },
+ Spec: scyllav1alpha1.ScyllaDBMonitoringSpec{
+ Components: &scyllav1alpha1.Components{
+ Prometheus: &scyllav1alpha1.PrometheusSpec{
+ Storage: &scyllav1alpha1.Storage{
+ VolumeClaimTemplate: corev1.PersistentVolumeClaimTemplate{
+ ObjectMeta: metav1.ObjectMeta{},
+ Spec: corev1.PersistentVolumeClaimSpec{
+ StorageClassName: pointer.String("pv-class"),
+ Resources: corev1.ResourceRequirements{
+ Requests: map[corev1.ResourceName]resource.Quantity{
+ corev1.ResourceStorage: resource.MustParse("5Gi"),
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ expectedString: strings.TrimLeft(`
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+ name: "sm-name"
+spec:
+ serviceAccountName: "sm-name-prometheus"
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ fsGroup: 65534
+ web:
+ pageTitle: "ScyllaDB Prometheus"
+ tlsConfig:
+ cert:
+ secret:
+ name: "sm-name-prometheus-serving-certs"
+ key: "tls.crt"
+ keySecret:
+ name: "sm-name-prometheus-serving-certs"
+ key: "tls.key"
+# clientAuthType: "RequireAndVerifyClientCert"
+# TODO: we need the prometheus-operator not to require certs only for /-/readyz or to do exec probes that can read certs
+ clientAuthType: "RequestClientCert"
+ client_ca:
+ configMap:
+ name: "sm-name-prometheus-client-ca"
+ key: "ca-bundle.crt"
+ httpConfig:
+ http2: true
+ serviceMonitorSelector:
+ matchLabels: {}
+ affinity:
+ {}
+ tolerations:
+ null
+ resources:
+ {}
+ alerting:
+ alertmanagers:
+ - namespace: ""
+ name: "sm-name"
+ port: web
+ ruleSelector:
+ matchLabels:
+ scylla-operator.scylladb.com/scylladbmonitoring-name: "sm-name"
+ storage:
+ volumeClaimTemplate:
+ metadata:
+ name: sm-name-prometheus
+ spec:
+ resources:
+ requests:
+ storage: 5Gi
+ storageClassName: pv-class
+ status: {}
+`, "\n"),
+ expectedErr: nil,
+ },
+ }
+ for _, tc := range tt {
+ t.Run(tc.name, func(t *testing.T) {
+ // Only the rendered string and the error are checked; the returned
+ // object is ignored.
+ _, objString, err := makePrometheus(tc.sm)
+ if !reflect.DeepEqual(err, tc.expectedErr) {
+ t.Errorf("expected and got errors differ:\n%s\nRendered object:\n%s", cmp.Diff(tc.expectedErr, err), objString)
+ }
+
+ // On mismatch both strings are split into lines before diffing.
+ if objString != tc.expectedString {
+ t.Errorf("expected and got strings differ:\n%s", cmp.Diff(
+ strings.Split(tc.expectedString, "\n"),
+ strings.Split(objString, "\n"),
+ ))
+ }
+ })
+ }
+}
diff --git a/pkg/controllerhelpers/prune.go b/pkg/controllerhelpers/prune.go
new file mode 100644
index 00000000000..8b1de3e041e
--- /dev/null
+++ b/pkg/controllerhelpers/prune.go
@@ -0,0 +1,65 @@
+package controllerhelpers
+
+import (
+ "context"
+
+ "github.com/scylladb/scylla-operator/pkg/kubeinterfaces"
+ "github.com/scylladb/scylla-operator/pkg/resource"
+ "github.com/scylladb/scylla-operator/pkg/resourceapply"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ utilerrors "k8s.io/apimachinery/pkg/util/errors"
+ "k8s.io/client-go/tools/record"
+ "k8s.io/klog/v2"
+)
+
+// PruneControlInterface is the deletion capability Prune uses to remove an
+// object by name.
+type PruneControlInterface interface {
+ Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
+}
+
+// PruneControlFuncs adapts a plain delete function to PruneControlInterface.
+type PruneControlFuncs struct {
+ // DeleteFunc is invoked by Delete with the arguments passed through as is.
+ DeleteFunc func(ctx context.Context, name string, opts metav1.DeleteOptions) error
+}
+
+// Delete forwards the call to DeleteFunc and returns its error unchanged.
+// DeleteFunc must be set; calling Delete with a nil DeleteFunc panics.
+func (funcs *PruneControlFuncs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
+	deleteFunc := funcs.DeleteFunc
+	return deleteFunc(ctx, name, opts)
+}
+
+// Compile-time check that *PruneControlFuncs implements PruneControlInterface.
+var _ PruneControlInterface = &PruneControlFuncs{}
+
+// Prune deletes every object in existingObjects whose name does not match the
+// name of any object in requiredObjects.
+//
+// Objects that already carry a deletion timestamp are left alone. Each delete
+// uses background propagation and a UID precondition taken from the existing
+// object, and is reported through ReportDeleteEvent regardless of its outcome.
+// A failed delete does not stop the iteration; all delete errors are returned
+// as one aggregate.
+func Prune[T kubeinterfaces.ObjectInterface](ctx context.Context, requiredObjects []T, existingObjects map[string]T, control PruneControlInterface, eventRecorder record.EventRecorder) error {
+	var deleteErrs []error
+
+existingLoop:
+	for _, obj := range existingObjects {
+		// Already being deleted, nothing more to do.
+		if obj.GetDeletionTimestamp() != nil {
+			continue
+		}
+
+		// Keep the object when any required object shares its name.
+		for _, req := range requiredObjects {
+			if obj.GetName() == req.GetName() {
+				continue existingLoop
+			}
+		}
+
+		objUID := obj.GetUID()
+		policy := metav1.DeletePropagationBackground
+		klog.V(2).InfoS("Pruning resource", "GVK", resource.GetObjectGVKOrUnknown(obj), "Ref", klog.KObj(obj))
+
+		deleteOptions := metav1.DeleteOptions{
+			Preconditions: &metav1.Preconditions{
+				UID: &objUID,
+			},
+			PropagationPolicy: &policy,
+		}
+		err := control.Delete(ctx, obj.GetName(), deleteOptions)
+		resourceapply.ReportDeleteEvent(eventRecorder, obj, err)
+		if err != nil {
+			deleteErrs = append(deleteErrs, err)
+		}
+	}
+
+	return utilerrors.NewAggregate(deleteErrs)
+}
diff --git a/pkg/controllerhelpers/selectors.go b/pkg/controllerhelpers/selectors.go
new file mode 100644
index 00000000000..e32e72ff047
--- /dev/null
+++ b/pkg/controllerhelpers/selectors.go
@@ -0,0 +1,18 @@
+package controllerhelpers
+
+import (
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/labels"
+)
+
+// FilterObjectMapByLabel returns a new map containing only the entries of
+// objects whose labels match the selector. The input map is not modified and
+// the result is never nil.
+func FilterObjectMapByLabel[T metav1.Object](objects map[string]T, selector labels.Selector) map[string]T {
+	matching := make(map[string]T)
+
+	for key, object := range objects {
+		objectLabels := labels.Set(object.GetLabels())
+		if !selector.Matches(objectLabels) {
+			continue
+		}
+
+		matching[key] = object
+	}
+
+	return matching
+}
diff --git a/pkg/helpers/array.go b/pkg/helpers/array.go
new file mode 100644
index 00000000000..749070e821c
--- /dev/null
+++ b/pkg/helpers/array.go
@@ -0,0 +1,40 @@
+package helpers
+
+// ToArray collects its variadic arguments into a newly allocated slice.
+// The result is never nil and does not share storage with the arguments.
+func ToArray[T any](objs ...T) []T {
+	items := make([]T, len(objs))
+	copy(items, objs)
+
+	return items
+}
+
+// ConvertToArray applies convert to every argument, in order, and returns the
+// results as a newly allocated, never-nil slice.
+func ConvertToArray[To, From any](convert func(From) To, objs ...From) []To {
+	converted := make([]To, len(objs))
+
+	for idx, obj := range objs {
+		converted[idx] = convert(obj)
+	}
+
+	return converted
+}
+
+// Filter returns a new slice holding, in their original order, the elements of
+// array for which filterFunc returns true. The result is never nil.
+func Filter[T any](array []T, filterFunc func(T) bool) []T {
+	kept := make([]T, 0, len(array))
+
+	for _, item := range array {
+		if !filterFunc(item) {
+			continue
+		}
+
+		kept = append(kept, item)
+	}
+
+	return kept
+}
+
+// FilterOut returns a new slice holding, in their original order, the elements
+// of array for which filterOutFunc returns false. The result is never nil.
+func FilterOut[T any](array []T, filterOutFunc func(T) bool) []T {
+	kept := make([]T, 0, len(array))
+
+	for _, item := range array {
+		if filterOutFunc(item) {
+			continue
+		}
+
+		kept = append(kept, item)
+	}
+
+	return kept
+}
+
+// FilterOutNil returns a new slice holding the non-nil pointers of array in
+// their original order. The result is never nil.
+func FilterOutNil[T any](array []*T) []*T {
+	nonNil := make([]*T, 0, len(array))
+
+	for _, item := range array {
+		if item != nil {
+			nonNil = append(nonNil, item)
+		}
+	}
+
+	return nonNil
+}
diff --git a/pkg/kubecrypto/certmanager.go b/pkg/kubecrypto/certmanager.go
index 86c21f8492c..e49772a281f 100644
--- a/pkg/kubecrypto/certmanager.go
+++ b/pkg/kubecrypto/certmanager.go
@@ -24,16 +24,36 @@ type MetaConfig struct {
Annotations map[string]string
}
+// GetObjectMeta returns a deep copy of an ObjectMeta carrying the configured
+// name, labels and annotations, so callers cannot mutate the config's maps
+// through the result.
+func (c *MetaConfig) GetObjectMeta() *metav1.ObjectMeta {
+	meta := metav1.ObjectMeta{
+		Name:        c.Name,
+		Labels:      c.Labels,
+		Annotations: c.Annotations,
+	}
+
+	return meta.DeepCopy()
+}
+
type CAConfig struct {
MetaConfig
Validity time.Duration
Refresh time.Duration
}
+// GetMetaSecret returns a Secret that carries only the object metadata derived
+// from this CA config.
+func (c *CAConfig) GetMetaSecret() *corev1.Secret {
+	secret := &corev1.Secret{}
+	secret.ObjectMeta = *c.GetObjectMeta()
+
+	return secret
+}
+
type CABundleConfig struct {
MetaConfig
}
+// GetMetaConfigMap returns a ConfigMap that carries only the object metadata
+// derived from this CA bundle config.
+func (c *CABundleConfig) GetMetaConfigMap() *corev1.ConfigMap {
+	configMap := &corev1.ConfigMap{}
+	configMap.ObjectMeta = *c.GetObjectMeta()
+
+	return configMap
+}
+
type CertificateConfig struct {
MetaConfig
Validity time.Duration
@@ -41,6 +61,57 @@ type CertificateConfig struct {
CertCreator ocrypto.CertCreator
}
+// GetMetaSecret returns a Secret that has only its ObjectMeta populated,
+// taken from the certificate config's GetObjectMeta.
+func (c *CertificateConfig) GetMetaSecret() *corev1.Secret {
+	secret := &corev1.Secret{}
+	secret.ObjectMeta = *c.GetObjectMeta()
+	return secret
+}
+
+// CertChainConfig groups a CA config, a CA bundle config and a list of
+// certificate configs that are handed to the certificate manager together
+// (see ManageCertificateChain).
+type CertChainConfig struct {
+ CAConfig *CAConfig
+ CABundleConfig *CABundleConfig
+ CertConfigs []*CertificateConfig
+}
+
+// GetMetaSecrets returns the metadata-only Secrets of the chain: the CA
+// config's secret first, followed by one secret per entry of CertConfigs in
+// order.
+func (c *CertChainConfig) GetMetaSecrets() []*corev1.Secret {
+	result := make([]*corev1.Secret, len(c.CertConfigs)+1)
+	result[0] = c.CAConfig.GetMetaSecret()
+	for idx := range c.CertConfigs {
+		result[idx+1] = c.CertConfigs[idx].GetMetaSecret()
+	}
+	return result
+}
+
+// GetMetaConfigMaps returns a single-element slice holding the metadata-only
+// ConfigMap described by CABundleConfig.
+func (c *CertChainConfig) GetMetaConfigMaps() []*corev1.ConfigMap {
+	caBundle := c.CABundleConfig.GetMetaConfigMap()
+	return []*corev1.ConfigMap{caBundle}
+}
+
+// CertChainConfigs is a list of certificate chain configurations; its methods
+// aggregate the results of the per-chain methods.
+type CertChainConfigs []*CertChainConfig
+
+// GetMetaSecrets concatenates the metadata-only Secrets of every chain, in
+// the order the chains appear in configs.
+func (configs CertChainConfigs) GetMetaSecrets() []*corev1.Secret {
+	all := make([]*corev1.Secret, 0, len(configs)*2)
+	for idx := range configs {
+		chainSecrets := configs[idx].GetMetaSecrets()
+		all = append(all, chainSecrets...)
+	}
+	return all
+}
+
+// GetMetaConfigMaps concatenates the metadata-only ConfigMaps of every chain,
+// in the order the chains appear in configs.
+func (configs CertChainConfigs) GetMetaConfigMaps() []*corev1.ConfigMap {
+	all := make([]*corev1.ConfigMap, 0, len(configs)*2)
+	for idx := range configs {
+		chainConfigMaps := configs[idx].GetMetaConfigMaps()
+		all = append(all, chainConfigMaps...)
+	}
+	return all
+}
+
type CertificateManager struct {
secretsClient corev1client.SecretsGetter
secretLister corev1listers.SecretLister
@@ -121,3 +192,7 @@ func (cm *CertificateManager) ManageCertificates(ctx context.Context, nowFunc fu
return nil
}
+
+// ManageCertificateChain is a convenience wrapper around ManageCertificates:
+// it unpacks the CA, CA bundle and certificate configs from certChainConfig
+// and forwards every other argument unchanged, returning ManageCertificates'
+// error as is.
+func (cm *CertificateManager) ManageCertificateChain(ctx context.Context, nowFunc func() time.Time, controller *metav1.ObjectMeta, controllerGVK schema.GroupVersionKind, certChainConfig *CertChainConfig, existingSecrets map[string]*corev1.Secret, existingConfigMaps map[string]*corev1.ConfigMap) error {
+	caConfig := certChainConfig.CAConfig
+	caBundleConfig := certChainConfig.CABundleConfig
+	certConfigs := certChainConfig.CertConfigs
+
+	return cm.ManageCertificates(ctx, nowFunc, controller, controllerGVK, caConfig, caBundleConfig, certConfigs, existingSecrets, existingConfigMaps)
+}
diff --git a/pkg/naming/constants.go b/pkg/naming/constants.go
index b7e04352aba..3c6a216e5af 100644
--- a/pkg/naming/constants.go
+++ b/pkg/naming/constants.go
@@ -62,6 +62,8 @@ const (
NodeConfigNameLabel = "scylla-operator.scylladb.com/node-config-name"
ConfigMapTypeLabel = "scylla-operator.scylladb.com/config-map-type"
OwnerUIDLabel = "scylla-operator.scylladb.com/owner-uid"
+ ScyllaDBMonitoringNameLabel = "scylla-operator.scylladb.com/scylladbmonitoring-name"
+ ControllerNameLabel = "scylla-operator.scylladb.com/controller-name"
AppName = "scylla"
OperatorAppName = "scylla-operator"
diff --git a/pkg/resourceapply/apps.go b/pkg/resourceapply/apps.go
index 430ac03748a..c7f0f553999 100644
--- a/pkg/resourceapply/apps.go
+++ b/pkg/resourceapply/apps.go
@@ -72,3 +72,35 @@ func ApplyDaemonSet(
options,
)
}
+
+// ApplyDeploymentWithControl applies the required Deployment by delegating to
+// ApplyGeneric with the supplied control, and returns ApplyGeneric's results
+// unchanged.
+func ApplyDeploymentWithControl(
+ ctx context.Context,
+ control ApplyControlInterface[*appsv1.Deployment],
+ recorder record.EventRecorder,
+ required *appsv1.Deployment,
+ options ApplyOptions,
+) (*appsv1.Deployment, bool, error) {
+ return ApplyGeneric[*appsv1.Deployment](ctx, control, recorder, required, options)
+}
+
+// ApplyDeployment applies the required Deployment through
+// ApplyDeploymentWithControl. Cached reads go through the lister and
+// create/update/delete calls go through the client, all scoped to
+// required.Namespace.
+func ApplyDeployment(
+	ctx context.Context,
+	client appsv1client.DeploymentsGetter,
+	lister appsv1listers.DeploymentLister,
+	recorder record.EventRecorder,
+	required *appsv1.Deployment,
+	options ApplyOptions,
+) (*appsv1.Deployment, bool, error) {
+	namespace := required.Namespace
+	deployments := client.Deployments(namespace)
+
+	control := ApplyControlFuncs[*appsv1.Deployment]{
+		GetCachedFunc: lister.Deployments(namespace).Get,
+		CreateFunc:    deployments.Create,
+		UpdateFunc:    deployments.Update,
+		DeleteFunc:    deployments.Delete,
+	}
+
+	return ApplyDeploymentWithControl(ctx, control, recorder, required, options)
+}
diff --git a/pkg/resourceapply/generic.go b/pkg/resourceapply/generic.go
new file mode 100644
index 00000000000..74e5161cf70
--- /dev/null
+++ b/pkg/resourceapply/generic.go
@@ -0,0 +1,152 @@
+package resourceapply
+
+import (
+ "context"
+ "fmt"
+
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ "github.com/scylladb/scylla-operator/pkg/kubeinterfaces"
+ appsv1 "k8s.io/api/apps/v1"
+ corev1 "k8s.io/api/core/v1"
+ networkingv1 "k8s.io/api/networking/v1"
+ rbacv1 "k8s.io/api/rbac/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/client-go/tools/record"
+)
+
+// ApplyConfigUntyped bundles everything ApplyFromConfig needs besides the
+// context and recorder: the required object, the apply options and an untyped
+// control.
+type ApplyConfigUntyped struct {
+ Required kubeinterfaces.ObjectInterface
+ Options ApplyOptions
+ Control ApplyControlUntypedInterface
+}
+
+// ApplyConfig is the typed counterpart of ApplyConfigUntyped for objects of
+// type T; ToUntyped converts it to the untyped form.
+type ApplyConfig[T kubeinterfaces.ObjectInterface] struct {
+ Required T
+ Options ApplyOptions
+ Control ApplyControlFuncs[T]
+}
+
+// ToUntyped converts the typed config into an ApplyConfigUntyped, carrying
+// over Required and Options and converting Control through its own ToUntyped.
+func (ac ApplyConfig[T]) ToUntyped() ApplyConfigUntyped {
+	var untyped ApplyConfigUntyped
+	untyped.Required = ac.Required
+	untyped.Options = ac.Options
+	untyped.Control = ac.Control.ToUntyped()
+	return untyped
+}
+
+// ApplyFromConfig applies cfg.Required by forwarding the config's fields and
+// the recorder to Apply, and returns Apply's results unchanged.
+func ApplyFromConfig(
+	ctx context.Context,
+	cfg ApplyConfigUntyped,
+	recorder record.EventRecorder,
+) (kubeinterfaces.ObjectInterface, bool, error) {
+	required, control, options := cfg.Required, cfg.Control, cfg.Options
+	return Apply(ctx, required, control, options, recorder)
+}
+
+// Apply dispatches on the concrete type of required and calls the matching
+// typed Apply*WithControl function, wrapping the untyped control with
+// TypeApplyControlInterface for that type. The typed function's results are
+// returned as is.
+//
+// An error is returned when required's type is not one of the handled types.
+func Apply(
+	ctx context.Context,
+	required kubeinterfaces.ObjectInterface,
+	control ApplyControlUntypedInterface,
+	options ApplyOptions,
+	recorder record.EventRecorder,
+) (kubeinterfaces.ObjectInterface, bool, error) {
+	// Binding the switch variable gives every case the already-asserted value.
+	switch typed := metav1.Object(required).(type) {
+	case *corev1.Service:
+		typedControl := TypeApplyControlInterface[*corev1.Service](control)
+		return ApplyServiceWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *corev1.ConfigMap:
+		typedControl := TypeApplyControlInterface[*corev1.ConfigMap](control)
+		return ApplyConfigMapWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *corev1.Secret:
+		typedControl := TypeApplyControlInterface[*corev1.Secret](control)
+		return ApplySecretWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *corev1.ServiceAccount:
+		typedControl := TypeApplyControlInterface[*corev1.ServiceAccount](control)
+		return ApplyServiceAccountWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *rbacv1.RoleBinding:
+		typedControl := TypeApplyControlInterface[*rbacv1.RoleBinding](control)
+		return ApplyRoleBindingWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *appsv1.Deployment:
+		typedControl := TypeApplyControlInterface[*appsv1.Deployment](control)
+		return ApplyDeploymentWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *networkingv1.Ingress:
+		typedControl := TypeApplyControlInterface[*networkingv1.Ingress](control)
+		return ApplyIngressWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *monitoringv1.Prometheus:
+		typedControl := TypeApplyControlInterface[*monitoringv1.Prometheus](control)
+		return ApplyPrometheusWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *monitoringv1.PrometheusRule:
+		typedControl := TypeApplyControlInterface[*monitoringv1.PrometheusRule](control)
+		return ApplyPrometheusRuleWithControl(ctx, typedControl, recorder, typed, options)
+
+	case *monitoringv1.ServiceMonitor:
+		typedControl := TypeApplyControlInterface[*monitoringv1.ServiceMonitor](control)
+		return ApplyServiceMonitorWithControl(ctx, typedControl, recorder, typed, options)
+
+	default:
+		return nil, false, fmt.Errorf("no apply method matched for type %T", required)
+	}
+}
diff --git a/pkg/resourceapply/helpers.go b/pkg/resourceapply/helpers.go
index ddbe762eb0f..93e513cf255 100644
--- a/pkg/resourceapply/helpers.go
+++ b/pkg/resourceapply/helpers.go
@@ -203,6 +203,35 @@ func (acf ApplyControlFuncs[T]) ToUntyped() ApplyControlUntypedFuncs {
var _ ApplyControlInterface[*corev1.Service] = ApplyControlFuncs[*corev1.Service]{}
+// TypeApplyControlInterface adapts an untyped apply control to the typed
+// ApplyControlInterface[T].
+//
+// GetCached, Create and Update forward to the untyped control and type-assert
+// the returned object to T. A nil result is translated into T's zero value so
+// the assertion is never attempted on nil; the error is passed through in
+// both cases. A non-nil result whose dynamic type is not T makes the
+// assertion panic. Delete forwards directly.
+func TypeApplyControlInterface[T kubeinterfaces.ObjectInterface](untyped ApplyControlUntypedInterface) ApplyControlInterface[T] {
+	return ApplyControlFuncs[T]{
+		GetCachedFunc: func(name string) (T, error) {
+			var zero T
+			obj, err := untyped.GetCached(name)
+			if obj != nil {
+				return obj.(T), err
+			}
+			return zero, err
+		},
+		CreateFunc: func(ctx context.Context, obj T, opts metav1.CreateOptions) (T, error) {
+			var zero T
+			created, err := untyped.Create(ctx, obj, opts)
+			if created != nil {
+				return created.(T), err
+			}
+			return zero, err
+		},
+		UpdateFunc: func(ctx context.Context, obj T, opts metav1.UpdateOptions) (T, error) {
+			var zero T
+			updated, err := untyped.Update(ctx, obj, opts)
+			if updated != nil {
+				return updated.(T), err
+			}
+			return zero, err
+		},
+		DeleteFunc: func(ctx context.Context, name string, opts metav1.DeleteOptions) error {
+			return untyped.Delete(ctx, name, opts)
+		},
+	}
+}
+
type ApplyOptions struct {
ForceOwnership bool
AllowMissingControllerRef bool
diff --git a/pkg/resourceapply/monitoring.go b/pkg/resourceapply/monitoring.go
new file mode 100644
index 00000000000..c1547de2659
--- /dev/null
+++ b/pkg/resourceapply/monitoring.go
@@ -0,0 +1,106 @@
+package resourceapply
+
+import (
+ "context"
+
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
+ monitoringv1client "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/clientset/versioned/typed/monitoring/v1"
+ monitoringv1listers "github.com/scylladb/scylla-operator/pkg/externalclient/monitoring/listers/monitoring/v1"
+ "k8s.io/client-go/tools/record"
+)
+
+// ApplyPrometheusWithControl applies the required Prometheus by delegating to
+// ApplyGeneric with the supplied control, and returns ApplyGeneric's results
+// unchanged.
+func ApplyPrometheusWithControl(
+ ctx context.Context,
+ control ApplyControlInterface[*monitoringv1.Prometheus],
+ recorder record.EventRecorder,
+ required *monitoringv1.Prometheus,
+ options ApplyOptions,
+) (*monitoringv1.Prometheus, bool, error) {
+ return ApplyGeneric[*monitoringv1.Prometheus](ctx, control, recorder, required, options)
+}
+
+// ApplyPrometheus applies the required Prometheus through
+// ApplyPrometheusWithControl. Cached reads go through the lister and
+// create/update/delete calls go through the client, all scoped to
+// required.Namespace.
+func ApplyPrometheus(
+	ctx context.Context,
+	client monitoringv1client.PrometheusesGetter,
+	lister monitoringv1listers.PrometheusLister,
+	recorder record.EventRecorder,
+	required *monitoringv1.Prometheus,
+	options ApplyOptions,
+) (*monitoringv1.Prometheus, bool, error) {
+	namespace := required.Namespace
+	prometheuses := client.Prometheuses(namespace)
+
+	control := ApplyControlFuncs[*monitoringv1.Prometheus]{
+		GetCachedFunc: lister.Prometheuses(namespace).Get,
+		CreateFunc:    prometheuses.Create,
+		UpdateFunc:    prometheuses.Update,
+		DeleteFunc:    prometheuses.Delete,
+	}
+
+	return ApplyPrometheusWithControl(ctx, control, recorder, required, options)
+}
+
+// ApplyPrometheusRuleWithControl applies the required PrometheusRule by
+// delegating to ApplyGeneric with the supplied control, and returns
+// ApplyGeneric's results unchanged.
+func ApplyPrometheusRuleWithControl(
+ ctx context.Context,
+ control ApplyControlInterface[*monitoringv1.PrometheusRule],
+ recorder record.EventRecorder,
+ required *monitoringv1.PrometheusRule,
+ options ApplyOptions,
+) (*monitoringv1.PrometheusRule, bool, error) {
+ return ApplyGeneric[*monitoringv1.PrometheusRule](ctx, control, recorder, required, options)
+}
+
+// ApplyPrometheusRule applies the required PrometheusRule through
+// ApplyPrometheusRuleWithControl. Cached reads go through the lister and
+// create/update/delete calls go through the client, all scoped to
+// required.Namespace.
+func ApplyPrometheusRule(
+	ctx context.Context,
+	client monitoringv1client.PrometheusRulesGetter,
+	lister monitoringv1listers.PrometheusRuleLister,
+	recorder record.EventRecorder,
+	required *monitoringv1.PrometheusRule,
+	options ApplyOptions,
+) (*monitoringv1.PrometheusRule, bool, error) {
+	namespace := required.Namespace
+	prometheusRules := client.PrometheusRules(namespace)
+
+	control := ApplyControlFuncs[*monitoringv1.PrometheusRule]{
+		GetCachedFunc: lister.PrometheusRules(namespace).Get,
+		CreateFunc:    prometheusRules.Create,
+		UpdateFunc:    prometheusRules.Update,
+		DeleteFunc:    prometheusRules.Delete,
+	}
+
+	return ApplyPrometheusRuleWithControl(ctx, control, recorder, required, options)
+}
+
+// ApplyServiceMonitorWithControl applies the required ServiceMonitor by
+// delegating to ApplyGeneric with the supplied control, and returns
+// ApplyGeneric's results unchanged.
+func ApplyServiceMonitorWithControl(
+ ctx context.Context,
+ control ApplyControlInterface[*monitoringv1.ServiceMonitor],
+ recorder record.EventRecorder,
+ required *monitoringv1.ServiceMonitor,
+ options ApplyOptions,
+) (*monitoringv1.ServiceMonitor, bool, error) {
+ return ApplyGeneric[*monitoringv1.ServiceMonitor](ctx, control, recorder, required, options)
+}
+
+// ApplyServiceMonitor applies the required ServiceMonitor through
+// ApplyServiceMonitorWithControl. Cached reads go through the lister and
+// create/update/delete calls go through the client, all scoped to
+// required.Namespace.
+func ApplyServiceMonitor(
+	ctx context.Context,
+	client monitoringv1client.ServiceMonitorsGetter,
+	lister monitoringv1listers.ServiceMonitorLister,
+	recorder record.EventRecorder,
+	required *monitoringv1.ServiceMonitor,
+	options ApplyOptions,
+) (*monitoringv1.ServiceMonitor, bool, error) {
+	namespace := required.Namespace
+	serviceMonitors := client.ServiceMonitors(namespace)
+
+	control := ApplyControlFuncs[*monitoringv1.ServiceMonitor]{
+		GetCachedFunc: lister.ServiceMonitors(namespace).Get,
+		CreateFunc:    serviceMonitors.Create,
+		UpdateFunc:    serviceMonitors.Update,
+		DeleteFunc:    serviceMonitors.Delete,
+	}
+
+	return ApplyServiceMonitorWithControl(ctx, control, recorder, required, options)
+}
diff --git a/pkg/scheme/scheme.go b/pkg/scheme/scheme.go
index 1c28a0240fe..35cae5f000b 100644
--- a/pkg/scheme/scheme.go
+++ b/pkg/scheme/scheme.go
@@ -2,6 +2,8 @@ package scheme
import (
scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1"
+ scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ monitoringv1 "github.com/scylladb/scylla-operator/pkg/externalapi/monitoring/v1"
cqlclientv1alpha1 "github.com/scylladb/scylla-operator/pkg/scylla/api/cqlclient/v1alpha1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/serializer"
@@ -29,5 +31,9 @@ func init() {
utilruntime.Must(kscheme.AddToScheme(Scheme))
utilruntime.Must(scyllav1.Install(Scheme))
+ utilruntime.Must(scyllav1alpha1.Install(Scheme))
+
utilruntime.Must(cqlclientv1alpha1.Install(Scheme))
+
+ utilruntime.Must(monitoringv1.Install(Scheme))
}
diff --git a/test/e2e/fixture/scylla/registry.go b/test/e2e/fixture/scylla/registry.go
index 24de5e4a1ba..6b618e8db9e 100644
--- a/test/e2e/fixture/scylla/registry.go
+++ b/test/e2e/fixture/scylla/registry.go
@@ -6,15 +6,25 @@ import (
o "github.com/onsi/gomega"
scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1"
scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
+ "github.com/scylladb/scylla-operator/pkg/assets"
"github.com/scylladb/scylla-operator/test/e2e/scheme"
+ "k8s.io/apimachinery/pkg/runtime"
)
+// ParseObjectTemplateOrDie parses tmplString into an object template named
+// name by calling assets.ParseObjectTemplateOrDie with assets.TemplateFuncs
+// and the e2e scheme's universal deserializer.
+// NOTE(review): judging by its name, the assets helper aborts on a parse
+// failure instead of returning an error - confirm in pkg/assets.
+func ParseObjectTemplateOrDie[T runtime.Object](name, tmplString string) assets.ObjectTemplate[T] {
+	decoder := scheme.Codecs.UniversalDeserializer()
+	return assets.ParseObjectTemplateOrDie[T](name, tmplString, assets.TemplateFuncs, decoder)
+}
+
var (
//go:embed "basic.scyllacluster.yaml"
BasicScyllaCluster ScyllaClusterBytes
//go:embed "nodeconfig.yaml"
NodeConfig NodeConfigBytes
+
+ // scyllaDBMonitoringTemplateString holds the raw text of the embedded
+ // ScyllaDBMonitoring template file.
+ //go:embed "scylladbmonitoring.yaml.tmpl"
+ scyllaDBMonitoringTemplateString string
+ // ScyllaDBMonitoringTemplate is the object template parsed from the embedded
+ // text at package initialization.
+ ScyllaDBMonitoringTemplate = ParseObjectTemplateOrDie[*scyllav1alpha1.ScyllaDBMonitoring]("scylladbmonitoring", scyllaDBMonitoringTemplateString)
)
type ScyllaClusterBytes []byte
diff --git a/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl b/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl
new file mode 100644
index 00000000000..9a33d85b70d
--- /dev/null
+++ b/test/e2e/fixture/scylla/scylladbmonitoring.yaml.tmpl
@@ -0,0 +1,35 @@
+# ScyllaDBMonitoring e2e fixture. Rendered as a Go template with the values
+# .name, .namespace and .scyllaClusterName.
+# NOTE(review): nesting was reconstructed from a flattened copy; the
+# exposeOptions paths match the fields the e2e test reads, the remaining
+# levels should be confirmed against the ScyllaDBMonitoring CRD.
+apiVersion: scylla.scylladb.com/v1alpha1
+kind: ScyllaDBMonitoring
+metadata:
+  name: "{{ .name }}"
+spec:
+  endpointsSelector:
+    matchLabels:
+      app.kubernetes.io/name: scylla
+      scylla-operator.scylladb.com/scylla-service-type: identity
+      scylla/cluster: "{{ .scyllaClusterName }}"
+  components:
+    prometheus:
+      exposeOptions:
+        webInterface:
+          ingress:
+            ingressClassName: haproxy
+            dnsDomains:
+              - "{{ .name }}-prometheus.{{ .namespace }}.apps.cluster.scylladb.com"
+            annotations:
+              haproxy-ingress.github.io/ssl-passthrough: "true"
+      storage:
+        volumeClaimTemplate:
+          spec:
+            resources:
+              requests:
+                storage: 1Gi
+    grafana:
+      exposeOptions:
+        webInterface:
+          ingress:
+            ingressClassName: haproxy
+            dnsDomains:
+              - "{{ .name }}-grafana.{{ .namespace }}.apps.cluster.scylladb.com"
+            annotations:
+              haproxy-ingress.github.io/ssl-passthrough: "true"
diff --git a/test/e2e/include.go b/test/e2e/include.go
index fd455857bf5..c57bae5ed30 100644
--- a/test/e2e/include.go
+++ b/test/e2e/include.go
@@ -5,4 +5,5 @@ package e2e
import (
_ "github.com/scylladb/scylla-operator/test/e2e/set/nodeconfig"
_ "github.com/scylladb/scylla-operator/test/e2e/set/scyllacluster"
+ _ "github.com/scylladb/scylla-operator/test/e2e/set/scylladbmonitoring"
)
diff --git a/test/e2e/set/scyllacluster/scyllacluster_tls.go b/test/e2e/set/scyllacluster/scyllacluster_tls.go
index 0087ba40536..f5ec7871491 100644
--- a/test/e2e/set/scyllacluster/scyllacluster_tls.go
+++ b/test/e2e/set/scyllacluster/scyllacluster_tls.go
@@ -23,6 +23,7 @@ import (
"github.com/scylladb/scylla-operator/test/e2e/framework"
"github.com/scylladb/scylla-operator/test/e2e/scheme"
"github.com/scylladb/scylla-operator/test/e2e/utils"
+ "github.com/scylladb/scylla-operator/test/e2e/verification"
corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -128,36 +129,36 @@ var _ = g.Describe("ScyllaCluster", func() {
clientCASecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-client-ca", sc.Name), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
- clientCACerts, _, _, _ := verifyAndParseTLSCert(clientCASecret, verifyTLSCertOptions{
- isCA: pointer.Bool(true),
- keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign),
+ clientCACerts, _, _, _ := verification.VerifyAndParseTLSCert(clientCASecret, verification.TLSCertOptions{
+ IsCA: pointer.Bool(true),
+ KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign),
})
o.Expect(clientCACerts).To(o.HaveLen(1))
servingCASecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-ca", sc.Name), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
- _, _, _, _ = verifyAndParseTLSCert(servingCASecret, verifyTLSCertOptions{
- isCA: pointer.Bool(true),
- keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign),
+ _, _, _, _ = verification.VerifyAndParseTLSCert(servingCASecret, verification.TLSCertOptions{
+ IsCA: pointer.Bool(true),
+ KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign),
})
servingCABundleConfigMap, err := f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-ca", sc.Name), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
- servingCACerts, servingCACertBytes := verifyAndParseCABundle(servingCABundleConfigMap)
+ servingCACerts, servingCACertBytes := verification.VerifyAndParseCABundle(servingCABundleConfigMap)
o.Expect(servingCACerts).To(o.HaveLen(1))
servingCertSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-serving-certs", sc.Name), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
- servingCerts, _, _, _ := verifyAndParseTLSCert(servingCertSecret, verifyTLSCertOptions{
- isCA: pointer.Bool(false),
- keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature),
+ servingCerts, _, _, _ := verification.VerifyAndParseTLSCert(servingCertSecret, verification.TLSCertOptions{
+ IsCA: pointer.Bool(false),
+ KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature),
})
adminClientSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-user-admin", sc.Name), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
- _, adminClientCertBytes, _, adminClientKeyBytes := verifyAndParseTLSCert(adminClientSecret, verifyTLSCertOptions{
- isCA: pointer.Bool(false),
- keyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature),
+ _, adminClientCertBytes, _, adminClientKeyBytes := verification.VerifyAndParseTLSCert(adminClientSecret, verification.TLSCertOptions{
+ IsCA: pointer.Bool(false),
+ KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature),
})
adminClientConnectionConfigsSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-local-cql-connection-configs-admin", sc.Name), metav1.GetOptions{})
diff --git a/test/e2e/set/scyllacluster/verify.go b/test/e2e/set/scyllacluster/verify.go
index 78ab5e9d17f..a6fe8437099 100644
--- a/test/e2e/set/scyllacluster/verify.go
+++ b/test/e2e/set/scyllacluster/verify.go
@@ -2,14 +2,11 @@ package scyllacluster
import (
"context"
- "crypto"
- "crypto/x509"
"sort"
"strings"
o "github.com/onsi/gomega"
scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1"
- ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
"github.com/scylladb/scylla-operator/pkg/features"
"github.com/scylladb/scylla-operator/pkg/naming"
cqlclientv1alpha1 "github.com/scylladb/scylla-operator/pkg/scylla/api/cqlclient/v1alpha1"
@@ -286,45 +283,6 @@ func insertAndVerifyCQLData(ctx context.Context, hosts []string) *utils.DataInse
return di
}
-type verifyTLSCertOptions struct {
- isCA *bool
- keyUsage *x509.KeyUsage
-}
-
-func verifyAndParseTLSCert(secret *corev1.Secret, options verifyTLSCertOptions) ([]*x509.Certificate, []byte, crypto.PrivateKey, []byte) {
- o.Expect(secret.Type).To(o.Equal(corev1.SecretType("kubernetes.io/tls")))
- o.Expect(secret.Data).To(o.HaveKey("tls.crt"))
- o.Expect(secret.Data).To(o.HaveKey("tls.key"))
-
- certsBytes := secret.Data["tls.crt"]
- keyBytes := secret.Data["tls.key"]
- o.Expect(certsBytes).NotTo(o.BeEmpty())
- o.Expect(keyBytes).NotTo(o.BeEmpty())
-
- certs, key, err := ocrypto.GetTLSCertificatesFromBytes(certsBytes, keyBytes)
- o.Expect(err).NotTo(o.HaveOccurred())
-
- o.Expect(certs).NotTo(o.BeEmpty())
- o.Expect(certs[0].IsCA).To(o.Equal(*options.isCA))
- o.Expect(certs[0].KeyUsage).To(o.Equal(*options.keyUsage))
-
- o.Expect(key.Validate()).To(o.Succeed())
-
- return certs, certsBytes, key, keyBytes
-}
-
-func verifyAndParseCABundle(cm *corev1.ConfigMap) ([]*x509.Certificate, []byte) {
- o.Expect(cm.Data).To(o.HaveKey("ca-bundle.crt"))
-
- bundleBytes := cm.Data["ca-bundle.crt"]
- o.Expect(bundleBytes).NotTo(o.BeEmpty())
-
- certs, err := ocrypto.DecodeCertificates([]byte(bundleBytes))
- o.Expect(err).NotTo(o.HaveOccurred())
-
- return certs, []byte(bundleBytes)
-}
-
type verifyCQLConnectionConfigsOptions struct {
domains []string
datacenters []string
diff --git a/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go b/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go
new file mode 100644
index 00000000000..48bc61ff071
--- /dev/null
+++ b/test/e2e/set/scylladbmonitoring/scylladbmonitoring.go
@@ -0,0 +1,264 @@
+// Copyright (C) 2022 ScyllaDB
+
+package scylladbmonitoring
+
+import (
+ "context"
+ "crypto/tls"
+ "crypto/x509"
+ "fmt"
+ "net"
+ "net/http"
+ "net/url"
+ "time"
+
+ gapi "github.com/grafana/grafana-api-golang-client"
+ g "github.com/onsi/ginkgo/v2"
+ o "github.com/onsi/gomega"
+ prometheusappclient "github.com/prometheus/client_golang/api"
+ promeheusappv1api "github.com/prometheus/client_golang/api/prometheus/v1"
+ opointer "github.com/scylladb/scylla-operator/pkg/pointer"
+ scyllafixture "github.com/scylladb/scylla-operator/test/e2e/fixture/scylla"
+ "github.com/scylladb/scylla-operator/test/e2e/framework"
+ "github.com/scylladb/scylla-operator/test/e2e/utils"
+ "github.com/scylladb/scylla-operator/test/e2e/verification"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/utils/pointer"
+)
+
+// End-to-end spec for ScyllaDBMonitoring: it creates a single-node
+// ScyllaCluster and a ScyllaDBMonitoring from the fixture template, waits for
+// both to roll out and then checks, through the ingress, that Prometheus
+// serves healthy targets and rules and that Grafana lists the expected
+// dashboards.
+var _ = g.Describe("ScyllaDBMonitoring", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewFramework("scylladbmonitoring")
+
+	// newHTTPTransport builds the HTTP transport shared by the Prometheus and
+	// Grafana clients; only the TLS configuration differs between the two.
+	newHTTPTransport := func(tlsConfig *tls.Config) *http.Transport {
+		return &http.Transport{
+			TLSClientConfig: tlsConfig,
+			Proxy:           http.ProxyFromEnvironment,
+			DialContext: (&net.Dialer{
+				Timeout:   30 * time.Second,
+				KeepAlive: 30 * time.Second,
+			}).DialContext,
+			ForceAttemptHTTP2:     true,
+			MaxIdleConns:          100,
+			IdleConnTimeout:       90 * time.Second,
+			TLSHandshakeTimeout:   10 * time.Second,
+			ExpectContinueTimeout: 1 * time.Second,
+		}
+	}
+
+	g.It("should setup monitoring stack", func() {
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
+		defer cancel()
+
+		sc := scyllafixture.BasicScyllaCluster.ReadOrFail()
+		o.Expect(sc.Spec.Datacenter.Racks).To(o.HaveLen(1))
+		sc.Spec.Datacenter.Racks[0].Members = 1
+
+		framework.By("Creating a ScyllaCluster with a single node")
+		sc, err := f.ScyllaClient().ScyllaV1().ScyllaClusters(f.Namespace()).Create(
+			ctx,
+			sc,
+			metav1.CreateOptions{
+				FieldManager:    f.FieldManager(),
+				FieldValidation: metav1.FieldValidationStrict,
+			},
+		)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		framework.By("Creating a ScyllaDBMonitoring")
+		sm, _, err := scyllafixture.ScyllaDBMonitoringTemplate.RenderObject(map[string]string{
+			"name":              sc.Name,
+			"namespace":         sc.Namespace,
+			"scyllaClusterName": sc.Name,
+		})
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		sm, err = f.ScyllaClient().ScyllaV1alpha1().ScyllaDBMonitorings(sc.Namespace).Create(
+			ctx,
+			sm,
+			metav1.CreateOptions{
+				FieldManager:    f.FieldManager(),
+				FieldValidation: metav1.FieldValidationStrict,
+			},
+		)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		framework.By("Waiting for the ScyllaCluster to rollout (RV=%s)", sc.ResourceVersion)
+		waitCtx1, waitCtx1Cancel := utils.ContextForRollout(ctx, sc)
+		defer waitCtx1Cancel()
+		sc, err = utils.WaitForScyllaClusterState(waitCtx1, f.ScyllaClient().ScyllaV1(), sc.Namespace, sc.Name, utils.WaitForStateOptions{}, utils.IsScyllaClusterRolledOut)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		framework.By("Waiting for the ScyllaDBMonitoring to rollout (RV=%s)", sm.ResourceVersion)
+		waitCtx2, waitCtx2Cancel := context.WithTimeout(ctx, 5*time.Minute)
+		defer waitCtx2Cancel()
+		// Objects owned by the ScyllaDBMonitoring are addressed through sm, not sc.
+		sm, err = utils.WaitForScyllaDBMonitoringState(waitCtx2, f.ScyllaClient().ScyllaV1alpha1().ScyllaDBMonitorings(sm.Namespace), sm.Name, utils.WaitForStateOptions{}, utils.IsScyllaDBMonitoringRolledOut)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		// We need to retry the prometheus and grafana assertion for several reasons, some of them are:
+		//  - ingress exposure is asynchronous and some controllers don't report back status to wait for
+		//  - prometheus configuration is asynchronous without any acknowledgement
+		//  - grafana configuration is asynchronous without any acknowledgement
+		// Some of these may be fixable by manually verifying it in the operator sync loop so it can also be
+		// consumed by clients, but it's a bigger effort.
+
+		framework.By("Verifying that Prometheus is configured correctly")
+
+		prometheusServingCABundleConfigMap, err := f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-prometheus-serving-ca", sm.Name), metav1.GetOptions{})
+		o.Expect(err).NotTo(o.HaveOccurred())
+		prometheusServingCACerts, _ := verification.VerifyAndParseCABundle(prometheusServingCABundleConfigMap)
+		o.Expect(prometheusServingCACerts).To(o.HaveLen(1))
+
+		prometheusServingCAPool := x509.NewCertPool()
+		prometheusServingCAPool.AddCert(prometheusServingCACerts[0])
+
+		prometheusGrafanaClientSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-prometheus-client-grafana", sm.Name), metav1.GetOptions{})
+		o.Expect(err).NotTo(o.HaveOccurred())
+		_, prometheusGrafanaClientCertBytes, _, prometheusGrafanaClientKeyBytes := verification.VerifyAndParseTLSCert(prometheusGrafanaClientSecret, verification.TLSCertOptions{
+			IsCA:     pointer.Bool(false),
+			KeyUsage: opointer.KeyUsage(x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature),
+		})
+
+		prometheusGrafanaAdminTLSCert, err := tls.X509KeyPair(prometheusGrafanaClientCertBytes, prometheusGrafanaClientKeyBytes)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		o.Expect(sm.Spec.Components.Prometheus.ExposeOptions.WebInterface.Ingress.DNSDomains).To(o.HaveLen(1))
+		prometheusServerName := sm.Spec.Components.Prometheus.ExposeOptions.WebInterface.Ingress.DNSDomains[0]
+
+		promHTTPClient, err := prometheusappclient.NewClient(prometheusappclient.Config{
+			Address: "https://" + f.GetIngressAddress(prometheusServerName),
+			Client: &http.Client{
+				Transport: newHTTPTransport(&tls.Config{
+					ServerName:   prometheusServerName,
+					Certificates: []tls.Certificate{prometheusGrafanaAdminTLSCert},
+					RootCAs:      prometheusServingCAPool,
+				}),
+			},
+		})
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		promClient := promeheusappv1api.NewAPI(promHTTPClient)
+
+		// Every assertion inside the polled function must go through eo: a failed
+		// eo.Expect only fails the current attempt so Eventually can retry, whereas
+		// the global o.Expect would fail the whole spec on the first attempt.
+		o.Eventually(func(eo o.Gomega) {
+			ctxQuery, ctxQueryCancel := context.WithTimeout(ctx, 15*time.Second)
+			defer ctxQueryCancel()
+
+			targets, err := promClient.Targets(ctxQuery)
+			framework.Infof("Listing prometheus targets: err: %v, active: %d, dropped: %d", err, len(targets.Active), len(targets.Dropped))
+			eo.Expect(err).NotTo(o.HaveOccurred())
+
+			// This should match the number of rules in service monitors used. We can possibly extend this to compare those
+			// or wait to be able to assess that dropped targets are empty.
+			eo.Expect(targets.Active).To(o.HaveLen(2))
+			for _, t := range targets.Active {
+				eo.Expect(t.Health).To(o.Equal(promeheusappv1api.HealthGood))
+			}
+
+			// TODO: There shouldn't be any dropped targets. Currently, /service-discovery contains
+			//       "undefined (0 / 54 active targets)" that are in addition to our ServiceMonitor definition.
+			//       (Maciek was looking into this, it seems to be a bug in prometheus operator.)
+			// eo.Expect(targets.Dropped).To(o.HaveLen(0))
+
+			rulesResult, err := promClient.Rules(ctxQuery)
+			framework.Infof("Listing prometheus rules: err: %v, groupCount: %d", err, len(rulesResult.Groups))
+			eo.Expect(err).NotTo(o.HaveOccurred())
+
+			eo.Expect(rulesResult.Groups).NotTo(o.HaveLen(0))
+			eo.Expect(rulesResult.Groups[0].Name).To(o.Equal("scylla.rules"))
+			eo.Expect(rulesResult.Groups[0].Rules).NotTo(o.BeEmpty())
+			for _, rule := range rulesResult.Groups[0].Rules {
+				switch r := rule.(type) {
+				case promeheusappv1api.AlertingRule:
+					eo.Expect(r.Health).To(o.BeEquivalentTo(promeheusappv1api.RuleHealthGood))
+
+				case promeheusappv1api.RecordingRule:
+					eo.Expect(r.Health).To(o.BeEquivalentTo(promeheusappv1api.RuleHealthGood))
+
+				default:
+					// %T prints the dynamic type of the unexpected rule.
+					eo.Expect(fmt.Errorf("unexpected rule type %T", rule)).NotTo(o.HaveOccurred())
+				}
+			}
+
+		}).WithTimeout(5 * time.Minute).WithPolling(1 * time.Second).Should(o.Succeed())
+
+		framework.By("Verifying that Grafana is configured correctly")
+
+		grafanaAdminCredentialsSecret, err := f.KubeClient().CoreV1().Secrets(f.Namespace()).Get(ctx, fmt.Sprintf("%s-grafana-admin-credentials", sm.Name), metav1.GetOptions{})
+		o.Expect(err).NotTo(o.HaveOccurred())
+		o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveLen(2))
+		o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveKey("username"))
+		o.Expect(grafanaAdminCredentialsSecret.Data).To(o.HaveKey("password"))
+
+		grafanaUsername := string(grafanaAdminCredentialsSecret.Data["username"])
+		o.Expect(grafanaUsername).NotTo(o.BeEmpty())
+		grafanaPassword := string(grafanaAdminCredentialsSecret.Data["password"])
+		o.Expect(grafanaPassword).NotTo(o.BeEmpty())
+
+		grafanaServingCABundleConfigMap, err := f.KubeClient().CoreV1().ConfigMaps(f.Namespace()).Get(ctx, fmt.Sprintf("%s-grafana-serving-ca", sm.Name), metav1.GetOptions{})
+		o.Expect(err).NotTo(o.HaveOccurred())
+		grafanaServingCACerts, _ := verification.VerifyAndParseCABundle(grafanaServingCABundleConfigMap)
+		o.Expect(grafanaServingCACerts).To(o.HaveLen(1))
+
+		grafanaServingCAPool := x509.NewCertPool()
+		grafanaServingCAPool.AddCert(grafanaServingCACerts[0])
+
+		o.Expect(sm.Spec.Components.Grafana.ExposeOptions.WebInterface.Ingress.DNSDomains).To(o.HaveLen(1))
+		grafanaServerName := sm.Spec.Components.Grafana.ExposeOptions.WebInterface.Ingress.DNSDomains[0]
+
+		grafanaClient, err := gapi.New(
+			"https://"+f.GetIngressAddress(grafanaServerName),
+			gapi.Config{
+				BasicAuth: url.UserPassword(grafanaUsername, grafanaPassword),
+				Client: &http.Client{
+					Transport: newHTTPTransport(&tls.Config{
+						ServerName: grafanaServerName,
+						RootCAs:    grafanaServingCAPool,
+					}),
+					Timeout: 15 * time.Second,
+				},
+			},
+		)
+		o.Expect(err).NotTo(o.HaveOccurred())
+
+		expectedDashboards := []gapi.FolderDashboardSearchResponse{
+			{
+				ID:          2,
+				Title:       "CQL Overview",
+				URI:         "db/cql-overview",
+				Slug:        "",
+				Type:        "dash-db",
+				Tags:        []string{},
+				IsStarred:   false,
+				FolderID:    1,
+				FolderTitle: "scylladb",
+			},
+		}
+
+		var dashboards []gapi.FolderDashboardSearchResponse
+		o.Eventually(func(eo o.Gomega) {
+			dashboards, err = grafanaClient.Dashboards()
+			framework.Infof("Listing grafana dashboards: err: %v, count: %d", err, len(dashboards))
+			eo.Expect(err).NotTo(o.HaveOccurred())
+			eo.Expect(dashboards).To(o.HaveLen(len(expectedDashboards)))
+		}).WithTimeout(10 * time.Minute).WithPolling(1 * time.Second).Should(o.Succeed())
+
+		// Clear random fields for comparison.
+		for i := range dashboards {
+			d := &dashboards[i]
+			d.UID = ""
+			d.URL = ""
+			d.FolderUID = ""
+			d.FolderURL = ""
+		}
+		o.Expect(dashboards).To(o.Equal(expectedDashboards))
+	})
+})
diff --git a/test/e2e/utils/helpers.go b/test/e2e/utils/helpers.go
index df5ac4fddc2..8c4dbec4fb5 100644
--- a/test/e2e/utils/helpers.go
+++ b/test/e2e/utils/helpers.go
@@ -130,6 +130,24 @@ func IsScyllaClusterRolledOut(sc *scyllav1.ScyllaCluster) (bool, error) {
return true, nil
}
+// IsScyllaDBMonitoringRolledOut reports whether sm counts as rolled out.
+//
+// Three status-condition helpers are consulted, each given sm.Status.Conditions
+// and sm.Generation: AvailableCondition via IsStatusConditionPresentAndTrue,
+// and ProgressingCondition and DegradedCondition via
+// IsStatusConditionPresentAndFalse. All three must succeed for the result to
+// be true, in which case the fact is logged. The returned error is always nil.
+func IsScyllaDBMonitoringRolledOut(sm *scyllav1alpha1.ScyllaDBMonitoring) (bool, error) {
+	conditions, generation := sm.Status.Conditions, sm.Generation
+
+	// && short-circuits: the helpers are evaluated in order and evaluation
+	// stops at the first one that reports false.
+	rolledOut := helpers.IsStatusConditionPresentAndTrue(conditions, scyllav1alpha1.AvailableCondition, generation) &&
+		helpers.IsStatusConditionPresentAndFalse(conditions, scyllav1alpha1.ProgressingCondition, generation) &&
+		helpers.IsStatusConditionPresentAndFalse(conditions, scyllav1alpha1.DegradedCondition, generation)
+	if !rolledOut {
+		return false, nil
+	}
+
+	framework.Infof("ScyllaDBMonitoring %s (RV=%s) is rolled out", klog.KObj(sm), sm.ResourceVersion)
+
+	return true, nil
+}
+
type listerWatcher[ListObject runtime.Object] interface {
List(context.Context, metav1.ListOptions) (ListObject, error)
Watch(context.Context, metav1.ListOptions) (watch.Interface, error)
@@ -200,6 +218,10 @@ func WaitForScyllaClusterState(ctx context.Context, client scyllav1client.Scylla
return WaitForObjectState[*scyllav1.ScyllaCluster, *scyllav1.ScyllaClusterList](ctx, client.ScyllaClusters(namespace), name, options, condition, additionalConditions...)
}
+// WaitForScyllaDBMonitoringState is the ScyllaDBMonitoring instantiation of
+// WaitForObjectState: it forwards ctx, client, name, options, condition and
+// additionalConditions unchanged and returns whatever WaitForObjectState
+// returns.
+func WaitForScyllaDBMonitoringState(
+	ctx context.Context,
+	client scyllav1alpha1client.ScyllaDBMonitoringInterface,
+	name string,
+	options WaitForStateOptions,
+	condition func(monitoring *scyllav1alpha1.ScyllaDBMonitoring) (bool, error),
+	additionalConditions ...func(monitoring *scyllav1alpha1.ScyllaDBMonitoring) (bool, error),
+) (*scyllav1alpha1.ScyllaDBMonitoring, error) {
+	return WaitForObjectState[*scyllav1alpha1.ScyllaDBMonitoring, *scyllav1alpha1.ScyllaDBMonitoringList](
+		ctx,
+		client,
+		name,
+		options,
+		condition,
+		additionalConditions...,
+	)
+}
+
func WaitForPodState(ctx context.Context, client corev1client.PodInterface, name string, options WaitForStateOptions, condition func(*corev1.Pod) (bool, error), additionalConditions ...func(*corev1.Pod) (bool, error)) (*corev1.Pod, error) {
return WaitForObjectState[*corev1.Pod, *corev1.PodList](ctx, client, name, options, condition, additionalConditions...)
}
diff --git a/test/e2e/verification/certs.go b/test/e2e/verification/certs.go
new file mode 100644
index 00000000000..5df9ad35c03
--- /dev/null
+++ b/test/e2e/verification/certs.go
@@ -0,0 +1,49 @@
+package verification
+
+import (
+ "crypto"
+ "crypto/x509"
+
+ o "github.com/onsi/gomega"
+ ocrypto "github.com/scylladb/scylla-operator/pkg/crypto"
+ corev1 "k8s.io/api/core/v1"
+)
+
+// TLSCertOptions describes the properties VerifyAndParseTLSCert expects of the
+// first certificate stored in a TLS Secret. Every field is optional: a nil
+// field means that property is not checked.
+type TLSCertOptions struct {
+	// IsCA, when set, is the expected value of the certificate's IsCA field.
+	IsCA *bool
+	// KeyUsage, when set, is the expected value of the certificate's KeyUsage field.
+	KeyUsage *x509.KeyUsage
+}
+
+// VerifyAndParseTLSCert asserts that secret is a well-formed
+// "kubernetes.io/tls" Secret and returns its parsed contents.
+//
+// It expects non-empty "tls.crt" and "tls.key" entries that
+// ocrypto.GetTLSCertificatesFromBytes can parse, at least one certificate, and
+// a private key whose Validate call succeeds. The first certificate is
+// additionally compared against the fields of options that are set; nil fields
+// are skipped rather than dereferenced.
+//
+// All checks go through Gomega's package-level Expect, so a violated
+// expectation is reported via Gomega instead of being returned as an error.
+//
+// The return values are, in order: the parsed certificates, the raw "tls.crt"
+// bytes, the parsed private key and the raw "tls.key" bytes.
+func VerifyAndParseTLSCert(secret *corev1.Secret, options TLSCertOptions) ([]*x509.Certificate, []byte, crypto.PrivateKey, []byte) {
+	o.Expect(secret.Type).To(o.Equal(corev1.SecretType("kubernetes.io/tls")))
+	o.Expect(secret.Data).To(o.HaveKey("tls.crt"))
+	o.Expect(secret.Data).To(o.HaveKey("tls.key"))
+
+	certsBytes := secret.Data["tls.crt"]
+	keyBytes := secret.Data["tls.key"]
+	o.Expect(certsBytes).NotTo(o.BeEmpty())
+	o.Expect(keyBytes).NotTo(o.BeEmpty())
+
+	certs, key, err := ocrypto.GetTLSCertificatesFromBytes(certsBytes, keyBytes)
+	o.Expect(err).NotTo(o.HaveOccurred())
+
+	// Guard the certs[0] accesses below.
+	o.Expect(certs).NotTo(o.BeEmpty())
+
+	// The expectations are optional pointers; dereference only the ones the
+	// caller actually set so an unset field cannot trigger a nil-pointer panic.
+	if options.IsCA != nil {
+		o.Expect(certs[0].IsCA).To(o.Equal(*options.IsCA))
+	}
+	if options.KeyUsage != nil {
+		o.Expect(certs[0].KeyUsage).To(o.Equal(*options.KeyUsage))
+	}
+
+	o.Expect(key.Validate()).To(o.Succeed())
+
+	return certs, certsBytes, key, keyBytes
+}
+
+func VerifyAndParseCABundle(cm *corev1.ConfigMap) ([]*x509.Certificate, []byte) {
+ o.Expect(cm.Data).To(o.HaveKey("ca-bundle.crt"))
+
+ bundleBytes := cm.Data["ca-bundle.crt"]
+ o.Expect(bundleBytes).NotTo(o.BeEmpty())
+
+ certs, err := ocrypto.DecodeCertificates([]byte(bundleBytes))
+ o.Expect(err).NotTo(o.HaveOccurred())
+
+ return certs, []byte(bundleBytes)
+}