diff --git a/docs/node-mixin/Makefile b/docs/node-mixin/Makefile index d04b37d009..d1775a9828 100644 --- a/docs/node-mixin/Makefile +++ b/docs/node-mixin/Makefile @@ -6,15 +6,15 @@ fmt: find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNET_FMT) -i -node_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*) - jsonnet -S alerts.jsonnet > $@ +node_alerts.yaml: mixin.libsonnet lib/linux/config.libsonnet $(wildcard lib/linux/alerts/*) + jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > $@ -node_rules.yaml: mixin.libsonnet config.libsonnet $(wildcard rules/*) - jsonnet -S rules.jsonnet > $@ +node_rules.yaml: mixin.libsonnet lib/linux/config.libsonnet $(wildcard lib/linux/rules/*) + jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' > $@ -dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) - @mkdir -p dashboards_out - jsonnet -J vendor -m dashboards_out dashboards.jsonnet +dashboards_out: mixin.libsonnet lib/linux/config.libsonnet lib/linux/dashboards.libsonnet $(wildcard lib/linux/*) + @mkdir -p dashboards_out/linux + jsonnet -J vendor -m dashboards_out/linux -e '(import "mixin.libsonnet").grafanaDashboards' lint: node_alerts.yaml node_rules.yaml find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ diff --git a/docs/node-mixin/alerts.jsonnet b/docs/node-mixin/alerts.jsonnet deleted file mode 100644 index 75e7c1b297..0000000000 --- a/docs/node-mixin/alerts.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet deleted file mode 100644 index 67e71d140b..0000000000 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ /dev/null @@ -1,419 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'node-exporter-filesystem', - rules: [ - { - alert: 'NodeFilesystemSpaceFillingUp', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d - and - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem is predicted to run out of space within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', - }, - }, - { - alert: 'NodeFilesystemSpaceFillingUp', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d - and - 
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem is predicted to run out of space within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfSpace', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '30m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' 
% $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfSpace', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '30m', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', - }, - }, - { - alert: 'NodeFilesystemFilesFillingUp', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 40 - and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', - }, - }, - { - alert: 'NodeFilesystemFilesFillingUp', - expr: ||| - ( - 
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 20 - and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfFiles', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 5 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem has less than 5% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfFiles', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 3 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' 
% $._config, - }, - annotations: { - summary: 'Filesystem has less than 3% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', - }, - }, - ], - }, - { - name: 'node-exporter', - rules: [ - { - alert: 'NodeNetworkReceiveErrs', - expr: ||| - rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many receive errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', - }, - }, - { - alert: 'NodeNetworkTransmitErrs', - expr: ||| - rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many transmit errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.', - }, - }, - { - alert: 'NodeHighNumberConntrackEntriesUsed', - expr: ||| - (node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75 - ||| % $._config, - annotations: { - summary: 'Number of conntrack are getting close to the limit.', - description: '{{ $value | humanizePercentage }} of conntrack entries are used.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeTextFileCollectorScrapeError', - expr: ||| - node_textfile_scrape_error{%(nodeExporterSelector)s} == 1 - ||| % $._config, - annotations: { - summary: 'Node Exporter text file collector failed to scrape.', - 
description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeClockSkewDetected', - expr: ||| - ( - node_timex_offset_seconds{%(nodeExporterSelector)s} > 0.05 - and - deriv(node_timex_offset_seconds{%(nodeExporterSelector)s}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{%(nodeExporterSelector)s} < -0.05 - and - deriv(node_timex_offset_seconds{%(nodeExporterSelector)s}[5m]) <= 0 - ) - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock skew detected.', - description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', - }, - }, - { - alert: 'NodeClockNotSynchronising', - expr: ||| - min_over_time(node_timex_sync_status{%(nodeExporterSelector)s}[5m]) == 0 - and - node_timex_maxerror_seconds{%(nodeExporterSelector)s} >= 16 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock not synchronising.', - description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', - }, - }, - { - alert: 'NodeRAIDDegraded', - expr: ||| - node_md_disks_required{%(nodeExporterSelector)s,%(diskDeviceSelector)s} - ignoring (state) (node_md_disks{state="active",%(nodeExporterSelector)s,%(diskDeviceSelector)s}) > 0 - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'RAID Array is degraded.', - description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. 
Number of spare drives is insufficient to fix issue automatically.", - }, - }, - { - alert: 'NodeRAIDDiskFailure', - expr: ||| - node_md_disks{state="failed",%(nodeExporterSelector)s,%(diskDeviceSelector)s} > 0 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Failed device in RAID array.', - description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(nodeExporterSelector)s} * 100 / node_filefd_maximum{%(nodeExporterSelector)s} > 70 - ) - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(nodeExporterSelector)s} * 100 / node_filefd_maximum{%(nodeExporterSelector)s} > 90 - ) - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeCPUHighUsage', - expr: ||| - sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'info', - }, - annotations: { - summary: 'High CPU usage.', - description: ||| - CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
- ||| % $._config, - }, - }, - { - alert: 'NodeSystemSaturation', - expr: ||| - node_load1{%(nodeExporterSelector)s} - / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'System saturated, load per core is very high.', - description: ||| - System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This might indicate this instance resources saturation and can cause it becoming unresponsive. - ||| % $._config, - }, - }, - { - alert: 'NodeMemoryMajorPagesFaults', - expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Memory major page faults are occurring at very high rate.', - description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - Please check that there is enough memory available at this instance. - ||| % $._config, - }, - }, - { - alert: 'NodeMemoryHighUtilization', - expr: ||| - 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Host is running out of memory.', - description: ||| - Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
- ||| % $._config, - }, - }, - { - alert: 'NodeDiskIOSaturation', - expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d - ||| % $._config, - 'for': '30m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Disk IO queue is high.', - description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This symptom might indicate disk saturation. - ||| % $._config, - }, - }, - { - alert: 'NodeSystemdServiceFailed', - expr: ||| - node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Systemd service has entered failed state.', - description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', - }, - }, - ], - }, - ], - }, -} diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet deleted file mode 100644 index 86f874c257..0000000000 --- a/docs/node-mixin/config.libsonnet +++ /dev/null @@ -1,117 +0,0 @@ -{ - _config+:: { - // Selectors are inserted between {} in Prometheus queries. - - // Select the metrics coming from the node exporter. Note that all - // the selected metrics are shown stacked on top of each other in - // the 'USE Method / Cluster' dashboard. Consider disabling that - // dashboard if mixing up all those metrics in the same dashboard - // doesn't make sense (e.g. because they are coming from different - // clusters). - nodeExporterSelector: 'job="node"', - - // Select the fstype for filesystem-related queries. If left - // empty, all filesystems are selected. If you have unusual - // filesystem you don't want to include in dashboards and - // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. 
- fsSelector: 'fstype!=""', - - // Select the mountpoint for filesystem-related queries. If left - // empty, all mountpoints are selected. For example if you have a - // special purpose tmpfs instance that has a fixed size and will - // always be 100% full, but you still want alerts and dashboards for - // other tmpfs instances, you can exclude those by mountpoint prefix - // like so: 'mountpoint!~"/var/lib/foo.*"'. - fsMountpointSelector: 'mountpoint!=""', - - // Select the device for disk-related queries. If left empty, all - // devices are selected. If you have unusual devices you don't - // want to include in dashboards and alerting, you can exclude - // them here, e.g. 'device!="tmpfs"'. - diskDeviceSelector: 'device!=""', - - // Some of the alerts are meant to fire if a critical failure of a - // node is imminent (e.g. the disk is about to run full). In a - // true “cloud native” setup, failures of a single node should be - // tolerated. Hence, even imminent failure of a single node is no - // reason to create a paging alert. However, in practice there are - // still many situations where operators like to get paged in time - // before a node runs out of disk space. nodeCriticalSeverity can - // be set to the desired severity for this kind of alerts. This - // can even be templated to depend on labels of the node, e.g. you - // could make this critical for traditional database masters but - // just a warning for K8s nodes. - nodeCriticalSeverity: 'critical', - - // CPU utilization (%) on which to trigger the - // 'NodeCPUHighUsage' alert. - cpuHighUsageThreshold: 90, - // Load average 1m (per core) on which to trigger the - // 'NodeSystemSaturation' alert. - systemSaturationPerCoreThreshold: 2, - - // Available disk space (%) thresholds on which to trigger the - // 'NodeFilesystemSpaceFillingUp' alerts. 
These alerts fire if the disk - // usage grows in a way that it is predicted to run out in 4h or 1d - // and if the provided thresholds have been reached right now. - // In some cases you'll want to adjust these, e.g. by default Kubernetes - // runs the image garbage collection when the disk usage reaches 85% - // of its available space. In that case, you'll want to reduce the - // critical threshold below to something like 14 or 15, otherwise - // the alert could fire under normal node usage. - fsSpaceFillingUpWarningThreshold: 40, - fsSpaceFillingUpCriticalThreshold: 20, - - // Available disk space (%) thresholds on which to trigger the - // 'NodeFilesystemAlmostOutOfSpace' alerts. - fsSpaceAvailableWarningThreshold: 5, - fsSpaceAvailableCriticalThreshold: 3, - - // Memory utilzation (%) level on which to trigger the - // 'NodeMemoryHighUtilization' alert. - memoryHighUtilizationThreshold: 90, - - // Threshold for the rate of memory major page faults to trigger - // 'NodeMemoryMajorPagesFaults' alert. - memoryMajorPagesFaultsThreshold: 500, - - // Disk IO queue level above which to trigger - // 'NodeDiskIOSaturation' alert. - diskIOSaturationThreshold: 10, - - rateInterval: '5m', - // Opt-in for multi-cluster support. - showMultiCluster: false, - - clusterLabel: 'cluster', - - // groupLabels is a string with comma-separated - // labels that are common labels of instances belonging to the - // same logical group. Include not only enough labels to - // identify cluster members, but also all common labels you want - // to keep for resulting cluster-level alerts. 
- groupLabels: 'job', - // commaSeparated list of labels identifying a single instance: - instanceLabels: 'instance', - - dashboardNamePrefix: 'Node Exporter / ', - dashboardTags: ['node-exporter-mixin'], - dashboardRefresh: '30s', - dashboardTimezone: 'utc', - dashboardInterval: 'now-2h', - - // Grafana dashboard IDs are necessary for stable links for dashboards - grafanaDashboardIDs: { - 'node-rsrc-use.json': 'node-rsrc-use', - 'node-cluster-rsrc-use.json': 'node-cluster-rsrc-use', - 'node-multicluster-rsrc-use.json': 'node-multicluster-rsrc-use', - 'nodes.json': 'nodes', - 'nodes-darwin.json': 'nodes-darwin', - 'nodes-system.json': 'node-system', - 'nodes-memory.json': 'node-memory', - 'nodes-network.json': 'node-network', - 'nodes-disk.json': 'node-disk', - 'nodes-fleet.json': 'node-fleet', - }, - }, -} diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet deleted file mode 100644 index 9d913ed3f1..0000000000 --- a/docs/node-mixin/dashboards.jsonnet +++ /dev/null @@ -1,6 +0,0 @@ -local dashboards = (import 'mixin.libsonnet').grafanaDashboards; - -{ - [name]: dashboards[name] - for name in std.objectFields(dashboards) -} diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index e6adbd4fa0..0000000000 --- a/docs/node-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'node.libsonnet') + -(import 'use.libsonnet') diff --git a/docs/node-mixin/dashboards/disk.libsonnet b/docs/node-mixin/dashboards/disk.libsonnet deleted file mode 100644 index 2f78c4da3e..0000000000 --- a/docs/node-mixin/dashboards/disk.libsonnet +++ /dev/null @@ -1,165 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = 
import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - // https://www.robustperception.io/filesystem-metrics-from-the-node-exporter/ - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local fsAvailable = - nodeTimeseries.new( - 'Filesystem Space Available', - description=||| - Filesystem space utilisation in bytes, by mountpoint. - ||| - ) - .withUnits('decbytes') - .withFillOpacity(5) - .addTarget(commonPromTarget( - expr=q.node_filesystem_avail_bytes, - legendFormat='{{ mountpoint }}', - )), - - local fsInodes = - nodeTimeseries.new( - 'Free inodes', - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', - ) - .withUnits('short') - .addTarget(commonPromTarget( - expr=q.node_filesystem_files_free, - legendFormat='{{ mountpoint }}' - )) - .addTarget(commonPromTarget( - expr=q.node_filesystem_files, - legendFormat='{{ mountpoint }}' - )), - local fsInodesTotal = - nodeTimeseries.new( - 'Total inodes', - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', - ) - .withUnits('short') - .addTarget(commonPromTarget( - expr=q.node_filesystem_files, - legendFormat='{{ mountpoint }}' - )), - local fsErrorsandRO = - nodeTimeseries.new('Filesystems with errors / read-only') - .withMax(1) - .addTarget(commonPromTarget( - expr=q.node_filesystem_readonly, - legendFormat='{{ mountpoint }}' - )) - .addTarget(commonPromTarget( - expr=q.node_filesystem_device_error, - legendFormat='{{ mountpoint }}' - )), - local fileDescriptors = - nodeTimeseries.new( - 'File Descriptors', - description=||| - File 
descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. - The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. - ||| - ) - .addTarget(commonPromTarget( - expr=q.process_max_fds, - legendFormat='Maximum open file descriptors', - )) - .addTarget(commonPromTarget( - expr=q.process_open_fds, - legendFormat='Open file descriptors', - )), - - local diskIOcompleted = - nodeTimeseries.new( - title='Disk IOps completed', - description='The number (after merges) of I/O requests completed per second for the device' - ) - .withUnits('iops') - .withNegativeYByRegex('reads') - .withAxisLabel('read(-) | write(+)') - .addTarget(commonPromTarget( - expr=q.node_disk_reads_completed_total, - legendFormat='{{device}} reads completed', - )) - .addTarget(commonPromTarget( - expr=q.node_disk_writes_completed_total, - legendFormat='{{device}} writes completed', - )), - - local diskAvgWaitTime = - nodeTimeseries.new( - title='Disk Average Wait Time', - description='The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.' - ) - .withUnits('s') - .withNegativeYByRegex('read') - .withAxisLabel('read(-) | write(+)') - .addTarget(commonPromTarget( - expr=q.diskWaitReadTime, - legendFormat='{{device}} read wait time avg', - )) - .addTarget(commonPromTarget( - expr=q.diskWaitWriteTime, - legendFormat='{{device}} write wait time avg', - )), - - local diskAvgQueueSize = - nodeTimeseries.new( - title='Average Queue Size (aqu-sz)', - description='The average queue length of the requests that were issued to the device.' 
- ) - .addTarget(commonPromTarget( - expr=q.diskAvgQueueSize, - legendFormat='{{device}}', - )), - - local panelsGrid = - [ - { type: 'row', title: 'Filesystem', gridPos: { y: 0 } }, - fsAvailable { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - fsInodes { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - fsInodesTotal { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - fsErrorsandRO { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - fileDescriptors { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - { type: 'row', title: 'Disk', gridPos: { y: 25 } }, - c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - diskIOcompleted { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - diskAvgWaitTime { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - diskAvgQueueSize { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Filesystem and Disk' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-disk.json'] - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/fleet.libsonnet b/docs/node-mixin/dashboards/fleet.libsonnet deleted file mode 100644 index a9939e59e2..0000000000 --- a/docs/node-mixin/dashboards/fleet.libsonnet +++ /dev/null @@ -1,505 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local 
nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - - - local templates = [ - if std.member(std.split(config.instanceLabels, ','), template.name) - then - template - { - allValue: '.+', - includeAll: true, - multi: true, - } - else template - for template in c.templates - ], - - local q = c.queries, - - local fleetTable = - nodePanels.table.new( - title='Linux Nodes Overview' - ) - .addTarget(commonPromTarget(expr=q.osInfo, format='table', instant=true) { refId: 'INFO' }) - .addTarget(commonPromTarget(expr=q.nodeInfo, format='table', instant=true) { refId: 'OS' }) - .addTarget(commonPromTarget(expr=q.uptime, format='table', instant=true) { refId: 'UPTIME' }) - .addTarget(commonPromTarget(expr=q.systemLoad1, format='table', instant=true) { refId: 'LOAD1' }) - .addTarget(commonPromTarget(expr=q.systemLoad5, format='table', instant=true) { refId: 'LOAD5' }) - .addTarget(commonPromTarget(expr=q.systemLoad15, format='table', instant=true) { refId: 'LOAD15' }) - .addTarget(commonPromTarget( - expr=q.cpuCount, - format='table', - instant=true, - ) { refId: 'CPUCOUNT' }) - .addTarget(commonPromTarget( - expr=q.cpuUsage, format='table', instant=true, - ) { refId: 'CPUUSAGE' }) - .addTarget(commonPromTarget(expr=q.memoryTotal, format='table', instant=true) { refId: 'MEMTOTAL' }) - .addTarget(commonPromTarget(expr=q.memoryUsage, format='table', instant=true) { refId: 'MEMUSAGE' }) - .addTarget(commonPromTarget(expr=q.fsSizeTotalRoot, format='table', instant=true) { refId: 'FSTOTAL' }) - .addTarget(commonPromTarget( - expr= - ||| - 100-(max by (%(instanceLabels)s) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, fstype!="", mountpoint="/"}) - / - max by 
(%(instanceLabels)s) (node_filesystem_size_bytes{%(nodeQuerySelector)s, fstype!="", mountpoint="/"}) * 100) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - ) { refId: 'FSUSAGE' }) - .addTarget(commonPromTarget( - expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true - ) { refId: 'CRITICAL' }) - .addTarget(commonPromTarget( - expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true - ) { refId: 'WARNING' }) - .withTransform() - .joinByField(field=std.split(config.instanceLabels, ',')[0]) - .filterFieldsByName(std.split(config.instanceLabels, ',')[0] + '|nodename|Value.+') - .organize( - excludeByName={ - 'Value #OS': true, - 'Value #INFO': true, - 'Value #LOAD5': true, - 'Value #LOAD15': true, - }, - renameByName={ - instance: 'Instance', - pretty_name: 'OS', - nodename: 'Hostname', - release: 'Kernel version', - 'Value #LOAD1': 'Load 1m', - 'Value #LOAD5': 'Load 5m', - 'Value #LOAD15': 'Load 15m', - 'Value #CPUCOUNT': 'Cores', - 'Value #CPUUSAGE': 'CPU usage', - 'Value #MEMTOTAL': 'Memory total', - 'Value #MEMUSAGE': 'Memory usage', - 'Value #FSTOTAL': 'Root disk size', - 'Value #FSUSAGE': 'Root disk usage', - 'Value #UPTIME': 'Uptime', - 'Value #CRITICAL': 'Crit Alerts', - 'Value #WARNING': 'Warnings', - } - ) - .withFooter(reducer=['mean'], fields=[ - 'Value #LOAD1', - 'Value #MEMUSAGE', - 'Value #CPUUSAGE', - ]) - .addThresholdStep(color='light-blue', value=null) - .addThresholdStep(color='light-yellow', value=80) - .addThresholdStep(color='light-red', 
value=90) - .addOverride( - matcher={ - id: 'byName', - options: 'Instance', - }, - properties=[ - { - id: 'links', - value: [ - { - targetBlank: true, - title: c.links.instanceDataLinkForTable.title, - url: c.links.instanceDataLinkForTable.url, - }, - ], - }, - { - id: 'custom.filterable', - value: true, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'OS|Kernel version|Hostname', - }, - properties=[ - { - id: 'custom.filterable', - value: true, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Memory total|Root disk size', - }, - properties=[ - { - id: 'unit', - value: 'bytes', - }, - { - id: 'decimals', - value: 0, - }, - ] - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Cores', - }, - properties=[ - { - id: 'custom.width', - value: 60, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Load.+', - }, - properties=[ - { - id: 'custom.width', - value: 60, - }, - ] - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Uptime', - }, - properties=[ - { - id: 'unit', - value: 'dtdurations', - }, - { - id: 'custom.displayMode', - value: 'color-text', - }, - { - id: 'thresholds', - value: { - mode: 'absolute', - steps: [ - { - color: 'light-orange', - value: null, - }, - { - color: 'text', - value: 300, - }, - ], - }, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'CPU usage|Memory usage|Root disk usage', - }, - properties=[ - { - id: 'unit', - value: 'percent', - }, - // { - // id: 'custom.displayMode', - // value: 'gradient-gauge', - // }, - { - id: 'custom.displayMode', - value: 'basic', - }, - { - id: 'max', - value: 100, - }, - { - id: 'min', - value: 0, - }, - ] - ) - .sortBy('Instance') - , - - local memoryUsagePanel = - nodePanels.timeseries.new('Memory Usage', description='Top 25') - .withUnits('percent') - .withMin(0) - .withMax(100) - .withColor(mode='continuous-BlYlRd') - .withFillOpacity(1) - .withGradientMode('scheme') - .withLegend(mode='table', 
calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.memoryUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) - )) - .addTarget(commonPromTarget( - expr='avg(' + q.memoryUsage + ')', - legendFormat='Mean', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - - local cpuUsagePanel = - nodePanels.timeseries.new('CPU Usage', description='Top 25') - .withUnits('percent') - .withMin(0) - .withMax(100) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') - .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.cpuUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')), - )) - .addTarget(commonPromTarget( - expr='avg(' + q.cpuUsage + ')', - legendFormat='Mean', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - - local diskIOPanel = - nodePanels.timeseries.new('Disks I/O', description='Top 25') - .withUnits('percentunit') - .withMin(0) - .withMax(1) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') 
- .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.diskIoTime + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - local diskSpacePanel = - nodePanels.timeseries.new('Disks Space Usage', description='Top 25') - .withUnits('percentunit') - .withMin(0) - .withMax(1) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') - .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.diskSpaceUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{mountpoint}}', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - local networkErrorsDropsPanel = - nodePanels.timeseries.new('Network Errors and Dropped Packets', description='Top 25') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addTarget(commonPromTarget( - expr='topk(25, ' + 
q.networkReceiveErrorsPerSec + ' + ' + q.networkTransmitErrorsPerSec + ' + ' + q.networkReceiveDropsPerSec + ' + ' + q.networkTransmitDropsPerSec + ') > 0.5', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', - )) - .withDecimals(1) - .withUnits('pps') - .withDrawStyle('points') - .withPointsSize(5) - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ), - - local rows = - [ - row.new('Overview') - .addPanel(fleetTable { span: 12, height: '800px' }) - .addPanel(cpuUsagePanel { span: 12 }) - .addPanel(memoryUsagePanel { span: 12 }) - .addPanel(diskIOPanel { span: 6 }).addPanel(diskSpacePanel { span: 6 }) - .addPanel(networkErrorsDropsPanel { span: 12 }), - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Fleet Overview' % config.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-fleet.json'], - ) - .addLink(c.links.otherDashes { includeVars: false }) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addRows(rows) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/memory.libsonnet b/docs/node-mixin/dashboards/memory.libsonnet deleted file mode 100644 index 5b6e613851..0000000000 --- a/docs/node-mixin/dashboards/memory.libsonnet +++ /dev/null @@ -1,406 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import 
'../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local memoryPagesInOut = - nodeTimeseries.new( - 'Memory Pages In / Out', - description=||| - Page-In - Return of pages to physical memory. This is a common and normal event. - - Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. - When the kernel detects low memory, it attempts to free memory by paging out. - While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. - Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. - ||| - ) - .withNegativeYByRegex('out') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgpgin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Page-In' - )) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgpgout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Page-Out' - )), - local memoryPagesSwapInOut = - nodeTimeseries.new( - 'Memory Pages Swapping In / Out', - description=||| - Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. - Nonetheless, it is a preferable option to crashing or killing off processes. - - The process of writing pages out to disk to free memory is known as swapping-out. - If a page fault occurs because the page is on disk, in the swap area, rather than in memory, - the kernel will read the page back in from the disk to satisfy the page fault. - This is known as swapping-in. 
- ||| - ) - .withNegativeYByRegex('out') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pswpin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Pages swapped in' - )) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pswpout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Pages swapped out' - )), - - local memoryPagesFaults = - nodeTimeseries.new( - 'Memory Page Faults', - description=||| - A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, - requiring a mapping to be added to the process's virtual address space. The page contents may also need to be loaded from a backing store such as a disk. - While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. - Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. - ||| - ) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Major page fault operations' - )) - .addTarget(commonPromTarget( - expr= - ||| - irate(node_vmstat_pgfault{%(nodeQuerySelector)s}[$__rate_interval]) - - - irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Minor page fault operations' - )), - - local memoryOOMkiller = - nodeTimeseries.new( - 'OOM Killer', - description=||| - Out Of Memory Killer is a process used by the Linux kernel when the system is running critically low on memory. 
- This can happen when the kernel has allocated more memory than is available for its processes. - ||| - ) - .addTarget(commonPromTarget( - expr='increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval] offset -$__interval)' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='OOM killer invocations' - )), - - local memoryActiveInactive = - nodeTimeseries.new( - 'Memory Active / Inactive', - description=||| - Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. - Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Inactive_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active', - )), - - local memoryActiveInactiveDetail = - nodeTimeseries.new( - 'Memory Active / Inactive Details', - description=||| - Inactive_file: File-backed memory on inactive LRU list. - Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). - Active_file: File-backed memory on active LRU list. - Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Inactive_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive_file', - )) - .addTarget(commonPromTarget( - expr='node_memory_Inactive_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive_anon', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active_file', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active_anon', - )), - - local memoryCommited = - nodeTimeseries.new( - 'Memory Commited', - description=||| - Committed_AS - Amount of memory presently allocated on the system. - CommitLimit - Amount of memory currently available to be allocated on the system. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Committed_AS_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Committed_AS' - )) - .addTarget(commonPromTarget( - expr='node_memory_CommitLimit_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CommitLimit' - )), - local memorySharedAndMapped = - nodeTimeseries.new( - 'Memory Shared and Mapped', - description=||| - Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. - Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. - ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. - ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Mapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Mapped' - )) - .addTarget(commonPromTarget( - expr='node_memory_Shmem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Shmem' - )) - .addTarget(commonPromTarget( - expr='node_memory_ShmemHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ShmemHugePages' - )) - .addTarget(commonPromTarget( - expr='node_memory_ShmemPmdMapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ShmemPmdMapped' - )), - - local memoryWriteAndDirty = - nodeTimeseries.new( - 'Memory Writeback and Dirty', - description=||| - Writeback: This refers to the memory that is currently being actively written back to the disk. - WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. - Dirty: This type of memory is waiting to be written back to the disk. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Writeback_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Writeback' - )) - .addTarget(commonPromTarget( - expr='node_memory_WritebackTmp_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='WritebackTmp' - )) - .addTarget(commonPromTarget( - expr='node_memory_Dirty_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Dirty' - )), - - local memoryVmalloc = - nodeTimeseries.new( - 'Memory Vmalloc', - description=||| - Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. 
This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. - - VmallocChunk: Largest contiguous block of vmalloc area which is free. - VmallocTotal: Total size of vmalloc memory area. - VmallocUsed: Amount of vmalloc area which is used. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_VmallocChunk_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocChunk' - )) - .addTarget(commonPromTarget( - expr='node_memory_VmallocTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocTotal' - )) - .addTarget(commonPromTarget( - expr='node_memory_VmallocUsed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocUsed' - )), - - local memorySlab = - nodeTimeseries.new('Memory Slab', - description=||| - Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. - - The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. - - SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. - SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
- |||) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_SUnreclaim_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='SUnreclaim' - )) - .addTarget(commonPromTarget( - expr='node_memory_SReclaimable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='SReclaimable' - )), - - local memoryAnonymous = - nodeTimeseries.new( - 'Memory Anonymous', - description=||| - Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. - - This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. - - Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, - and from Memory Shared with other processes, - which refers to memory regions that can be accessed and modified by multiple processes. - - AnonHugePages: Memory in anonymous huge pages. - AnonPages: Memory in user pages not backed by files. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_AnonHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='AnonHugePages' - )) - .addTarget(commonPromTarget( - expr='node_memory_AnonPages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='AnonPages' - )), - - local memoryHugePagesCounter = - nodeTimeseries.new( - 'Memory HugePages Counter', - description=||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. 
- - HugePages_Free: Huge pages in the pool that are not yet allocated. - HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. - HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. - ||| - ) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Free{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Free' - )) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Rsvd{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Rsvd' - )) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Surp{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Surp' - )), - local memoryHugePagesSize = - nodeTimeseries.new( - 'Memory HugePages Size', - description=||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Total{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Huge pages total size' - )) - .addTarget(commonPromTarget( - expr='node_memory_Hugepagesize_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Huge page size' - )), - local memoryDirectMap = - nodeTimeseries.new( - 'Memory Direct Map', - description=||| - Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. 
This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_DirectMap1G_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap1G' - )) - .addTarget(commonPromTarget( - expr='node_memory_DirectMap2M_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap2M' - )) - .addTarget(commonPromTarget( - expr='node_memory_DirectMap4k_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap4k' - )), - - local memoryBounce = - nodeTimeseries.new( - 'Memory Bounce', - description=||| - Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. - - To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. - - Bounce: Memory used for block device bounce buffers. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Bounce_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Bounce' - )), - local panelsGrid = - [ - c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 0 } }, - c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 0 } }, - { type: 'row', title: 'Vmstat', gridPos: { y: 25 } }, - memoryPagesInOut { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - memoryPagesSwapInOut { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - memoryPagesFaults { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - memoryOOMkiller { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - { type: 'row', title: 'Memstat', gridPos: { y: 50 } }, - memoryActiveInactive { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryActiveInactiveDetail { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryCommited { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memorySharedAndMapped { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryWriteAndDirty { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryVmalloc { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memorySlab { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryAnonymous { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryHugePagesCounter { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryHugePagesSize { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryDirectMap { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryBounce { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Memory' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-memory.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - 
.addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/network.libsonnet b/docs/node-mixin/dashboards/network.libsonnet deleted file mode 100644 index ceacd13e42..0000000000 --- a/docs/node-mixin/dashboards/network.libsonnet +++ /dev/null @@ -1,796 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local networkTrafficPanel = - commonPanels.networkTrafficGraph.new( - 'Network Traffic', - description=||| - Network interfaces utilisation by device and direction. - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveBitsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - legendFormat='{{device}} transmitted', - )), - - local networkPacketsPanel = - nodeTimeseries.new( - 'Packets', - description=||| - packets received: Number of good packets received by the interface. - For hardware interfaces counts all good packets received from the device by the host, including packets which host had to drop at various stages of processing (even in the driver). - - packets transmitted: Number of packets successfully transmitted. 
- For hardware interfaces counts packets which host was able to successfully hand over to the device, - which does not necessarily mean that packets had been successfully transmitted out of the device, only that device acknowledged it copied them out of host memory. - - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkErrorsPanel = - nodeTimeseries.new( - 'Network Errors', - description=||| - errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. - - errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. - - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveErrorsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitErrorsPerSec, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkDropsPanel = - nodeTimeseries.new( - 'Dropped Packets', - description=||| - drops received: Number of packets received but not processed, e.g. 
due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together). - - drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources. - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveDropsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitDropsPerSec, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - local networkCompressedPanel = - nodeTimeseries.new( - 'Compressed Packets', - description=||| - compressed received: - Number of correctly received compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). - - compressed transmitted: - Number of transmitted compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). 
- - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkMulticastPanel = - nodeTimeseries.new( - 'Multicast Packets', - description=||| - Multicast packets received and transmitted. - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit'), - - local networkFifoPanel = - nodeTimeseries.new( - 'Network FIFO', - description=||| - Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. - It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. - Packets are stored in the FIFO buffer until they can be transmitted or processed further. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkNFConntrack = - nodeTimeseries.new( - 'NF Conntrack', - description=||| - NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, - enforce firewall rules, perform NAT, and manage network address/port translation. - ||| - ) - .addTarget(commonPromTarget( - 'node_nf_conntrack_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='NF conntrack entries', - )) - .addTarget(commonPromTarget( - 'node_nf_conntrack_entries_limit{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='NF conntrack limits', - )) - .withFillOpacity(0), - - local networkSoftnetPanel = - nodeTimeseries.new( - 'Softnet Packets', - description=||| - Softnet packets are received by the network and queued for processing by the kernel's networking stack. - Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_softnet_processed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu }} proccessed', - )) - .addTarget(commonPromTarget( - 'irate(node_softnet_dropped_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu }} dropped', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('dropped') - .withAxisLabel('Dropped(-) | Processed(+)'), - - local networkSoftnetSqueezePanel = - nodeTimeseries.new( - 'Softnet Out of Quota', - description=||| - "Softnet Out of Quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity. - This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_softnet_times_squeezed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu}} out of quota', - )) - .withDecimals(1) - .withUnits('pps'), - - local networkInterfacesTable = - nodePanels.table.new( - title='Network Interfaces Overview' - ) - // "Value #A" - .addTarget(commonPromTarget( - expr='node_network_up{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #B" - .addTarget(commonPromTarget( - expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #C" - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - format='table', - instant=true, - )) - // "Value #D" - .addTarget(commonPromTarget( - expr=q.networkReceiveBitsPerSec, - format='table', - instant=true, - )) - // "Value #E" - .addTarget(commonPromTarget( - expr='node_arp_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #F" - .addTarget(commonPromTarget( - expr='node_network_mtu_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #G" - .addTarget(commonPromTarget( - expr='node_network_speed_bytes{%(nodeQuerySelector)s} * 8' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #H" - .addTarget(commonPromTarget( - expr='node_network_transmit_queue_length{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "VALUE #I" - .addTarget(commonPromTarget( - expr='node_network_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - .withTransform() - 
.joinByField(field='device') - .filterFieldsByName('device|address|duplex|Value.+') - .organize( - excludeByName={ - 'Value #I': true, - }, - renameByName= - { - device: 'Interface', - address: 'Address', - duplex: 'Duplex', - 'Value #A': 'Up', - 'Value #B': 'Carrier', - 'Value #C': 'Transmit', - 'Value #D': 'Receive', - 'Value #E': 'ARP entries', - 'Value #F': 'MTU', - 'Value #G': 'Speed', - 'Value #H': 'Queue length', - } - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Speed', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Carrier|Up', - }, - properties=[ - { - id: 'custom.displayMode', - value: 'color-text', - }, - { - id: 'mappings', - value: [ - { - type: 'value', - options: { - '0': { - text: 'Down', - color: 'light-red', - index: 1, - }, - '1': { - text: 'Up', - color: 'light-green', - index: 0, - }, - }, - }, - ], - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Transmit|Receive', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - { - id: 'custom.displayMode', - value: 'gradient-gauge', - }, - { - id: 'color', - value: { - mode: 'continuous-BlYlRd', - }, - }, - { - id: 'max', - value: 1000 * 1000 * 100, - }, - ] - ) - , - - local networkOperStatus = - nodeTimeseries.new( - title='Network Interfaces Carrier Status', - description='Network Interfaces Carrier Status', - ) - .withColor(mode='palette-classic') - .withFillOpacity(100) - .withLegend(mode='list') - .addTarget(commonPromTarget( - expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}}' - )) - + { - maxDataPoints: 100, - type: 'status-history', - fieldConfig+: { - defaults+: { - mappings+: [ - { - type: 'value', - options: { - '1': { - text: 'Up', - color: 'light-green', - index: 1, - }, - }, - }, - { - type: 'value', - options: { - '0': { - text: 'Down', - color: 'light-red', - index: 0, - }, - }, - }, - 
- ], - }, - }, - }, - // https://github.com/prometheus/node_exporter/pull/2346/files#diff-3699c850869aecf912f8e8272958b556913fc266534206833a5dcb7d6cca3610 - local networkSockstatTCP = - nodeTimeseries.new( - title='Sockets TCP', - description=||| - TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. - - Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. - ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_alloc{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Allocated' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 In use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 In use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_orphan{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Orphan sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_tw{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Time wait' - )), - - local networkSockstatUDP = - nodeTimeseries.new( - title='Sockets UDP', - description=||| - UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. - Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. 
- ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_UDPLITE_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 UDPLITE in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 UDP in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDPLITE6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 UDPLITE in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 UDP in use' - )), - - local networkSockstatOther = - nodeTimeseries.new( - title='Sockets Other', - description=||| - FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. - - RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. 
- ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_FRAG_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 Frag sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_FRAG6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 Frag sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_RAW_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 Raw sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_RAW6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 Raw sockets in use' - )), - - - local networkSockstatMemory = - nodeTimeseries.new( - title='Sockets Memory', - description=||| - Memory currently in use for sockets. - ||| - ) - .withMaxDataPoints(100) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory pages allocated for TCP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory pages allocated for UDP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory bytes allocated for TCP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory bytes allocated for UDP sockets' - )) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/bytes/', - }, - properties=[ - { - id: 'unit', - value: 'bytes', - }, - { - id: 'custom.drawStyle', - value: 'lines', - }, - { - id: 'custom.drawStyle', - value: 'bars', - }, - { - id: 
'custom.stacking', - value: { - mode: 'normal', - group: 'A', - }, - }, - ] - ), - - local networkSockstatAll = - nodeTimeseries.new( - title='Sockets in use', - description='Number of sockets currently in use.', - ) - .addTarget(commonPromTarget( - expr='node_sockstat_sockets_used{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 sockets in use' - )), - - local networkNetstatIP = - nodeTimeseries.new( - title='IP octets', - description='Rate of IP octets received and transmitted.' - ) - .withUnits('oct/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_IpExt_InOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Octets received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_IpExt_OutOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Octets transmitted' - )), - - - local networkNetstatTCP = - nodeTimeseries.new( - title='TCP segments', - description='Rate of TCP segments received and transmitted.' - ) - .withUnits('seg/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_InSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_OutSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP transmitted' - )), - - local networkNetstatTCPerrors = - nodeTimeseries.new( - title='TCP errors rate', - description='Rate of TCP errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_ListenOverflows{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP overflow' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_ListenDrops{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP ListenDrops - SYNs to LISTEN sockets ignored' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_TCPSynRetrans{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP SYN rentransmits' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_RetransSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP retransmitted segments, containing one or more previously transmitted octets' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_InErrs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP received with errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_OutRsts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP segments sent with RST flag' - )), - - local networkNetstatUDP = - nodeTimeseries.new( - title='UDP datagrams', - description='Rate of UDP datagrams received and transmitted.' 
- ) - .withUnits('dat/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP transmitted' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 transmitted' - )), - - local networkNetstatUDPerrors = - nodeTimeseries.new( - title='UDP errors rate', - description='Rate of UDP datagrams received and transmitted with errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_UdpLite_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDPLite InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP NoPorts' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 NoPorts' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP receive buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 receive buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP send buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 send buffer errors' - )), - - - local networkNetstatICMP = - nodeTimeseries.new( - title='ICMP messages', 
- description="Rate of ICMP messages, like 'ping', received and transmitted." - ) - .withUnits('msg/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP transmitted' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 transmitted' - )), - - local networkNetstatICMPerrors = - nodeTimeseries.new( - title='ICMP errors rate', - description='Rate of ICMP messages received and transmitted with errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP Errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 Errors' - )), - - - local rows = - [ - row.new('Network') - .addPanel(networkInterfacesTable { span: 12 }) - .addPanel(networkTrafficPanel { span: 6 }) - .addPanel(networkOperStatus { span: 6 }) - .addPanel(networkErrorsPanel { span: 6 }) - .addPanel(networkDropsPanel { span: 6 }) - .addPanel(networkPacketsPanel { span: 6 }) - .addPanel(networkMulticastPanel { span: 6 }) - .addPanel(networkFifoPanel { span: 6 }) - .addPanel(networkCompressedPanel { span: 6 }) - .addPanel(networkNFConntrack { span: 6 }) - .addPanel(networkSoftnetPanel { span: 6 }) - .addPanel(networkSoftnetSqueezePanel { span: 6 }), - row.new('Network Sockets') - .addPanel(networkSockstatAll { span: 12 }) - .addPanel(networkSockstatTCP { span: 6 }) - .addPanel(networkSockstatUDP { span: 6 }) - .addPanel(networkSockstatMemory { span: 6 }) - .addPanel(networkSockstatOther { span: 6 }), - - row.new('Network Netstat') - .addPanel(networkNetstatIP { span: 12 }) - .addPanel(networkNetstatTCP { span: 6 }) - .addPanel(networkNetstatTCPerrors { span: 6 }) - .addPanel(networkNetstatUDP { span: 6 }) - .addPanel(networkNetstatUDPerrors { span: 6 }) - .addPanel(networkNetstatICMP { span: 6 }) - .addPanel(networkNetstatICMPerrors { span: 6 }), - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Network' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - 
uid=config.grafanaDashboardIDs['nodes-network.json'] - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addRows(rows) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet deleted file mode 100644 index a00eb1b9f7..0000000000 --- a/docs/node-mixin/dashboards/node.libsonnet +++ /dev/null @@ -1,19 +0,0 @@ -{ - local nodemixin = import './prom-mixin.libsonnet', - local cpu = import './cpu.libsonnet', - local system = import './system.libsonnet', - local memory = import './memory.libsonnet', - local disk = import './disk.libsonnet', - local network = import './network.libsonnet', - local fleet = import './fleet.libsonnet', - - grafanaDashboards+:: { - 'nodes.json': nodemixin.new(config=$._config, platform='Linux').dashboard, - 'nodes-darwin.json': nodemixin.new(config=$._config, platform='Darwin').dashboard, - 'nodes-system.json': system.new(config=$._config, platform='Linux').dashboard, - 'nodes-memory.json': memory.new(config=$._config, platform='Linux').dashboard, - 'nodes-network.json': network.new(config=$._config, platform='Linux').dashboard, - 'nodes-disk.json': disk.new(config=$._config, platform='Linux').dashboard, - 'nodes-fleet.json': fleet.new(config=$._config, platform='Linux').dashboard, - }, -} diff --git a/docs/node-mixin/dashboards/prom-mixin.libsonnet b/docs/node-mixin/dashboards/prom-mixin.libsonnet deleted file mode 100644 index a562844073..0000000000 --- a/docs/node-mixin/dashboards/prom-mixin.libsonnet +++ /dev/null @@ -1,180 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local statPanel = grafana.statPanel; -local nodePanels 
= import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; -local nodeTemplates = common.templates; - -{ - - new(config=null, platform=null):: { - - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local uptimePanel = - commonPanels.uptimeStat.new() - .addTarget(commonPromTarget(expr=q.uptime)), - - local cpuCountPanel = - commonPanels.infoStat.new('CPU Count') - .addTarget(commonPromTarget(expr=q.cpuCount)), - - local memoryTotalPanel = - commonPanels.infoStat.new('Memory Total') - .addTarget(commonPromTarget(expr=q.memoryTotal)) - .withUnits('bytes') - .withDecimals(0), - - local osPanel = - commonPanels.infoStat.new('OS') - .addTarget(commonPromTarget( - expr=q.osInfo, format='table' - )) { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, - - local nodeNamePanel = - commonPanels.infoStat.new('Hostname') - .addTarget(commonPromTarget( - expr=q.nodeInfo, format='table' - )) - { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, - - local kernelVersionPanel = - - commonPanels.infoStat.new('Kernel version') - .addTarget(commonPromTarget( - expr=q.nodeInfo, format='table' - )) - { options+: { reduceOptions+: { fields: '/^release$/' } } } - , - - local totalSwapPanel = - commonPanels.infoStat.new('Total swap') - .addTarget(commonPromTarget( - expr=q.memorySwapTotal - )) - .withUnits('bytes') - .withDecimals(0), - - local totalRootFSPanel = - commonPanels.infoStat.new('Root mount size') - .addTarget(commonPromTarget( - expr=q.fsSizeTotalRoot, - )) - .withUnits('bytes') - .withDecimals(0), - - local networkTrafficPanel = - commonPanels.networkTrafficGraph.new( - 'Network Traffic', description='Network transmitted and received (bits/s)', - ) - .addTarget(commonPromTarget( - 
expr=q.networkReceiveBitsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - legendFormat='{{device}} transmitted', - )), - - local networkErrorsDropsPanel = - nodePanels.timeseries.new( - 'Network Errors and Dropped Packets', - description=||| - errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. - - errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. - - drops received: Number of packets received but not processed, e.g. due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together). - - drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources. 
- - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveErrorsPerSec, - legendFormat='{{device}} errors received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitErrorsPerSec, - legendFormat='{{device}} errors transmitted', - )) - .addTarget(commonPromTarget( - expr=q.networkReceiveDropsPerSec, - legendFormat='{{device}} drops received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitDropsPerSec, - legendFormat='{{device}} drops transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('trasnmitted') - .withAxisLabel('out(-) | in(+)'), - - - local panelsGrid = - [ - // use negative gravity effect, max w=24, default h=8 - { type: 'row', title: 'Overview' }, - uptimePanel { gridPos: { x: 0, w: 6, h: 2 } }, - nodeNamePanel { gridPos: { x: 6, w: 6, h: 2 } }, - kernelVersionPanel { gridPos: { x: 12, w: 6, h: 2 } }, - osPanel { gridPos: { x: 18, w: 6, h: 2 } }, - cpuCountPanel { gridPos: { x: 0, w: 6, h: 2 } }, - memoryTotalPanel { gridPos: { x: 6, w: 6, h: 2 } }, - totalSwapPanel { gridPos: { x: 12, w: 6, h: 2 } }, - totalRootFSPanel { gridPos: { x: 18, w: 6, h: 2 } }, - { type: 'row', title: 'CPU' } { gridPos: { y: 25 } }, - c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6, y: 25 } }, - c.panelsWithTargets.idleCPU { gridPos: { x: 6, w: 12, h: 6, y: 25 } }, - c.panelsWithTargets.systemLoad { gridPos: { x: 18, w: 6, h: 6, y: 25 } }, - { type: 'row', title: 'Memory' } { gridPos: { y: 50 } }, - c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 50 } }, - c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 50 } }, - { type: 'row', title: 'Disk' } { gridPos: { y: 75 } }, - c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 75 } }, - c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 75 } }, - { type: 'row', title: 'Network' } { gridPos: { y: 100 } }, - networkTrafficPanel { gridPos: { x: 0, 
w: 12, h: 8, y: 100 } }, - networkErrorsDropsPanel { gridPos: { x: 12, w: 12, h: 8, y: 100 } }, - ], - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Overview' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then - dashboard.new( - '%sMacOS' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-darwin.json'], - ) - .addTemplates(templates) - .addPanels(panelsGrid), - - }, -} diff --git a/docs/node-mixin/dashboards/system.libsonnet b/docs/node-mixin/dashboards/system.libsonnet deleted file mode 100644 index e1bd58d759..0000000000 --- a/docs/node-mixin/dashboards/system.libsonnet +++ /dev/null @@ -1,150 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = 
c.queries, - - local cpuUsageModes = - nodeTimeseries.new( - 'CPU Usage', - description=||| - System: Processes executing in kernel mode. - User: Normal processes executing in user mode. - Nice: Niced processes executing in user mode. - Idle: Waiting for something to happen. - Iowait: Waiting for I/O to complete. - Irq: Servicing interrupts. - Softirq: Servicing softirqs. - Steal: Time spent in other operating systems when running in a virtualized environment. - ||| - ) - .withStacking('normal') - .withUnits('percent') - .withFillOpacity(100) - .withMax(100) - .withMin(0) - .addTarget(commonPromTarget( - expr=q.cpuUsageModes, - legendFormat='{{mode}}', - )), - local timeZoneOffset = - commonPanels.infoStat.new( - 'Timezone', - description='Timezone set on instance.' - ) - .addTarget(commonPromTarget( - expr=q.node_time_zone_offset_seconds, format='table' - )) - { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, - local timeSyncDrift = - nodeTimeseries.new( - 'Time Synchronized Drift', - description=||| - Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. - ||| - ) - .withUnits('s') - .addTarget(commonPromTarget( - expr=q.node_timex_estimated_error_seconds, - legendFormat='Estimated error in seconds', - )) - .addTarget(commonPromTarget( - expr=q.node_timex_offset_seconds, - legendFormat='Time offset in between local system and reference clock', - )) - .addTarget(commonPromTarget( - expr=q.node_timex_maxerror_seconds, - legendFormat='Maximum error in seconds' - )), - - local timeSynchronizedStatus = - nodeTimeseries.new( - 'Time Synchronized Status', - description='Status of time synchronization.' 
- ) - .withColor(mode='palette-classic') - .withFillOpacity(75) - .withLegend(show=false) - { - maxDataPoints: 100, - type: 'status-history', - fieldConfig+: { - defaults+: { - mappings+: [ - { - type: 'value', - options: { - '1': { - text: 'In sync', - color: 'light-green', - index: 1, - }, - }, - }, - { - type: 'value', - options: { - '0': { - text: 'Not in sync', - color: 'light-yellow', - index: 0, - }, - }, - }, - - ], - }, - }, - } - .addTarget(commonPromTarget( - expr=q.node_timex_sync_status, - legendFormat='Sync status', - )), - - local panelsGrid = - [ - //use negative gravity(skip y), max w=24, default h should be '6'. - c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6 } }, - c.panelsWithTargets.idleCPU { gridPos: { x: 6, h: 6, w: 9 } }, - cpuUsageModes { gridPos: { x: 15, h: 6, w: 9 } }, - //pseudorow y:25 - c.panelsWithTargets.systemLoad { gridPos: { x: 0, h: 6, w: 12, y: 25 } }, - c.panelsWithTargets.systemContextSwitches { gridPos: { x: 12, h: 6, w: 12, y: 25 } }, - { type: 'row', title: 'Time', gridPos: { x: 0, w: 24, y: 75 } }, - timeZoneOffset { gridPos: { x: 0, h: 3, w: 3, y: 75 } }, - timeSynchronizedStatus { gridPos: { x: 3, h: 3, w: 21, y: 75 } }, - timeSyncDrift { gridPos: { x: 0, h: 6, w: 24, y: 80 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode CPU and System' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-system.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet deleted file mode 
100644 index 9de0c4103a..0000000000 --- a/docs/node-mixin/dashboards/use.libsonnet +++ /dev/null @@ -1,476 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; - -local datasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', -}; - -local CPUUtilisation = - graphPanel.new( - 'CPU Utilisation', - description='Total CPU utilisation percent.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local CPUSaturation = - // TODO: Is this a useful panel? At least there should be some explanation how load - // average relates to the "CPU saturation" in the title. - graphPanel.new( - 'CPU Saturation (Load1 per CPU)', - description='System load average over the last minute. A measurement of how many processes are waiting for CPU cycles. 
The value is as a percent compared to the number of CPU cores for the node.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local memoryUtilisation = - graphPanel.new( - 'Memory Utilisation', - description='Total memory utilisation in percent.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local memorySaturation = - graphPanel.new( - 'Memory Saturation (Major Page Faults)', - description='Rate of major memory page faults.', - datasource='$datasource', - span=6, - format='rds', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local networkUtilisation = - graphPanel.new( - 'Network Utilisation (Bytes Receive/Transmit)', - description='Network Utilisation (Bytes Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, - ) - .addSeriesOverride({ alias: '/Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; - -local networkSaturation = - graphPanel.new( - 'Network Saturation (Drops Receive/Transmit)', - description='Network Saturation (Drops Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, - ) - .addSeriesOverride({ alias: '/ Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/ Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; - -local diskIOUtilisation = - graphPanel.new( - 'Disk IO Utilisation', - description='Disk total IO seconds.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local diskIOSaturation = - graphPanel.new( - 'Disk IO Saturation', - description='Disk saturation (weighted seconds spent, 1 
second rate)', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local diskSpaceUtilisation = - graphPanel.new( - 'Disk Space Utilisation', - description='Total disk utilisation percent', - datasource='$datasource', - span=12, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -{ - _clusterTemplate:: template.new( - name='cluster', - datasource='$datasource', - query='label_values(node_time_seconds, %s)' % $._config.clusterLabel, - current='', - hide=if $._config.showMultiCluster then '' else '2', - refresh=2, - includeAll=false, - sort=1 - ), - - grafanaDashboards+:: { - 'node-rsrc-use.json': - - dashboard.new( - '%sUSE Method / Node' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addTemplate( - template.new( - 'instance', - '$datasource', - 'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config, - refresh='time', - sort=1 - ) - ) - .addRow( - row.new('CPU') - .addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation'))) - ) - .addRow( - row.new('Memory') - .addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, 
instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - .addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation.addTarget(prometheus.target( - ||| - 
sort_desc(1 - - ( - max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - / - max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - ) != 0 - ) - ||| % $._config, legendFormat='{{device}}' - )) - ) - ), - - 'node-cluster-rsrc-use.json': - dashboard.new( - '%sUSE Method / Cluster' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-cluster-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - ) != 0 ) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ||| % $._config, legendFormat='{{ instance }}' - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - ( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}}' - )) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - ( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, 
legendFormat='{{instance}}', - )) - ) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - ( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - ( - instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, 
%(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum without (device) ( - max without (fstype, mountpoint) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - - - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}))) - ||| % $._config, legendFormat='{{instance}}' - )) - ) - ), - } + - if $._config.showMultiCluster then { - 'node-multicluster-rsrc-use.json': - dashboard.new( - '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-multicluster-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - sum( - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s} - ) != 0) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - 
)) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - memorySaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / 
scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum ( - sum without (device) ( - max without (fstype, mountpoint, instance, pod) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ), - } else {}, -} diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 721d4833a0..7459652bfa 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -4,8 +4,17 @@ { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" } }, "version": "master" @@ -13,12 +22,12 @@ { "source": { "git": { - "remote": 
"https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet-7.0" + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" } }, "version": "master" } ], - "legacyImports": false -} + "legacyImports": true +} \ No newline at end of file diff --git a/docs/node-mixin/lib/aix/README.md b/docs/node-mixin/lib/aix/README.md new file mode 100644 index 0000000000..43fb44fe5b --- /dev/null +++ b/docs/node-mixin/lib/aix/README.md @@ -0,0 +1,42 @@ +# AIX exporter observability lib + +This jsonnet observability lib can be used to generate observability package for node exporter(AIX). + +## Import + +```sh +jb init +jb install https://github.com/grafana/node_exporter/docs/node-mixin/lib/aix +``` + +## Examples + +### Example 1: Basic example + +You can use observ-lib to fill in monitoring-mixin structure: + +```jsonnet +// mixin.libsonnet file +local aixlib = import 'aix/main.libsonnet'; + +local aix = + aixlib.new() + + aixlib.withConfigMixin({ + filteringSelector: 'job=~".*aix.*"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'AIX / ', + dashboardTags: ['aix-mixin'], + uid: 'aix', + // enable loki logs + enableLokiLogs: true, + }); + +{ + grafanaDashboards+:: aix.grafana.dashboards, + prometheusAlerts+:: aix.prometheus.alerts, + prometheusRules+:: aix.prometheus.recordingRules, +} + +``` +For more examples see [node-mixin/lib/linux](../linux). 
diff --git a/docs/node-mixin/lib/aix/alerts.libsonnet b/docs/node-mixin/lib/aix/alerts.libsonnet new file mode 100644 index 0000000000..46f66bd5a3 --- /dev/null +++ b/docs/node-mixin/lib/aix/alerts.libsonnet @@ -0,0 +1,23 @@ +{ + new(this, parentPrometheus): + { + groups: + //keep only alerts listed in alertsKeep + std.filter( + function(group) std.length(group.rules) > 0, + [ + { + name: group.name, + rules: [ + rule + for rule in group.rules + if std.length(std.find(rule.alert, this.config.alertsKeep)) > 0 + ], + } + for group in parentPrometheus.alerts.groups + ], + + ), + + }, +} diff --git a/docs/node-mixin/lib/aix/config.libsonnet b/docs/node-mixin/lib/aix/config.libsonnet new file mode 100644 index 0000000000..3fa602f0d1 --- /dev/null +++ b/docs/node-mixin/lib/aix/config.libsonnet @@ -0,0 +1,29 @@ +{ + // Rest of the config is imported from linux + filteringSelector: 'job="aix"', + dashboardNamePrefix: 'MacOS / ', + //uid prefix + uid: 'aix', + + dashboardTags: ['aix-mixin'], + + + // Alerts to keep from node-observ-lib: + alertsKeep: [ + 'NodeFilesystemAlmostOutOfSpace', + 'NodeNetworkReceiveErrs', + 'NodeNetworkTransmitErrs', + 'NodeTextFileCollectorScrapeError', + 'NodeFilesystemFilesFillingUp', + 'NodeFilesystemAlmostOutOfFiles', + 'NodeCPUHighUsage', + 'NodeSystemSaturation', + 'NodeMemoryHighUtilization', + 'NodeDiskIOSaturation', + 'NodeHasRebooted', + 'NodeProcessesCountIsHigh', + ], + // logs lib related + enableLokiLogs: false, + +} diff --git a/docs/node-mixin/lib/aix/main.libsonnet b/docs/node-mixin/lib/aix/main.libsonnet new file mode 100644 index 0000000000..8380bd5bfa --- /dev/null +++ b/docs/node-mixin/lib/aix/main.libsonnet @@ -0,0 +1,65 @@ +local g = import '../g.libsonnet'; +local nodelib = import '../linux/main.libsonnet'; +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local panels = import './panels.libsonnet'; +local targets = import './targets.libsonnet'; + + +// inherit nodelib +nodelib +{ + 
+ new(): + super.new() + + nodelib.withConfigMixin(config) + + + { + local this = self, + local parentGrafana = super.grafana, + local parentPrometheus = super.prometheus, + + grafana+: { + // drop backToFleet link + links+: { + local link = g.dashboard.link, + backToFleet:: {}, + backToOverview: + link.link.new('Back to ' + this.config.dashboardNamePrefix + 'overview', '/d/' + this.grafana.dashboards['nodes-darwin.json'].uid) + + link.link.options.withKeepTime(true), + }, + annotations: { + // keep only reboot annotation + reboot: parentGrafana.annotations.reboot, + }, + // override targets (memory) + targets+: targets.new(this), + // override panels (update description and targets in panels) + panels+: panels.new(this), + + // keep only overview and logs(optionally) dashes + dashboards: + { + 'nodes-aix.json': + parentGrafana.dashboards['nodes.json'] + + g.dashboard.withUid( + (if this.config.uid == 'aix' then std.md5('nodes-aix.json') else this.config.uid + '-overview') + ), + } + + + ( + if this.config.enableLokiLogs + then + { + 'logs-aix.json': parentGrafana.dashboards['logs.json'], + } + else {} + ), + }, + prometheus+: { + recordingRules: {}, + alerts: alerts.new(this, parentPrometheus), + }, + }, + +} diff --git a/docs/node-mixin/lib/aix/panels.libsonnet b/docs/node-mixin/lib/aix/panels.libsonnet new file mode 100644 index 0000000000..8a75182177 --- /dev/null +++ b/docs/node-mixin/lib/aix/panels.libsonnet @@ -0,0 +1,24 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + + // override description and targets + memory+: { + memoryUsageTsBytes+: + g.panel.timeSeries.queryOptions.withTargets([ + t.memory.memoryUsedBytes, + t.memory.memoryTotalBytes, + t.memory.memorySwapUsedBytes, + ]) + + 
commonlib.panels.generic.timeSeries.threshold.stylizeByRegexp('Physical memory'), + }, + + }, +} diff --git a/docs/node-mixin/lib/aix/targets.libsonnet b/docs/node-mixin/lib/aix/targets.libsonnet new file mode 100644 index 0000000000..a0704a0405 --- /dev/null +++ b/docs/node-mixin/lib/aix/targets.libsonnet @@ -0,0 +1,61 @@ +local g = import '../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + // override memory targets (other metrics in macos) + memory+: { + memoryTotalBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_total_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Physical memory'), + + memoryUsedBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_total_bytes{%(queriesSelector)s} - + node_memory_available_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + + memoryUsagePercent: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + ( + node_memory_total_bytes{%(queriesSelector)s} - + node_memory_available_bytes{%(queriesSelector)s} + ) + /avg(node_memory_total_bytes{%(queriesSelector)s}) + ) * 100 + ||| + % variables, + ), + memorySwapTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_total_bytes{%(queriesSelector)s}' % variables + ), + + memorySwapUsedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_used_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Swap used'), + }, + }, +} diff --git a/docs/node-mixin/lib/common.libsonnet b/docs/node-mixin/lib/common.libsonnet deleted file mode 100644 index 66d45fd5c5..0000000000 --- 
a/docs/node-mixin/lib/common.libsonnet +++ /dev/null @@ -1,707 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -{ - - new(config=null, platform=null):: { - - local c = self, - - local labelsToRegexSelector(labels) = - std.join(',', ['%s=~"$%s"' % [label, label] for label in labels]), - local labelsToLegend(labels) = - std.join('/', ['{{%s}}' % [label] for label in labels]), - - local labelsToURLvars(labels, prefix) = - std.join('&', ['var-%s=${%s%s}' % [label, prefix, label] for label in labels]), - // export - labelsToLegend:: labelsToLegend, - labelsToURLvars:: labelsToURLvars, - // add to all queries but not templates - local nodeQuerySelector = labelsToRegexSelector(std.split(config.groupLabels + ',' + config.instanceLabels, ',')), - nodeQuerySelector:: nodeQuerySelector, - - // common templates - local prometheusDatasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - - local chainLabelsfold(prev, label) = { - chain: - if std.length(prev) > 0 - then - [[label] + prev.chain[0]] + prev.chain - else - [[label]], - }, - - local chainLabels(labels) = - [ - { - label: l[0:1][0], - chainSelector: labelsToRegexSelector(std.reverse(l[1:])), - } - for l in std.reverse(std.foldl(chainLabelsfold, labels, init={}).chain) - ], - - local groupTemplates = - [ - template.new( - name=label.label, - label=label.label, - datasource='$datasource', - query='', - current='', - refresh=2, - includeAll=true, - // do not use .*, will get series without 
such label at all when ALL is selected, ignoring nodeExporterSelector results - allValues=null, - multi=true, - sort=1 - ) - { - query: if platform == 'Darwin' then 'label_values(node_uname_info{sysname="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector } - else 'label_values(node_uname_info{sysname!="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector }, - } - for label in chainLabels(std.split(config.groupLabels, ',')) - ], - - local instanceTemplates = - [ - template.new( - label.label, - '$datasource', - 'label_values(node_uname_info{%(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: labelsToRegexSelector(std.split(config.groupLabels, ',')) + ',' + label.chainSelector }, - sort=1, - refresh='time', - label=label.label, - ) - for label in chainLabels(std.split(config.instanceLabels, ',')) - ], - - // return common templates - templates: [prometheusDatasourceTemplate] + groupTemplates + instanceTemplates, - // return templates where instance select is not required - groupDashboardTemplates: [prometheusDatasourceTemplate] + groupTemplates, - - local rebootAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: 'node_boot_time_seconds{%(nodeQuerySelector)s}*1000 > $__from < $__to' % config { nodeQuerySelector: nodeQuerySelector }, - name: 'Reboot', - iconColor: 'light-orange', - tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'Reboot', - useValueForTime: 'on', - }, - local memoryOOMkillerAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: 'increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval])' % config { nodeQuerySelector: nodeQuerySelector }, - name: 'OOMkill', - iconColor: 'light-purple', - 
tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'OOMkill', - }, - local newKernelAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: ||| - changes( - sum by (%(instanceLabels)s) ( - group by (%(instanceLabels)s,release) (node_uname_info{%(nodeQuerySelector)s}) - ) - [$__interval:1m] offset -$__interval) > 1 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - name: 'Kernel update', - iconColor: 'light-blue', - tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'Kernel update', - step: '5m', // must be larger than possible scrape periods - }, - // return common annotations - annotations: [rebootAnnotation, memoryOOMkillerAnnotation, newKernelAnnotation], - - // return common prometheus target (with project defaults) - commonPromTarget( - expr=null, - intervalFactor=1, - datasource='$datasource', - legendFormat=null, - format='timeseries', - instant=null, - hide=null, - interval=null, - ):: - prometheus.target( - expr=expr, - intervalFactor=intervalFactor, - datasource=datasource, - legendFormat=legendFormat, - format=format, - instant=instant, - hide=hide, - interval=interval - ), - // link to fleet panel - links:: { - fleetDash:: grafana.link.dashboards( - asDropdown=false, - title='Back to Node Fleet Overview', - tags=[], - includeVars=false, - keepTime=true, - url='d/' + config.grafanaDashboardIDs['nodes-fleet.json'] - ) { type: 'link', icon: 'dashboard' }, - nodeDash:: grafana.link.dashboards( - asDropdown=false, - title='Back to Node Overview', - tags=[], - includeVars=true, - keepTime=true, - url='d/' + config.grafanaDashboardIDs['nodes.json'] - ) { type: 'link', icon: 'dashboard' }, - otherDashes:: grafana.link.dashboards( - asDropdown=true, - title='Other Node Dashboards', - includeVars=true, - keepTime=true, - tags=(config.dashboardTags), - ), - // used in fleet table - instanceDataLinkForTable:: { - title: 'Drill down to instance ${__data.fields.%s}' % 
std.split(config.instanceLabels, ',')[0], - url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__data.fields.') + '&${__url_time_range}&var-datasource=${datasource}', - }, - // used in ts panels - instanceDataLink:: { - title: 'Drill down to instance ${__field.labels.%s}' % std.split(config.instanceLabels, ',')[0], - url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__field.labels.') + '&${__url_time_range}&var-datasource=${datasource}', - }, - }, - // return common queries that could be used in multiple dashboards - queries:: { - systemLoad1:: 'avg by (%(instanceLabels)s) (node_load1{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - systemLoad5:: 'avg by (%(instanceLabels)s) (node_load5{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - systemLoad15:: 'avg by (%(instanceLabels)s) (node_load15{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - uptime:: 'time() - node_boot_time_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - cpuCount:: 'count by (%(instanceLabels)s) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"})' % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsage:: - ||| - (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s))) - - - avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, mode)(irate(node_cpu_seconds_total{mode='idle',%(nodeQuerySelector)s}[$__rate_interval])))) * 100) - / - count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s)) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsageModes:: - ||| - sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval])) - / on(%(instanceLabels)s) - 
group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval]))) * 100 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsagePerCore:: - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeQuerySelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"}) - ) * 100 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - memoryTotal:: 'node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - memorySwapTotal:: 'node_memory_SwapTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - memoryUsage:: - ||| - 100 - - ( - avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}) / - avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(nodeQuerySelector)s}) - * 100 - ) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - - process_max_fds:: 'process_max_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - process_open_fds:: 'process_open_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - fsSizeTotalRoot:: 'node_filesystem_size_bytes{%(nodeQuerySelector)s, mountpoint="/",fstype!="rootfs"}' % config { nodeQuerySelector: nodeQuerySelector }, - osInfo:: 'node_os_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - nodeInfo:: 'node_uname_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_disk_reads_completed_total:: 'irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - node_disk_writes_completed_total:: 'irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: 
nodeQuerySelector }, - diskReadTime:: 'rate(node_disk_read_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskWriteTime:: 'rate(node_disk_written_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskIoTime:: 'rate(node_disk_io_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskWaitReadTime:: - ||| - irate(node_disk_read_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - diskWaitWriteTime:: - ||| - irate(node_disk_write_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - diskAvgQueueSize:: 'irate(node_disk_io_time_weighted_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskSpaceUsage:: - ||| - sort_desc(1 - - ( - max by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - / - max by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ) != 0 - ) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_avail_bytes:: 'node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_files_free:: 
'node_filesystem_files_free{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_files:: 'node_filesystem_files{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_readonly:: 'node_filesystem_readonly{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_device_error:: 'node_filesystem_device_error{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveBitsPerSec:: 'irate(node_network_receive_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitBitsPerSec:: 'irate(node_network_transmit_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveErrorsPerSec:: 'irate(node_network_receive_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitErrorsPerSec:: 'irate(node_network_transmit_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveDropsPerSec:: 'irate(node_network_receive_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitDropsPerSec:: 'irate(node_network_transmit_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - - systemContextSwitches:: 'irate(node_context_switches_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - systemInterrupts:: 'irate(node_intr_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - - //time - node_timex_estimated_error_seconds:: 
'node_timex_estimated_error_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_timex_offset_seconds:: 'node_timex_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_timex_maxerror_seconds:: 'node_timex_maxerror_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - node_timex_sync_status:: 'node_timex_sync_status{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_time_zone_offset_seconds:: 'node_time_zone_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_systemd_units:: 'node_systemd_units{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - - }, - // share across dashboards - panelsWithTargets:: { - // cpu - idleCPU:: - nodePanels.timeseries.new( - 'CPU Usage', - description='Total CPU utilisation percent.' - ) - .withUnits('percent') - .withStacking('normal') - .withMin(0) - .withMax(100) - .addTarget(c.commonPromTarget( - expr=c.queries.cpuUsagePerCore, - legendFormat='cpu {{cpu}}', - )), - - systemLoad:: - nodePanels.timeseries.new( - 'Load Average', - description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. 
The maximum number is the number of CPU cores for the node.', - ) - .withUnits('short') - .withMin(0) - .withFillOpacity(0) - .addTarget(c.commonPromTarget(c.queries.systemLoad1, legendFormat='1m load average')) - .addTarget(c.commonPromTarget(c.queries.systemLoad5, legendFormat='5m load average')) - .addTarget(c.commonPromTarget(c.queries.systemLoad15, legendFormat='15m load average')) - .addTarget(c.commonPromTarget(c.queries.cpuCount, legendFormat='logical cores')) - .addOverride( - matcher={ - id: 'byName', - options: 'logical cores', - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - ] - ), - cpuStatPanel:: - commonPanels.percentUsageStat.new( - 'CPU Usage', - description='Total CPU utilisation percent.' - ) - .addTarget(c.commonPromTarget( - expr=c.queries.cpuUsage - )), - systemContextSwitches:: - nodePanels.timeseries.new( - 'Context Switches / Interrupts', - description=||| - Context switches occur when the operating system switches from running one process to another. - Interrupts are signals sent to the CPU by external devices to request its attention. - - A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. - ||| - ) - .addTarget(c.commonPromTarget(c.queries.systemContextSwitches, legendFormat='Context Switches')) - .addTarget(c.commonPromTarget(c.queries.systemInterrupts, legendFormat='Interrupts')), - - diskSpaceUsage:: - nodePanels.table.new( - title='Disk Space Usage', - description='Disk utilisation in percent, by mountpoint. 
Some duplication can occur if the same filesystem is mounted in multiple locations.', - ) - .setFieldConfig(unit='decbytes') - //.addThresholdStep(color='light-green', value=null) - .addThresholdStep(color='light-blue', value=null) - .addThresholdStep(color='light-yellow', value=0.8) - .addThresholdStep(color='light-red', value=0.9) - .addTarget(c.commonPromTarget( - ||| - max by (mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='', - instant=true, - format='table' - )) - .addTarget(c.commonPromTarget( - ||| - max by (mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='', - instant=true, - format='table', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mounted on', - }, - properties=[ - { - id: 'custom.width', - value: 260, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Size', - }, - properties=[ - - { - id: 'custom.width', - value: 93, - }, - - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Used', - }, - properties=[ - { - id: 'custom.width', - value: 72, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Available', - }, - properties=[ - { - id: 'custom.width', - value: 88, - }, - ], - ) - - .addOverride( - matcher={ - id: 'byName', - options: 'Used, %', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.displayMode', - value: 'basic', - }, - { - id: 'max', - value: 1, - }, - { - id: 'min', - value: 0, - }, - ] - ) - .sortBy('Mounted on') - + { - transformations+: [ - { - id: 'groupBy', - options: { - fields: { - 'Value #A': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - 'Value #B': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - mountpoint: { - 
aggregations: [], - operation: 'groupby', - }, - }, - }, - }, - { - id: 'merge', - options: {}, - }, - { - id: 'calculateField', - options: { - alias: 'Used', - binary: { - left: 'Value #A (lastNotNull)', - operator: '-', - reducer: 'sum', - right: 'Value #B (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'calculateField', - options: { - alias: 'Used, %', - binary: { - left: 'Used', - operator: '/', - reducer: 'sum', - right: 'Value #A (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'organize', - options: { - excludeByName: {}, - indexByName: {}, - renameByName: { - 'Value #A (lastNotNull)': 'Size', - 'Value #B (lastNotNull)': 'Available', - mountpoint: 'Mounted on', - }, - }, - }, - ], - }, - memoryGraphPanelPrototype:: - nodePanels.timeseries.new( - 'Memory Usage', - description='Memory usage by category, measured in bytes.', - ) - .withMin(0) - .withUnits('bytes'), - memoryGraph:: - if platform == 'Linux' then - self.memoryGraphPanelPrototype - { - description: ||| - Used: The amount of physical memory currently in use by the system. - Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. - Free: The amount of physical memory that is currently not in use. - Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. - Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
- |||, - } - { stack: true } - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_MemTotal_bytes{%(nodeQuerySelector)s} - - - node_memory_MemFree_bytes{%(nodeQuerySelector)s} - - - node_memory_Buffers_bytes{%(nodeQuerySelector)s} - - - node_memory_Cached_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory used' - )) - .addTarget(c.commonPromTarget('node_memory_Buffers_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory buffers')) - .addTarget(c.commonPromTarget('node_memory_Cached_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory cached')) - .addTarget(c.commonPromTarget('node_memory_MemFree_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory free')) - .addTarget(c.commonPromTarget('node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory available')) - .addTarget(c.commonPromTarget('node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory total')) - else if platform == 'Darwin' then - // not useful to stack - self.memoryGraphPanelPrototype { stack: false } - .addTarget(c.commonPromTarget('node_memory_total_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Physical Memory')) - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_internal_bytes{%(nodeQuerySelector)s} - - node_memory_purgeable_bytes{%(nodeQuerySelector)s} + - node_memory_wired_bytes{%(nodeQuerySelector)s} + - node_memory_compressed_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory Used' - )) - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_internal_bytes{%(nodeQuerySelector)s} - - 
node_memory_purgeable_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='App Memory' - )) - .addTarget(c.commonPromTarget('node_memory_wired_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Wired Memory')) - .addTarget(c.commonPromTarget('node_memory_compressed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Compressed')), - - // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. - memoryGaugePanelPrototype:: - commonPanels.percentUsageStat.new( - 'Memory Usage', - description='Total memory utilisation.', - ), - - memoryGauge:: - if platform == 'Linux' then - self.memoryGaugePanelPrototype - - .addTarget(c.commonPromTarget(c.queries.memoryUsage)) - - else if platform == 'Darwin' then - self.memoryGaugePanelPrototype - .addTarget(c.commonPromTarget( - ||| - ( - ( - avg(node_memory_internal_bytes{%(nodeQuerySelector)s}) - - avg(node_memory_purgeable_bytes{%(nodeQuerySelector)s}) + - avg(node_memory_wired_bytes{%(nodeQuerySelector)s}) + - avg(node_memory_compressed_bytes{%(nodeQuerySelector)s}) - ) / - avg(node_memory_total_bytes{%(nodeQuerySelector)s}) - ) - * - 100 - ||| % config { nodeQuerySelector: c.nodeQuerySelector } - )), - diskIO:: - nodePanels.timeseries.new( - 'Disk I/O', - description='Disk read/writes in bytes, and total IO seconds.' 
- ) - .withFillOpacity(0) - .withMin(0) - .addTarget(c.commonPromTarget( - c.queries.diskReadTime, - legendFormat='{{device}} read', - )) - .addTarget(c.commonPromTarget( - c.queries.diskWriteTime, - legendFormat='{{device}} written', - )) - .addTarget(c.commonPromTarget( - c.queries.diskIoTime, - legendFormat='{{device}} io time', - )) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/ read| written/', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/ io time/', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.axisSoftMax', - value: 1, - }, - { - id: 'custom.drawStyle', - value: 'points', - }, - ] - ), - }, - }, - -} diff --git a/docs/node-mixin/lib/g.libsonnet b/docs/node-mixin/lib/g.libsonnet new file mode 100644 index 0000000000..f89dcc0641 --- /dev/null +++ b/docs/node-mixin/lib/g.libsonnet @@ -0,0 +1 @@ +import 'github.com/grafana/grafonnet/gen/grafonnet-v11.0.0/main.libsonnet' diff --git a/docs/node-observ-lib/linux/README.md b/docs/node-mixin/lib/linux/README.md similarity index 91% rename from docs/node-observ-lib/linux/README.md rename to docs/node-mixin/lib/linux/README.md index 3582c0cbb9..4057f85fd6 100644 --- a/docs/node-observ-lib/linux/README.md +++ b/docs/node-mixin/lib/linux/README.md @@ -6,7 +6,7 @@ This jsonnet observability lib can be used to generate observability package for ```sh jb init -jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +jb install https://github.com/grafana/node_exporter/docs/node-mixin/lib/linux ``` ## Examples @@ -17,7 +17,7 @@ You can use observ-lib to fill in monitoring-mixin structure: ```jsonnet // mixin.libsonnet file -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() @@ -45,7 +45,7 @@ local linux = ```jsonnet // mixin.libsonnet file -local nodelib = import 
'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() @@ -82,7 +82,7 @@ local linux = // mixin.libsonnet file local configOverride = import './overrides.libsonnet'; -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() @@ -101,7 +101,7 @@ local linux = ```jsonnet local g = import './g.libsonnet'; // mixin.libsonnet file -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-mixin/lib/linux/alerts/alerts.libsonnet similarity index 88% rename from docs/node-observ-lib/linux/alerts.libsonnet rename to docs/node-mixin/lib/linux/alerts/alerts.libsonnet index 8cc89d8fdf..070a321935 100644 --- a/docs/node-observ-lib/linux/alerts.libsonnet +++ b/docs/node-mixin/lib/linux/alerts/alerts.libsonnet @@ -10,7 +10,7 @@ ( node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d and - predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], 24*60*60) < 0 and node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) @@ -198,7 +198,7 @@ ||| % this.config, annotations: { summary: 'Number of conntrack are getting close to the limit.', - description: '{{ $value | humanizePercentage }} of conntrack entries are used.', + description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.', }, labels: { severity: 'warning', @@ -319,7 +319,7 @@ { 
alert: 'NodeCPUHighUsage', expr: ||| - sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode!~"idle|iowait"}[2m]))) * 100 > %(cpuHighUsageThreshold)d ||| % this.config, 'for': '15m', labels: { @@ -329,6 +329,7 @@ summary: 'High CPU usage.', description: ||| CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + Note that 'iowait' CPU mode is excluded from total utilization. ||| % this.config, }, }, @@ -395,7 +396,7 @@ annotations: { summary: 'Disk IO queue is high.', description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 30 minutes, is currently at {{ printf "%%.2f" $value }}. This symptom might indicate disk saturation. ||| % this.config, }, @@ -411,7 +412,10 @@ }, annotations: { summary: 'Systemd service has entered failed state.', - description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', + description: ||| + Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}. + If this service is not required anymore, remove it or run `systemctl reset-failed {{ $labels.name }}` in order to reset the failed counter. + |||, }, }, { @@ -428,6 +432,53 @@ description: 'Systemd service {{ $labels.name }} has been restarted too many times at {{ $labels.instance }} for the last 15 minutes. 
Please check if service is crash looping.', }, }, + { + alert: 'NodeHasRebooted', + expr: ||| + (time() - node_boot_time_seconds{%(filteringSelector)s}) < 600 + and + (time() - (node_boot_time_seconds{%(filteringSelector)s} offset 10m)) > 600 + ||| % this.config, + labels: + { + severity: 'info', + }, + annotations: + { + summary: 'Node has rebooted.', + description: 'Node {{ $labels.instance }} has rebooted {{ $value | humanize }} seconds ago.', + }, + }, + { + alert: 'NodeProcessesCountIsHigh', + expr: ||| + node_procs_running{%(filteringSelector)s} > %(processLimitThresholdWarning)d + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'There is more than %(processLimitThresholdWarning)s running processes on host.' + % this.config, + description: 'There is {{ $value }} running processes on {{ $labels.instance }}.', + }, + }, + { + alert: 'NodeProcessesCountIsHigh', + expr: ||| + node_procs_running{%(filteringSelector)s} > %(processLimitThresholdCritical)d + ||| % this.config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'There is more than %(processLimitThresholdCritical)s running processes on host.' 
+ % this.config, + description: 'There is {{ $value }} running processes on {{ $labels.instance }}.', + }, + }, ] + if this.config.enableHardware then [{ diff --git a/docs/node-observ-lib/linux/annotations.libsonnet b/docs/node-mixin/lib/linux/annotations.libsonnet similarity index 84% rename from docs/node-observ-lib/linux/annotations.libsonnet rename to docs/node-mixin/lib/linux/annotations.libsonnet index 5b8cb7ff79..205530bd8f 100644 --- a/docs/node-observ-lib/linux/annotations.libsonnet +++ b/docs/node-mixin/lib/linux/annotations.libsonnet @@ -6,14 +6,14 @@ local commonlib = import 'common-lib/common/main.libsonnet'; reboot: commonlib.annotations.reboot.new( title='Reboot', - target=this.grafana.targets.reboot, + target=this.grafana.targets.events.reboot, instanceLabels=std.join(',', this.config.instanceLabels), ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)), memoryOOM: commonlib.annotations.base.new( 'OOMkill', - this.grafana.targets.memoryOOMkiller + this.grafana.targets.events.memoryOOMkiller ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)) + commonlib.annotations.base.withTextFormat('') @@ -24,7 +24,7 @@ local commonlib = import 'common-lib/common/main.libsonnet'; kernelUpdate: commonlib.annotations.base.new( 'Kernel update', - this.grafana.targets.kernelUpdate + this.grafana.targets.events.kernelUpdate ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)) + commonlib.annotations.base.withTextFormat('') @@ -41,25 +41,25 @@ local commonlib = import 'common-lib/common/main.libsonnet'; { serviceFailed: commonlib.annotations.serviceFailed.new( title='Service failed', - target=this.grafana.targets.serviceFailed, + target=this.grafana.targets.events.serviceFailed, ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + 
['level'])), criticalEvents: commonlib.annotations.fatal.new( title='Critical system event', - target=this.grafana.targets.criticalEvents, + target=this.grafana.targets.events.criticalEvents, ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])), sessionOpened: commonlib.annotations.base.new( title='Session opened', - target=this.grafana.targets.sessionOpened, + target=this.grafana.targets.events.sessionOpened, ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) { hide: true }, sessionClosed: commonlib.annotations.base.new( title='Session closed', - target=this.grafana.targets.sessionOpened, + target=this.grafana.targets.events.sessionOpened, ) + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) { hide: true }, diff --git a/docs/node-observ-lib/linux/config.libsonnet b/docs/node-mixin/lib/linux/config.libsonnet similarity index 81% rename from docs/node-observ-lib/linux/config.libsonnet rename to docs/node-mixin/lib/linux/config.libsonnet index 94acca6184..728512298a 100644 --- a/docs/node-observ-lib/linux/config.libsonnet +++ b/docs/node-mixin/lib/linux/config.libsonnet @@ -6,14 +6,13 @@ // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. // 'uid' - UID to prefix all dashboards original uids - filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'), groupLabels: ['job'], instanceLabels: ['instance'], dashboardNamePrefix: 'Node exporter / ', + //uid prefix uid: 'node', - - dashboardTags: [self.uid], + dashboardTags: ['node-exporter-mixin'], // Select the fstype for filesystem-related queries. 
If left // empty, all filesystems are selected. If you have unusual @@ -59,20 +58,26 @@ // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk // usage grows in a way that it is predicted to run out in 4h or 1d // and if the provided thresholds have been reached right now. - // In some cases you'll want to adjust these, e.g. by default Kubernetes + // In some cases you'll want to adjust these, e.g., by default, Kubernetes // runs the image garbage collection when the disk usage reaches 85% // of its available space. In that case, you'll want to reduce the // critical threshold below to something like 14 or 15, otherwise // the alert could fire under normal node usage. + // Additionally, the prediction window for the alert can be configured + // to account for environments where disk usage can fluctuate within + // a short time frame. By extending the prediction window, you can + // reduce false positives caused by temporary spikes, providing a + // more accurate prediction of disk space issues. fsSpaceFillingUpWarningThreshold: 40, fsSpaceFillingUpCriticalThreshold: 20, + fsSpaceFillingUpPredictionWindow: '6h', // Available disk space (%) thresholds on which to trigger the // 'NodeFilesystemAlmostOutOfSpace' alerts. fsSpaceAvailableWarningThreshold: 5, fsSpaceAvailableCriticalThreshold: 3, - // Memory utilzation (%) level on which to trigger the + // Memory utilization (%) level on which to trigger the // 'NodeMemoryHighUtilization' alert. memoryHighUtilizationThreshold: 90, @@ -91,11 +96,25 @@ rateInterval: '5m', - dashboardPeriod: 'now-1h', + dashboardInterval: 'now-1h', dashboardTimezone: 'default', dashboardRefresh: '1m', - // logs lib related + // Opt-in for USE method dashboards + enableUseDashboards: true, + // Opt-in for multi-cluster support (USE method). + showMultiCluster: true, + //used in USE dashboards only. For others, add cluster label to groupLabels var. 
+ clusterLabel: 'cluster', + + // Thresholds for process count + processLimitThresholdWarning: 400, + processLimitThresholdCritical: 600, + + //custom allValue to use for dashboard variables + customAllValue: '.+', + + // loki logs related related enableLokiLogs: false, extraLogLabels: ['transport', 'unit', 'level'], logsVolumeGroupBy: 'level', @@ -106,4 +125,5 @@ | label_format timestamp="{{__timestamp__}}" | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}` |||, + } diff --git a/docs/node-mixin/lib/linux/dashboards.libsonnet b/docs/node-mixin/lib/linux/dashboards.libsonnet new file mode 100644 index 0000000000..b1f749a0bb --- /dev/null +++ b/docs/node-mixin/lib/linux/dashboards.libsonnet @@ -0,0 +1,223 @@ +local g = import '../g.libsonnet'; +local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; +{ + local root = self, + new(this): + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = g.util.string.slugify(this.config.uid); + local vars = this.grafana.variables.main; + local annotations = this.grafana.annotations; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardInterval; + local timezone = this.config.dashboardTimezone; + local panels = this.grafana.panels; + local rows = this.grafana.rows; + local stat = g.panel.stat; + { + 'fleet.json': + local title = prefix + 'fleet overview'; + g.dashboard.new(title) + + g.dashboard.withPanels( + g.util.grid.wrapPanels(rows.linux.fleet.panels, 12, 7) + ) + // hide link to self + + root.applyCommon(vars.multiInstance, uid + '-fleet', tags, links { backToFleet+:: {}, backToOverview+:: {} }, annotations, timezone, refresh, period), + 'nodes.json': + g.dashboard.new(prefix + 'overview') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + 
g.util.grid.wrapPanels( + [ + rows.linux.overview, + rows.linux.cpuOverview, + rows.linux.memoryOverview, + rows.linux.diskOverview, + rows.linux.networkOverview, + ] + + + if this.config.enableHardware then + [ + rows.linux.hardware, + ] else [] + , 6, 2 + ) + ) + ) + // defaults to uid=nodes for backward compatibility with old node-mixins + + root.applyCommon(vars.singleInstance, (if uid == 'node' then std.md5('nodes.json') else uid + '-overview'), tags, links { backToOverview+:: {} }, annotations, timezone, refresh, period), + 'network.json': + g.dashboard.new(prefix + 'network') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.linux.network, + rows.linux.networkSockets + + g.panel.row.withCollapsed(true), + rows.linux.networkNetstat + + g.panel.row.withCollapsed(true), + ], 12, 8 + ) + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-network', tags, links, annotations, timezone, refresh, period), + 'memory.json': + g.dashboard.new(prefix + 'memory') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.linux.memoryOverview, + rows.linux.memoryVmstat, + rows.linux.memoryMemstat, + ], 12, 8 + ) + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-memory', tags, links, annotations, timezone, refresh, period), + + 'system.json': + g.dashboard.new(prefix + 'CPU and system') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.linux.cpuAndSystem, + rows.linux.time, + ], 12, 7 + ) + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-system', tags, links, annotations, timezone, refresh, period), + 'disks.json': + g.dashboard.new(prefix + 'filesystem and disks') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.linux.filesystem, + rows.linux.disk, + ], 12, 8 + ) + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-disk', tags, 
links, annotations, timezone, refresh, period), + } + + + ( + if this.config.enableUseDashboards + then + + { + 'node-rsrc-use.json': + g.dashboard.new(prefix + 'USE method / node') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.use.cpuUseMethod, + rows.use.memoryUseMethod, + rows.use.networkUseMethod, + rows.use.diskUseMethod, + rows.use.filesystemUseMethod, + ], 12, 7 + ) + ) + ) + + root.applyCommon(this.grafana.variables.use.singleInstance, std.md5(uid + '-rsrc-use.json'), tags, links, annotations, timezone, refresh, period), + + 'node-cluster-rsrc-use.json': + g.dashboard.new(prefix + 'USE method / cluster') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.use.cpuUseClusterMethod, + rows.use.memoryUseClusterMethod, + rows.use.networkUseClusterMethod, + rows.use.diskUseClusterMethod, + rows.use.filesystemUseClusterMethod, + ], 12, 7 + ) + ) + ) + + root.applyCommon(this.grafana.variables.useCluster.singleInstance, std.md5(uid + '-cluster-rsrc-use.json'), tags, links, annotations, timezone, refresh, period), + } + + + ( + if this.config.showMultiCluster + then + { + 'node-multicluster-rsrc-use.json': + g.dashboard.new(prefix + 'USE method / multi-cluster') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + rows.use.cpuUseClusterMethodMulti, + rows.use.memoryUseClusterMethodMulti, + rows.use.networkUseClusterMethodMulti, + rows.use.diskUseClusterMethodMulti, + rows.use.filesystemUseClusterMethodMulti, + ], 12, 7 + ) + ) + ) + + root.applyCommon(this.grafana.variables.useCluster.multiInstance, std.md5(uid + '-multicluster-rsrc-use.json'), tags, links, annotations, timezone, refresh, period), + } + else {} + ) + else {} + ) + + + (if this.config.enableLokiLogs + then + { + 'logs.json': + logslib.new( + prefix + 'logs', + datasourceName=vars.datasources.loki.name, + 
datasourceRegex=vars.datasources.loki.regex, + filterSelector=this.config.logsFilteringSelector, + labels=this.config.groupLabels + this.config.instanceLabels + this.config.extraLogLabels, + formatParser=null, + showLogsVolume=this.config.showLogsVolume, + logsVolumeGroupBy=this.config.logsVolumeGroupBy, + extraFilters=this.config.logsExtraFilters + ) + { + dashboards+: + { + logs+: + // reference to self, already generated variables, to keep them, but apply other common data in applyCommon + root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links, annotations=annotations, timezone=timezone, refresh=refresh, period=period), + }, + panels+: + { + // modify log panel + logs+: + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + }, + variables+: { + // add prometheus datasource for annotations processing + toArray+: [ + vars.datasources.prometheus { hide: 2 }, + ], + }, + }.dashboards.logs, + } + else {}), + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): + g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)), +} diff --git a/docs/node-observ-lib/linux/links.libsonnet b/docs/node-mixin/lib/linux/links.libsonnet similarity index 84% rename from docs/node-observ-lib/linux/links.libsonnet rename to docs/node-mixin/lib/linux/links.libsonnet index cc24910a65..83546520c3 100644 --- a/docs/node-observ-lib/linux/links.libsonnet +++ b/docs/node-mixin/lib/linux/links.libsonnet @@ -5,10 +5,10 @@ local commonlib = import 'common-lib/common/main.libsonnet'; { local link = g.dashboard.link, backToFleet: - link.link.new('Back to ' + this.config.dashboardNamePrefix + 
'fleet', '/d/' + this.grafana.dashboards.fleet.uid) + link.link.new('Back to ' + this.config.dashboardNamePrefix + 'fleet', '/d/' + this.grafana.dashboards['fleet.json'].uid) + link.link.options.withKeepTime(true), backToOverview: - link.link.new('Back to ' + this.config.dashboardNamePrefix + 'overview', '/d/' + this.grafana.dashboards.overview.uid) + link.link.new('Back to ' + this.config.dashboardNamePrefix + 'overview', '/d/' + this.grafana.dashboards['nodes.json'].uid) + link.link.options.withKeepTime(true), otherDashboards: link.dashboards.new('All ' + this.config.dashboardNamePrefix + ' dashboards', this.config.dashboardTags) diff --git a/docs/node-mixin/lib/linux/main.libsonnet b/docs/node-mixin/lib/linux/main.libsonnet new file mode 100644 index 0000000000..f6db00f81c --- /dev/null +++ b/docs/node-mixin/lib/linux/main.libsonnet @@ -0,0 +1,58 @@ +local alerts = import './alerts/alerts.libsonnet'; +local annotations = import './annotations.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local datasources = import './datasources.libsonnet'; +local g = import './g.libsonnet'; +local links = import './links.libsonnet'; +local panels = import './panels/main.libsonnet'; +local rows = import './rows/main.libsonnet'; +local rules = import './rules/rules.libsonnet'; +local targets = import './targets/main.libsonnet'; +local variables = import './variables.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(): { + + local this = self, + config: config, + grafana: { + variables: variables.new(this), + targets: targets.new(this), + panels: panels.new(this), + annotations: annotations.new(this), + // common links here used across all dashboards + links: links.new(this), + rows: rows.new(this), + dashboards: dashboards.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: rules.new(this), + }, + + + }, + withConfigMixin(config): { + //backward 
compatible: handle both formats string and array for instanceLabels, groupLabels + local _patch = + ( + if std.objectHasAll(config, 'instanceLabels') + then + { instanceLabels: if std.isString(config.instanceLabels) then std.split(',', config.instanceLabels) else config.instanceLabels } + else {} + ) + + ( + if std.objectHasAll(config, 'groupLabels') + then + { + groupLabels: if std.isString(config.groupLabels) then std.split(',', config.groupLabels) else config.groupLabels, + } + else {} + ), + local groupLabels = if std.isString(config.groupLabels) then std.split(',', config.groupLabels) else config.groupLabels, + config+: config + _patch, + }, +} diff --git a/docs/node-mixin/lib/linux/panels/cpu.libsonnet b/docs/node-mixin/lib/linux/panels/cpu.libsonnet new file mode 100644 index 0000000000..aec88fda6a --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/cpu.libsonnet @@ -0,0 +1,39 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + + cpuCount: commonlib.panels.cpu.stat.count.new(targets=[t.cpu.cpuCount]), + cpuUsageTsPerCore: commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.cpu.cpuUsagePerCore]) + + g.panel.timeSeries.fieldConfig.defaults.custom.withStacking({ mode: 'normal' }), + + cpuUsageTopk: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='CPU usage', + target=t.cpu.cpuUsage, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards['nodes.json'].uid, + ), + cpuUsageStat: commonlib.panels.cpu.stat.usage.new(targets=[t.cpu.cpuUsage]), + cpuUsageByMode: commonlib.panels.cpu.timeSeries.utilizationByMode.new( + 
targets=[t.cpu.cpuUsageByMode], + description=||| + - System: Processes executing in kernel mode. + - User: Normal processes executing in user mode. + - Nice: Niced processes executing in user mode. + - Idle: Waiting for something to happen. + - Iowait: Waiting for I/O to complete. + - Irq: Servicing interrupts. + - Softirq: Servicing softirqs. + - Steal: Time spent in other operating systems when running in a virtualized environment. + ||| + ), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/disk.libsonnet b/docs/node-mixin/lib/linux/panels/disk.libsonnet new file mode 100644 index 0000000000..141ee98f8c --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/disk.libsonnet @@ -0,0 +1,122 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + + diskTotalRoot: + commonlib.panels.disk.stat.total.new( + 'Root mount size', + targets=[t.disk.diskTotalRoot], + description=||| + Total capacity on the primary mount point /. + ||| + ), + diskUsage: + commonlib.panels.disk.table.usage.new( + totalTarget= + ( + t.disk.diskTotal + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + ), + freeTarget= + t.disk.diskFree + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true), + groupLabel='mountpoint' + , + description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.' + ), + diskFreeTs: + commonlib.panels.disk.timeSeries.available.new( + 'Filesystem space availabe', + targets=[ + t.disk.diskFree, + ], + description='Filesystem space utilisation in bytes, by mountpoint.' 
+ ), + diskInodesFree: + commonlib.panels.disk.timeSeries.base.new( + 'Free inodes', + targets=[t.disk.diskInodesFree], + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.' + ) + + g.panel.timeSeries.standardOptions.withUnit('short'), + diskInodesTotal: + commonlib.panels.disk.timeSeries.base.new( + 'Total inodes', + targets=[t.disk.diskInodesTotal], + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', + ) + + g.panel.timeSeries.standardOptions.withUnit('short'), + diskErrorsandRO: + commonlib.panels.disk.timeSeries.base.new( + 'Filesystems with errors / read-only', + targets=[ + t.disk.diskDeviceError, + t.disk.diskReadOnly, + ], + description='', + ) + + g.panel.timeSeries.standardOptions.withMax(1), + fileDescriptors: + commonlib.panels.disk.timeSeries.base.new( + 'File descriptors', + targets=[ + t.disk.processMaxFds, + t.disk.processOpenFds, + ], + description=||| + File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. + The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. 
+ ||| + ), + diskUsagePercentTopK: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk space usage', + target=t.disk.diskUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards['nodes.json'].uid, + ), + diskIOBytesPerSec: commonlib.panels.disk.timeSeries.ioBytesPerSec.new( + targets=[t.disk.diskIOreadBytesPerSec, t.disk.diskIOwriteBytesPerSec, t.disk.diskIOutilization] + ), + diskIOutilPercentTopK: + commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk IO', + target=t.disk.diskIOutilization, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards['nodes.json'].uid, + ), + diskIOps: + commonlib.panels.disk.timeSeries.iops.new( + targets=[ + t.disk.diskIOReads, + t.disk.diskIOWrites, + ] + ), + + diskQueue: + commonlib.panels.disk.timeSeries.ioQueue.new( + 'Disk average queue', + targets= + [ + t.disk.diskAvgQueueSize, + ] + ), + diskIOWaitTime: commonlib.panels.disk.timeSeries.ioWaitTime.new( + targets=[ + t.disk.diskIOWaitReadTime, + t.disk.diskIOWaitWriteTime, + ] + ), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/fleet.libsonnet b/docs/node-mixin/lib/linux/panels/fleet.libsonnet new file mode 100644 index 0000000000..e75096e4e0 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/fleet.libsonnet @@ -0,0 +1,234 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + fleetOverviewTable: + commonlib.panels.generic.table.base.new( + 'Fleet overview', + targets= + [ + t.system.osInfoCombined + + g.query.prometheus.withFormat('table') + + 
g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('OS Info'), + t.system.uptime + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Uptime'), + t.system.systemLoad1 + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Load 1'), + t.cpu.cpuCount + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Cores'), + t.cpu.cpuUsage + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CPU usage'), + t.memory.memoryTotalBytes + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory total'), + t.memory.memoryUsagePercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory usage'), + t.disk.diskTotalRoot + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Root mount size'), + t.disk.diskUsageRootPercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Root mount used'), + t.alerts.alertsCritical + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CRITICAL'), + t.alerts.alertsWarning + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('WARNING'), + ], + description="All nodes' perfomance at a glance." 
+ ) + + g.panel.table.options.withFooter( + value={ + reducer: ['sum'], + show: true, + fields: [ + 'Value #Cores', + 'Value #Load 1', + 'Value #Memory total', + 'Value #Root mount size', + ], + } + ) + + commonlib.panels.system.table.uptime.stylizeByName('Uptime') + + table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Product|^Hostname$') + + fieldOverride.byRegexp.withProperty('custom.filterable', true), + fieldOverride.byName.new('Instance') + + fieldOverride.byName.withProperty('custom.filterable', true) + + fieldOverride.byName.withProperty('links', [ + { + targetBlank: false, + title: 'Drill down to ${__field.name} ${__value.text}', + url: 'd/%s?var-%s=${__data.fields.%s}&${__url_time_range}&${datasource:queryparam}' % [this.grafana.dashboards['nodes.json'].uid, instanceLabel, instanceLabel], + }, + ]), + fieldOverride.byRegexp.new(std.join('|', std.map(utils.toSentenceCase, this.config.groupLabels))) + + fieldOverride.byRegexp.withProperty('custom.filterable', true) + + fieldOverride.byRegexp.withProperty('links', [ + { + targetBlank: false, + title: 'Filter by ${__field.name}', + url: 'd/%s?var-${__field.name}=${__value.text}&${__url_time_range}&${datasource:queryparam}' % [this.grafana.dashboards['fleet.json'].uid], + }, + ]), + fieldOverride.byName.new('Cores') + + fieldOverride.byName.withProperty('custom.width', 120), + fieldOverride.byName.new('CPU usage') + + fieldOverride.byName.withProperty('custom.width', 120) + + fieldOverride.byName.withProperty( + 'custom.cellOptions', { + type: 'gauge', + mode: 'basic', + valueDisplayMode: 'text', + } + ) + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Memory total') + + fieldOverride.byName.withProperty('custom.width', 120) + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Memory usage') + + 
fieldOverride.byName.withProperty('custom.width', 120) + + fieldOverride.byName.withProperty( + 'custom.cellOptions', { + type: 'gauge', + mode: 'basic', + valueDisplayMode: 'text', + } + ) + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Root mount size') + + fieldOverride.byName.withProperty('custom.width', 120) + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Root mount used') + + fieldOverride.byName.withProperty('custom.width', 120) + + fieldOverride.byName.withProperty( + 'custom.cellOptions', { + type: 'gauge', + mode: 'basic', + valueDisplayMode: 'text', + } + ) + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('percent') + ) + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + ]) + + table.queryOptions.withTransformationsMixin( + [ + { + id: 'joinByField', + options: { + byField: instanceLabel, + mode: 'outer', + }, + }, + { + id: 'filterFieldsByName', + options: { + include: { + //' 1' - would only match first occurence of group label, so no duplicates + pattern: instanceLabel + '|' + + + std.join( + '|', + std.map( + function(x) '%s 1' % x, this.config.instanceLabels + ) + ) + + '|' + + std.join( + '|', + std.map( + function(x) '%s 1' % x, this.config.groupLabels + ) + ) + + '|product|^hostname$|^nodename$|^pretty_name$|Value.+', + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Value #OS Info': true, + }, + indexByName: + { + [instanceLabel]: 0, + nodename: 1, + hostname: 1, + pretty_name: 2, + product: 2, + } + + + // group labels are named as 'job 1' and so on. 
+ { + [label]: 3 + for label in this.config.groupLabels + }, + renameByName: + { + [label + ' 1']: utils.toSentenceCase(label) + for label in this.config.instanceLabels + } + { + [instanceLabel]: utils.toSentenceCase(instanceLabel), + product: 'OS', // windows + pretty_name: 'OS', // linux + hostname: 'Hostname', // windows + nodename: 'Hostname', // Linux + } + + + // group labels are named as 'job 1' and so on. + { + [label + ' 1']: utils.toSentenceCase(label) + for label in this.config.groupLabels + }, + + }, + }, + { + id: 'renameByRegex', + options: { + regex: 'Value #(.*)', + renamePattern: '$1', + }, + }, + ] + ), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/hardware.libsonnet b/docs/node-mixin/lib/linux/panels/hardware.libsonnet new file mode 100644 index 0000000000..08a4c4b35a --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/hardware.libsonnet @@ -0,0 +1,18 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + hardwareTemperature: + commonlib.panels.hardware.timeSeries.temperature.new( + 'Temperature', + targets=[t.hardware.hardwareTemperature] + ), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/main.libsonnet b/docs/node-mixin/lib/linux/panels/main.libsonnet new file mode 100644 index 0000000000..767dc82e08 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/main.libsonnet @@ -0,0 +1,14 @@ +{ + new(config):: { + cpu: (import './cpu.libsonnet').new(config), + disk: (import './disk.libsonnet').new(config), + fleet: (import './fleet.libsonnet').new(config), + hardware: (import './hardware.libsonnet').new(config), + memory: (import './memory.libsonnet').new(config), + network: 
(import './network.libsonnet').new(config), + system: (import './system.libsonnet').new(config), + use: (import './use.libsonnet').new(config), + useCluster: (import './useCluster.libsonnet').new(config), + useClusterMulti: (import './useClusterMulti.libsonnet').new(config), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/memory.libsonnet b/docs/node-mixin/lib/linux/panels/memory.libsonnet new file mode 100644 index 0000000000..8560a73500 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/memory.libsonnet @@ -0,0 +1,279 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + + memoryTotalBytes: commonlib.panels.memory.stat.total.new(targets=[t.memory.memoryTotalBytes]), + memorySwapTotalBytes: + commonlib.panels.memory.stat.total.new( + 'Total swap', + targets=[t.memory.memorySwapTotal], + description=||| + Total swap available. + + Swap is a space on a storage device (usually a dedicated swap partition or a swap file) + used as virtual memory when the physical RAM (random-access memory) is fully utilized. + Swap space helps prevent memory-related performance issues by temporarily transferring less-used data from RAM to disk, + freeing up physical memory for active processes and applications. 
+ ||| + ), + memoryUsageStatPercent: commonlib.panels.memory.stat.usage.new(targets=[t.memory.memoryUsagePercent]), + memotyUsageTopKPercent: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Memory usage', + target=t.memory.memoryUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards['nodes.json'].uid, + ), + memoryUsageTsBytes: + commonlib.panels.memory.timeSeries.usageBytes.new( + targets=[ + t.memory.memoryUsedBytes, + t.memory.memoryCachedBytes, + t.memory.memoryAvailableBytes, + t.memory.memoryBuffersBytes, + t.memory.memoryFreeBytes, + t.memory.memoryTotalBytes, + ], + description= + ||| + - Used: The amount of physical memory currently in use by the system. + - Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. + - Free: The amount of physical memory that is currently not in use. + - Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. + - Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
+ ||| + ) + + g.panel.timeSeries.standardOptions.withOverridesMixin( + { + __systemRef: 'hideSeriesFrom', + matcher: { + id: 'byNames', + options: { + mode: 'exclude', + names: [ + t.memory.memoryTotalBytes.legendFormat, + t.memory.memoryUsedBytes.legendFormat, + ], + prefix: 'All except:', + readOnly: true, + }, + }, + properties: [ + { + id: 'custom.hideFrom', + value: { + viz: true, + legend: false, + tooltip: false, + }, + }, + ], + } + ), + + memoryPagesInOut: + commonlib.panels.memory.timeSeries.base.new( + 'Memory pages in / out', + targets=[t.memory.memoryPagesIn, t.memory.memoryPagesOut], + description=||| + Page-In - Return of pages to physical memory. This is a common and normal event. + + Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. + When the kernel detects low memory, it attempts to free memory by paging out. + While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. + Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. + ||| + ) + + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), + + memoryPagesSwapInOut: + commonlib.panels.memory.timeSeries.base.new( + 'Memory pages swapping in / out', + targets=[t.memory.memoryPagesSwapIn, t.memory.memoryPagesSwapOut], + description=||| + Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. + Nonetheless, it is a preferable option to crashing or killing off processes. + + The process of writing pages out to disk to free memory is known as swapping-out. + If a page fault occurs because the page is on disk, in the swap area, rather than in memory, + the kernel will read the page back in from the disk to satisfy the page fault. + This is known as swapping-in. 
+ ||| + ) + + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), + + memoryPagesFaults: + commonlib.panels.memory.timeSeries.base.new( + 'Memory page faults', + targets=[t.memory.memoryPageMajorFaults, t.memory.memoryPageMinorFaults], + description=||| + A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, + requiring a mapping to be added to the process's virtual address space. + + The page contents may also need to be loaded from a backing store such as a disk. + While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. + Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. + |||, + ), + + memoryOOMkiller: + commonlib.panels.memory.timeSeries.base.new( + 'OOM Killer', + targets=[t.events.memoryOOMkiller], + description=||| + Out Of Memory killer is a process used by the Linux kernel when the system is running critically low on memory. + + This can happen when the kernel has allocated more memory than is available for its processes. + ||| + ), + + memoryActiveInactive: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory active / inactive', + targets=[t.memory.memoryActiveBytes, t.memory.memoryInactiveBytes], + description=||| + - Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. + - Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. 
+ |||, + ), + + memoryActiveInactiveDetail: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory active / inactive details', + targets=[t.memory.memoryInactiveFile, t.memory.memoryInactiveAnon, t.memory.memoryActiveFile, t.memory.memoryActiveAnon], + description=||| + - Inactive_file: File-backed memory on inactive LRU list. + - Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). + - Active_file: File-backed memory on active LRU list. + - Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. + |||, + ), + + memoryCommited: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory commited', + targets=[t.memory.memoryCommitedAs, t.memory.memoryCommitedLimit], + description=||| + - Committed_AS - Amount of memory presently allocated on the system. + - CommitLimit - Amount of memory currently available to be allocated on the system. + ||| + ), + + memorySharedAndMapped: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory shared and mapped', + targets=[t.memory.memoryMappedBytes, t.memory.memoryShmemBytes, t.memory.memoryShmemPmdMappedBytes, t.memory.memoryShmemHugePagesBytes], + description=||| + - Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. + - Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. + - ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. + - ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. + ||| + ), + memoryWriteAndDirty: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory writeback and dirty', + targets=[t.memory.memoryWriteback, t.memory.memoryWritebackTmp, t.memory.memoryDirty], + description=||| + - Writeback: This refers to the memory that is currently being actively written back to the disk. 
+ - WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. + - Dirty: This type of memory is waiting to be written back to the disk. + ||| + ), + memoryVmalloc: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory Vmalloc', + targets=[t.memory.memoryVmallocChunk, t.memory.memoryVmallocTotal, t.memory.memoryVmallocUsed], + description=||| + Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. + + - VmallocChunk: Largest contiguous block of vmalloc area which is free. + - VmallocTotal: Total size of vmalloc memory area. + - VmallocUsed: Amount of vmalloc area which is used. + ||| + ), + memorySlab: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory slab', + targets=[t.memory.memorySlabSUnreclaim, t.memory.memorySlabSReclaimable], + description=||| + Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. + + The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. + + - SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. + - SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
+ ||| + ), + memoryAnonymous: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory anonymous', + targets=[t.memory.memoryAnonHugePages, t.memory.memoryAnonPages], + description=||| + Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. + + This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. + + Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, + and from Memory Shared with other processes, + which refers to memory regions that can be accessed and modified by multiple processes. + + - AnonHugePages: Memory in anonymous huge pages. + - AnonPages: Memory in user pages not backed by files. + ||| + ), + + memoryHugePagesCounter: + commonlib.panels.memory.timeSeries.base.new( + 'Memory HugePages counter', + targets=[t.memory.memoryHugePages_Free, t.memory.memoryHugePages_Rsvd, t.memory.memoryHugePages_Surp], + description= + ||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. + + - HugePages_Free: Huge pages in the pool that are not yet allocated. + - HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. + - HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. 
+ ||| + ), + memoryHugePagesSize: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory HugePages size', + targets=[t.memory.memoryHugePagesTotalSize, t.memory.memoryHugePagesSize], + + description=||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. + ||| + ), + + memoryDirectMap: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory direct map', + targets=[t.memory.memoryDirectMap1G, t.memory.memoryDirectMap2M, t.memory.memoryDirectMap4k], + + description=||| + Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. + ||| + ), + memoryBounce: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory bounce', + targets=[t.memory.memoryBounce], + description=||| + Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. + + To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. + + - Bounce: Memory used for block device bounce buffers. 
+ ||| + ), + }, +} diff --git a/docs/node-mixin/lib/linux/panels/network.libsonnet b/docs/node-mixin/lib/linux/panels/network.libsonnet new file mode 100644 index 0000000000..3c884eab9c --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/network.libsonnet @@ -0,0 +1,490 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + + networkErrorsAndDroppedPerSec: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors and dropped packets', + targets=std.map( + function(t) t + { + expr: t.expr + '>0', + }, + [ + t.network.networkOutErrorsPerSec, + t.network.networkInErrorsPerSec, + t.network.networkOutDroppedPerSec, + t.network.networkInDroppedPerSec, + ] + ), + description=||| + **Network errors**: + + Network errors refer to issues that occur during the transmission of data across a network. + + These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. + + Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. + + **Dropped packets**: + + Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. + + Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. + + Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. 
+ ||| + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkErrorsAndDroppedPerSecTopK: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors and dropped packets', + targets=std.map( + function(t) t + { + expr: 'topk(25, ' + t.expr + ')>0', + legendFormat: '{{' + this.config.instanceLabels[0] + '}}: ' + std.get(t, 'legendFormat', '{{ nic }}'), + }, + [ + t.network.networkOutErrorsPerSec, + t.network.networkInErrorsPerSec, + t.network.networkOutDroppedPerSec, + t.network.networkInDroppedPerSec, + ] + ), + description=||| + Top 25. + + **Network errors**: + + Network errors refer to issues that occur during the transmission of data across a network. + + These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. + + Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. + + **Dropped packets**: + + Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. + + Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. + + Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. 
+ ||| + ) + + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('points') + + g.panel.timeSeries.fieldConfig.defaults.custom.withPointSize(5), + + networkErrorsPerSec: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors', + targets=[t.network.networkInErrorsPerSec, t.network.networkOutErrorsPerSec] + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkDroppedPerSec: + commonlib.panels.network.timeSeries.dropped.new( + targets=[t.network.networkInDroppedPerSec, t.network.networkOutDroppedPerSec] + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkUsagePerSec: + commonlib.panels.network.timeSeries.traffic.new( + targets=[t.network.networkInBitPerSecFiltered, t.network.networkOutBitPerSecFiltered] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkPacketsPerSec: + commonlib.panels.network.timeSeries.packets.new( + targets=[t.network.networkInPacketsPerSec, t.network.networkOutPacketsPerSec] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkMulticastPerSec: + commonlib.panels.network.timeSeries.multicast.new( + 'Multicast packets', + targets=[t.network.networkInMulticastPacketsPerSec, t.network.networkOutMulticastPacketsPerSec], + description='Multicast packets received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + + networkFifo: + commonlib.panels.network.timeSeries.packets.new( + 'Network FIFO', + targets=[t.network.networkFifoInPerSec, t.network.networkFifoOutPerSec], + description=||| + Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. + It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. + Packets are stored in the FIFO buffer until they can be transmitted or processed further. 
+ ||| + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkCompressedPerSec: + commonlib.panels.network.timeSeries.packets.new( + 'Compressed packets', + targets=[t.network.networkCompressedInPerSec, t.network.networkCompressedOutPerSec], + description=||| + - Compressed received: + Number of correctly received compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). + + - Compressed transmitted: + Number of transmitted compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). + + https://docs.kernel.org/networking/statistics.html + |||, + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkNFConntrack: + commonlib.panels.generic.timeSeries.base.new( + 'NF conntrack', + targets=[t.network.networkNFConntrackEntries, t.network.networkNFConntrackLimits], + description=||| + NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, + enforce firewall rules, perform NAT, and manage network address/port translation. + ||| + ) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0), + + networkSoftnet: + commonlib.panels.network.timeSeries.packets.new( + 'Softnet packets', + targets=[t.network.networkSoftnetProcessedPerSec, t.network.networkSoftnetDroppedPerSec], + description=||| + Softnet packets are received by the network and queued for processing by the kernel's networking stack. + Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. 
+ ||| + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets('/dropped/') + + g.panel.timeSeries.fieldConfig.defaults.custom.withAxisLabel('Dropped(-) | Processed(+)'), + networkSoftnetSqueeze: + commonlib.panels.network.timeSeries.packets.new( + 'Softnet out of quota', + targets=[t.network.networkSoftnetSqueezedPerSec], + description=||| + "Softnet out of quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity. + This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred. + ||| + ), + networkOperStatus: + commonlib.panels.network.statusHistory.interfaceStatus.new( + 'Network interfaces carrier status', + targets=[t.network.networkCarrier], + description='Network interfaces carrier status', + ), + networkOverviewTable: + commonlib.panels.generic.table.base.new( + 'Network interfaces overview', + targets= + [ + t.network.networkUp + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Up'), + t.network.networkCarrier + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Carrier'), + t.network.networkOutBitPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(false) + + g.query.prometheus.withRefId('Transmitted'), + t.network.networkInBitPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(false) + + g.query.prometheus.withRefId('Received'), + t.network.networkArpEntries + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('ARP entries'), + t.network.networkMtuBytes + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + 
g.query.prometheus.withRefId('MTU'), + t.network.networkSpeedBitsPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Speed'), + t.network.networkTransmitQueueLength + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Queue length'), + t.network.networkInfo + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Info'), + ], + description='Network interfaces overview.' + ) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byName.new('Interface') + + fieldOverride.byName.withProperty('custom.filterable', true), + ]) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byName.new('Speed') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bps') + ), + ]) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Transmitted|Received') + + fieldOverride.byRegexp.withProperty( + 'custom.cellOptions', { + type: 'gauge', + mode: 'gradient', + valueDisplayMode: 'text', + } + ) + + fieldOverride.byRegexp.withPropertiesFromOptions( + table.standardOptions.withUnit('bps') + + table.standardOptions.color.withMode('continuous-BlYlRd') + + table.standardOptions.withMax(1000 * 1000 * 100) + ), + ]) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Carrier|Up') + + fieldOverride.byRegexp.withProperty( + 'custom.cellOptions', { + type: 'color-text', + } + ) + + fieldOverride.byRegexp.withPropertiesFromOptions( + table.standardOptions.withMappings( + { + type: 'value', + options: { + '0': { + text: 'Down', + color: 'light-red', + index: 0, + }, + '1': { + text: 'Up', + color: 'light-green', + index: 1, + }, + }, + } + ), + ), + ]) + + table.queryOptions.withTransformationsMixin( + [ + { + id: 'joinByField', + options: { + byField: 'device', + mode: 
'outer', + }, + }, + { + id: 'filterFieldsByName', + options: { + include: { + pattern: 'device|duplex|address|Value.+', + }, + }, + }, + { + id: 'renameByRegex', + options: { + regex: '(Value) #(.*)', + renamePattern: '$2', + }, + }, + { + id: 'organize', + options: { + excludeByName: { + Info: true, + }, + renameByName: + { + device: 'Interface', + duplex: 'Duplex', + address: 'Address', + }, + }, + }, + { + id: 'organize', + options: { + indexByName: { + Interface: 0, + Up: 1, + Carrier: 2, + Received: 3, + Transmitted: 4, + }, + }, + }, + ] + ), + networkSockstatAll: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets in use', + targets=[t.network.networkSocketsUsed], + description='Number of sockets currently in use.', + ), + + networkSockstatTCP: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets TCP', + targets=[t.network.networkSocketsTCPAllocated, t.network.networkSocketsTCPIPv4, t.network.networkSocketsTCPIPv6, t.network.networkSocketsTCPOrphans, t.network.networkSocketsTCPTimeWait], + description=||| + TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. + + Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. + ||| + ), + networkSockstatUDP: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets UDP', + targets=[t.network.networkSocketsUDPLiteInUse, t.network.networkSocketsUDPInUse, t.network.networkSocketsUDPLiteIPv6InUse, t.network.networkSocketsUDPIPv6InUse], + description=||| + UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. + Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. 
+ ||| + ), + networkSockstatOther: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets other', + targets=[t.network.networkSocketsFragInUse, t.network.networkSocketsFragIPv6InUse, t.network.networkSocketsRawInUse, t.network.networkSocketsIPv6RawInUse], + description=||| + FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. + + RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. + ||| + ), + networkSockstatMemory: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.generic.timeSeries.base.new( + title='Sockets memory', + targets=[t.network.networkSocketsTCPMemoryPages, t.network.networkSocketsUDPMemoryPages, t.network.networkSocketsTCPMemoryBytes, t.network.networkSocketsUDPMemoryBytes], + description=||| + Memory currently in use for sockets. + |||, + ) + + panel.queryOptions.withMaxDataPoints(100) + + panel.fieldConfig.defaults.custom.withAxisLabel('Pages') + + panel.standardOptions.withOverridesMixin( + panel.standardOptions.override.byRegexp.new('/bytes/') + + override.byType.withPropertiesFromOptions( + panel.standardOptions.withDecimals(2) + + panel.standardOptions.withUnit('bytes') + + panel.fieldConfig.defaults.custom.withDrawStyle('bars') + + panel.fieldConfig.defaults.custom.withStacking(value={ mode: 'normal', group: 'A' }) + + panel.fieldConfig.defaults.custom.withAxisLabel('Bytes') + ) + ), + + networkNetstatIP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'IP octets', + targets=[t.network.networkNetstatIPInOctetsPerSec, t.network.networkNetstatIPOutOctetsPerSec], + description='Rate of IP octets received and transmitted.' 
+ ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('oct/s'), + + networkNetstatTCP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'TCP segments', + targets=[t.network.networkNetstatTCPInSegmentsPerSec, t.network.networkNetstatTCPOutSegmentsPerSec], + description='Rate of TCP segments received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('seg/s'), + + networkNetstatTCPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='TCP errors rate', + targets=[ + t.network.networkNetstatTCPOverflowPerSec, + t.network.networkNetstatTCPListenDropsPerSec, + t.network.networkNetstatTCPRetransPerSec, + t.network.networkNetstatTCPRetransSegPerSec, + t.network.networkNetstatTCPInWithErrorsPerSec, + t.network.networkNetstatTCPOutWithRstPerSec, + ], + description='Rate of TCP errors.' + ) + + panel.standardOptions.withUnit('err/s'), + + networkNetstatUDP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'UDP datagrams', + targets=[ + t.network.networkNetstatIPInUDPPerSec, + t.network.networkNetstatIPOutUDPPerSec, + t.network.networkNetstatIPInUDP6PerSec, + t.network.networkNetstatIPOutUDP6PerSec, + ], + description='Rate of UDP datagrams received and transmitted.' 
+ ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('dat/s'), + + networkNetstatUDPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='UDP errors rate', + targets=[ + t.network.networkNetstatUDPLiteInErrorsPerSec, + t.network.networkNetstatUDPInErrorsPerSec, + t.network.networkNetstatUDP6InErrorsPerSec, + t.network.networkNetstatUDPNoPortsPerSec, + t.network.networkNetstatUDP6NoPortsPerSec, + t.network.networkNetstatUDPRcvBufErrsPerSec, + t.network.networkNetstatUDP6RcvBufErrsPerSec, + t.network.networkNetstatUDPSndBufErrsPerSec, + t.network.networkNetstatUDP6SndBufErrsPerSec, + ], + description='Rate of UDP errors.' + ) + + panel.standardOptions.withUnit('err/s'), + + networkNetstatICMP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'ICMP messages', + targets=[ + t.network.networkNetstatICMPInPerSec, + t.network.networkNetstatICMPOutPerSec, + t.network.networkNetstatICMP6InPerSec, + t.network.networkNetstatICMP6OutPerSec, + ], + description="Rate of ICMP messages, like 'ping', received and transmitted." + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('msg/s'), + + networkNetstatICMPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='ICMP errors rate', + targets=[ + t.network.networkNetstatICMPInErrorsPerSec, + t.network.networkNetstatICM6PInErrorsPerSec, + ], + description='Rate of ICMP messages received and transmitted with errors.' 
+ ) + + panel.standardOptions.withUnit('err/s'), + + }, +} diff --git a/docs/node-mixin/lib/linux/panels/system.libsonnet b/docs/node-mixin/lib/linux/panels/system.libsonnet new file mode 100644 index 0000000000..b1c6464dce --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/system.libsonnet @@ -0,0 +1,82 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0], + + uptime: commonlib.panels.system.stat.uptime.new(targets=[t.system.uptime]), + + systemLoad: + commonlib.panels.system.timeSeries.loadAverage.new( + loadTargets=[t.system.systemLoad1, t.system.systemLoad5, t.system.systemLoad15], + cpuCountTarget=t.cpu.cpuCount, + ), + + systemContextSwitchesAndInterrupts: + commonlib.panels.generic.timeSeries.base.new( + 'Context switches/Interrupts', + targets=[ + t.system.systemContextSwitches, + t.system.systemInterrupts, + ], + description=||| + Context switches occur when the operating system switches from running one process to another. Interrupts are signals sent to the CPU by external devices to request its attention. + + A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. + ||| + ), + + timeNtpStatus: + commonlib.panels.system.statusHistory.ntp.new( + 'NTP status', + targets=[t.system.timeNtpStatus], + description='Status of time synchronization.' 
+ ) + + g.panel.timeSeries.standardOptions.withNoValue('No data.') + + g.panel.statusHistory.options.withLegend(false), + timeSyncDrift: + commonlib.panels.generic.timeSeries.base.new( + 'Time synchronized drift', + targets=[ + t.system.timeEstimatedError, + t.system.timeOffset, + t.system.timeMaxError, + ], + description=||| + Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. + ||| + ) + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.standardOptions.withNoValue('No data.'), + osInfo: commonlib.panels.generic.stat.info.new( + 'OS', + targets=[t.system.osInfo], + description='Operating system' + ) + { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, + kernelVersion: + commonlib.panels.generic.stat.info.new('Kernel version', + targets=[t.system.unameInfo], + description='Kernel version of linux host.') + { options+: { reduceOptions+: { fields: '/^release$/' } } }, + osTimezone: + commonlib.panels.generic.stat.info.new( + 'Timezone', targets=[t.system.osTimezone], description='Current system timezone.' + ) + { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, + hostname: + commonlib.panels.generic.stat.info.new( + 'Hostname', + targets=[t.system.unameInfo], + description="System's hostname." 
+ ) + { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, + + }, +} diff --git a/docs/node-mixin/lib/linux/panels/use.libsonnet b/docs/node-mixin/lib/linux/panels/use.libsonnet new file mode 100644 index 0000000000..4f7a7511a4 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/use.libsonnet @@ -0,0 +1,88 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + + + //for USE + cpuUtilization: + commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.use.cpuUtilization]) + + g.panel.timeSeries.panelOptions.withTitle('CPU utilization') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + cpuSaturation: + commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.use.cpuSaturation]) + + g.panel.timeSeries.panelOptions.withTitle('CPU saturation (Load1 per CPU)') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.panelOptions.withDescription( + ||| + System load average over the last minute. A measurement of how many processes are waiting for CPU cycles. The value is as a percent compared to the number of CPU cores for the node. 
+ ||| + ) + + { title: 'CPU saturation (Load 1 per CPU)' } + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + + memoryUtilization: + commonlib.panels.memory.timeSeries.usagePercent.new(targets=[t.use.memoryUtilization]) + + g.panel.timeSeries.panelOptions.withTitle('Memory utilization') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + memorySaturation: + commonlib.panels.memory.timeSeries.base.new( + 'Memory saturation (Major page faults)', + targets=[t.use.memorySaturation], + ) + + g.panel.timeSeries.panelOptions.withDescription(this.grafana.panels.memory.memoryPagesFaults.description) + + g.panel.timeSeries.panelOptions.withTitle('Memory saturation (Major page faults)') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + + networkUtilization: + this.grafana.panels.network.networkUsagePerSec + + g.panel.timeSeries.panelOptions.withTitle('Network utilization (Bytes receive/transmit)') + + g.panel.timeSeries.queryOptions.withTargets([t.use.networkUtilizationReceive, t.use.networkUtilizationTransmit]) + + commonlib.panels.network.timeSeries.base.withNegateOutPackets('/Transmit/') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + networkSaturation: + this.grafana.panels.network.networkDroppedPerSec + + g.panel.timeSeries.panelOptions.withTitle('Network saturation (Drops receive/transmit)') + + g.panel.timeSeries.queryOptions.withTargets([t.use.networkSaturationReceive, t.use.networkSaturationTransmit]) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard') + + g.panel.timeSeries.options.legend.withShowLegend(false), + + + diskUtilization: + commonlib.panels.generic.timeSeries.base.new( + 'Disk IO utilization', 
targets=[t.use.diskUtilization], description='Disk total IO seconds' + ) + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.standardOptions.withUnit('percent') + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + + diskSaturation: + commonlib.panels.generic.timeSeries.base.new( + 'Disk IO saturation', targets=[t.use.diskSaturation], description='Disk saturation (weighted seconds spent, 1 second rate)' + ) + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.standardOptions.withUnit('percent') + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard'), + + filesystemUtilization: + this.grafana.panels.disk.diskFreeTs + + g.panel.timeSeries.panelOptions.withTitle('Filesytem utilization') + + g.panel.timeSeries.options.legend.withShowLegend(false) + + g.panel.timeSeries.queryOptions.withTargets([t.use.filesystemUtilization]) + + g.panel.timeSeries.standardOptions.withUnit('percent') + + g.panel.timeSeries.standardOptions.withMax(100) + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('standard') + + g.panel.timeSeries.panelOptions.withDescription('Total disk utilization percent.'), + + }, +} diff --git a/docs/node-mixin/lib/linux/panels/useCluster.libsonnet b/docs/node-mixin/lib/linux/panels/useCluster.libsonnet new file mode 100644 index 0000000000..9ae1a79e75 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/useCluster.libsonnet @@ -0,0 +1,49 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + local instancePanels = this.grafana.panels.use, + //for USE + cpuUtilization: + instancePanels.cpuUtilization + + 
g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuUtilization]), + cpuSaturation: + instancePanels.cpuSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuSaturation]), + + memoryUtilization: + instancePanels.memoryUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.memoryUtilization]), + + memorySaturation: + instancePanels.memorySaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.memorySaturation]), + + networkUtilization: + instancePanels.networkUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.networkUtilizationReceive, t.useCluster.networkUtilizationTransmit]), + networkSaturation: + instancePanels.networkSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.networkSaturationReceive, t.useCluster.networkSaturationTransmit]), + + + diskUtilization: + instancePanels.diskUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.diskUtilization]), + + diskSaturation: + instancePanels.diskSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.diskSaturation]), + + filesystemUtilization: + instancePanels.filesystemUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.filesystemUtilization]), + + }, +} diff --git a/docs/node-mixin/lib/linux/panels/useClusterMulti.libsonnet b/docs/node-mixin/lib/linux/panels/useClusterMulti.libsonnet new file mode 100644 index 0000000000..02232237e9 --- /dev/null +++ b/docs/node-mixin/lib/linux/panels/useClusterMulti.libsonnet @@ -0,0 +1,46 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instancePanels = this.grafana.panels.use, + //for USE + cpuUtilization: instancePanels.cpuUtilization + + 
g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.cpuUtilization]), + cpuSaturation: instancePanels.cpuSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.cpuSaturation]), + + memoryUtilization: + instancePanels.memoryUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.memoryUtilization]), + + memorySaturation: + instancePanels.memorySaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.memorySaturation]), + + networkUtilization: + instancePanels.networkUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.networkUtilizationReceive, t.useClusterMulti.networkUtilizationTransmit]), + networkSaturation: + instancePanels.networkSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.networkSaturationReceive, t.useClusterMulti.networkSaturationTransmit]), + + + diskUtilization: + instancePanels.diskUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.diskUtilization]), + + diskSaturation: + instancePanels.diskSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.diskSaturation]), + + filesystemUtilization: + instancePanels.filesystemUtilization + + g.panel.timeSeries.queryOptions.withTargets([t.useClusterMulti.filesystemUtilization]), + + }, +} diff --git a/docs/node-mixin/lib/linux/rows/linux.libsonnet b/docs/node-mixin/lib/linux/rows/linux.libsonnet new file mode 100644 index 0000000000..17d357b708 --- /dev/null +++ b/docs/node-mixin/lib/linux/rows/linux.libsonnet @@ -0,0 +1,286 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +{ + new(this): + { + local panels = this.grafana.panels, + + fleet: + g.panel.row.new('Fleet overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + // g.panel.row.new("Overview"), + panels.fleet.fleetOverviewTable { gridPos+: { w: 24, h: 16 } }, + panels.cpu.cpuUsageTopk { 
gridPos+: { w: 24 } }, + panels.memory.memotyUsageTopKPercent { gridPos+: { w: 24 } }, + panels.disk.diskIOutilPercentTopK { gridPos+: { w: 12 } }, + panels.disk.diskUsagePercentTopK { gridPos+: { w: 12 } }, + panels.network.networkErrorsAndDroppedPerSecTopK { gridPos+: { w: 24 } }, + ] + ), + overview: + g.panel.row.new('Overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + panels.system.uptime { gridPos+: { w: 6, h: 2 } }, + panels.system.hostname { gridPos+: { w: 6, h: 2 } }, + panels.system.kernelVersion { gridPos+: { w: 6, h: 2 } }, + panels.system.osInfo { gridPos+: { w: 6, h: 2 } }, + panels.cpu.cpuCount { gridPos+: { w: 6, h: 2 } }, + panels.memory.memoryTotalBytes { gridPos+: { w: 6, h: 2 } }, + panels.memory.memorySwapTotalBytes { gridPos+: { w: 6, h: 2 } }, + panels.disk.diskTotalRoot { gridPos+: { w: 6, h: 2 } }, + ] + ), + time: + g.panel.row.new('Time') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.system.osTimezone { gridPos+: { w: 3, h: 4 } }, + panels.system.timeNtpStatus { gridPos+: { x: 0, y: 0, w: 21, h: 4 } }, + panels.system.timeSyncDrift { gridPos+: { w: 24, h: 7 } }, + ]), + cpuOverview: + g.panel.row.new('CPU') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + panels.cpu.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpu.cpuUsageTsPerCore { gridPos+: { w: 12, h: 6 } }, + panels.system.systemLoad { gridPos+: { w: 6, h: 6 } }, + ] + ), + cpuAndSystem: + g.panel.row.new('System') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.cpu.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpu.cpuUsageTsPerCore { gridPos+: { w: 9, h: 6 } }, + panels.cpu.cpuUsageByMode { gridPos+: { w: 9, h: 6 } }, + panels.system.systemLoad, + panels.system.systemContextSwitchesAndInterrupts, + ]), + memoryOverview: + g.panel.row.new('Memory') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + panels.memory.memoryUsageStatPercent { 
gridPos+: { w: 6, h: 6 } }, + panels.memory.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, + ] + ), + memoryVmstat: + g.panel.row.new('Vmstat') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + panels.memory.memoryPagesInOut, + panels.memory.memoryPagesSwapInOut, + panels.memory.memoryPagesFaults, + panels.memory.memoryOOMkiller, + ] + ), + memoryMemstat: + g.panel.row.new('Memstat') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + panels.memory.memoryActiveInactive, + panels.memory.memoryActiveInactiveDetail, + panels.memory.memoryCommited, + panels.memory.memorySharedAndMapped, + panels.memory.memoryWriteAndDirty, + panels.memory.memoryVmalloc, + panels.memory.memorySlab, + panels.memory.memoryAnonymous, + panels.memory.memoryHugePagesCounter, + panels.memory.memoryHugePagesSize, + panels.memory.memoryDirectMap, + panels.memory.memoryBounce, + ] + ), + diskOverview: + g.panel.row.new('Disk') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.disk.diskIOBytesPerSec { gridPos+: { w: 12, h: 8 } }, + panels.disk.diskUsage { gridPos+: { w: 12, h: 8 } }, + ]), + disk: + g.panel.row.new('Disk') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.disk.diskIOBytesPerSec, + panels.disk.diskIOps, + panels.disk.diskIOWaitTime, + panels.disk.diskQueue, + ]), + filesystem: + g.panel.row.new('Filesystem') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.disk.diskFreeTs, + panels.disk.diskUsage, + panels.disk.diskInodesFree, + panels.disk.diskInodesTotal, + panels.disk.diskErrorsandRO, + panels.disk.fileDescriptors, + ]), + networkOverview: + g.panel.row.new('Network') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.network.networkUsagePerSec { gridPos+: { w: 12, h: 8 } }, + panels.network.networkErrorsAndDroppedPerSec { gridPos+: { w: 12, h: 8 } }, + ]), + network: + g.panel.row.new('Network') + + g.panel.row.withCollapsed(false) + + 
g.panel.row.withPanels([ + panels.network.networkOverviewTable { gridPos: { w: 24 } }, + panels.network.networkUsagePerSec, + panels.network.networkOperStatus, + panels.network.networkErrorsPerSec, + panels.network.networkDroppedPerSec, + panels.network.networkPacketsPerSec, + panels.network.networkMulticastPerSec, + panels.network.networkFifo, + panels.network.networkCompressedPerSec, + panels.network.networkNFConntrack, + panels.network.networkSoftnet, + panels.network.networkSoftnetSqueeze, + ]), + networkSockets: + g.panel.row.new('Network sockets') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.network.networkSockstatAll { gridPos: { w: 24 } }, + panels.network.networkSockstatTCP, + panels.network.networkSockstatUDP, + panels.network.networkSockstatMemory, + panels.network.networkSockstatOther, + ]), + networkNetstat: + g.panel.row.new('Network netstat') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.network.networkNetstatIP { gridPos: { w: 24 } }, + panels.network.networkNetstatTCP, + panels.network.networkNetstatTCPerrors, + panels.network.networkNetstatUDP, + panels.network.networkNetstatUDPerrors, + panels.network.networkNetstatICMP, + panels.network.networkNetstatICMPerrors, + ]), + + hardware: + g.panel.row.new('Hardware') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.hardware.hardwareTemperature { gridPos+: { w: 12, h: 8 } }, + ]), + + + //use + cpuUseMethod: + g.panel.row.new('CPU') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + cpuUseClusterMethod: + self.cpuUseMethod + + g.panel.row.withPanels([ + panels.useCluster.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + cpuUseClusterMethodMulti: + self.cpuUseClusterMethod + + g.panel.row.withPanels([ + 
panels.useClusterMulti.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + + memoryUseMethod: + g.panel.row.new('Memory') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + memoryUseClusterMethod: + self.memoryUseMethod + + g.panel.row.withPanels([ + panels.useCluster.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + memoryUseClusterMethodMulti: + self.memoryUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + + diskUseMethod: + g.panel.row.new('Disk') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + diskUseClusterMethod: + self.diskUseMethod + + g.panel.row.withPanels([ + panels.useCluster.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + diskUseClusterMethodMulti: + self.diskUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + filesystemUseMethod: + g.panel.row.new('Filesystem') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + filesystemUseClusterMethod: + self.filesystemUseMethod + + g.panel.row.withPanels([ + panels.useCluster.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + filesystemUseClusterMethodMulti: + self.filesystemUseMethod + + g.panel.row.withPanels([ + 
panels.useClusterMulti.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + + networkUseMethod: + g.panel.row.new('Network') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + networkUseClusterMethod: + self.networkUseMethod + + g.panel.row.withPanels([ + panels.useCluster.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + networkUseClusterMethodMulti: + self.networkUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + }, +} diff --git a/docs/node-mixin/lib/linux/rows/main.libsonnet b/docs/node-mixin/lib/linux/rows/main.libsonnet new file mode 100644 index 0000000000..367abc254a --- /dev/null +++ b/docs/node-mixin/lib/linux/rows/main.libsonnet @@ -0,0 +1,8 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +{ + new(this):: { + linux: (import './linux.libsonnet').new(this), + use: (import './use.libsonnet').new(this), + }, +} diff --git a/docs/node-mixin/lib/linux/rows/use.libsonnet b/docs/node-mixin/lib/linux/rows/use.libsonnet new file mode 100644 index 0000000000..eb03192af1 --- /dev/null +++ b/docs/node-mixin/lib/linux/rows/use.libsonnet @@ -0,0 +1,107 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +{ + new(this): + { + local panels = this.grafana.panels, + //use + cpuUseMethod: + g.panel.row.new('CPU') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + cpuUseClusterMethod: + self.cpuUseMethod + + g.panel.row.withPanels([ + 
panels.useCluster.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + cpuUseClusterMethodMulti: + self.cpuUseClusterMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.cpuUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.cpuSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + + memoryUseMethod: + g.panel.row.new('Memory') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + memoryUseClusterMethod: + self.memoryUseMethod + + g.panel.row.withPanels([ + panels.useCluster.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + memoryUseClusterMethodMulti: + self.memoryUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.memoryUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.memorySaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + + diskUseMethod: + g.panel.row.new('Disk') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + diskUseClusterMethod: + self.diskUseMethod + + g.panel.row.withPanels([ + panels.useCluster.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + diskUseClusterMethodMulti: + self.diskUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.diskUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.diskSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + + filesystemUseMethod: + g.panel.row.new('Filesystem') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + filesystemUseClusterMethod: + self.filesystemUseMethod 
+ + g.panel.row.withPanels([ + panels.useCluster.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + filesystemUseClusterMethodMulti: + self.filesystemUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.filesystemUtilization { gridPos+: { w: 24, h: 7 } }, + ]), + + networkUseMethod: + g.panel.row.new('Network') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.use.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.use.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + networkUseClusterMethod: + self.networkUseMethod + + g.panel.row.withPanels([ + panels.useCluster.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useCluster.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + networkUseClusterMethodMulti: + self.networkUseMethod + + g.panel.row.withPanels([ + panels.useClusterMulti.networkUtilization { gridPos+: { w: 12, h: 7 } }, + panels.useClusterMulti.networkSaturation { gridPos+: { w: 12, h: 7 } }, + ]), + }, +} diff --git a/docs/node-observ-lib/linux/rules.libsonnet b/docs/node-mixin/lib/linux/rules/rules.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/rules.libsonnet rename to docs/node-mixin/lib/linux/rules/rules.libsonnet diff --git a/docs/node-mixin/lib/linux/targets/alerts.libsonnet b/docs/node-mixin/lib/linux/targets/alerts.libsonnet new file mode 100644 index 0000000000..de4a23d931 --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/alerts.libsonnet @@ -0,0 +1,25 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + alertsCritical: + prometheusQuery.new( + prometheusDatasource, + 'count by (%(instanceLabels)s) 
(max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + alertsWarning: + prometheusQuery.new( + prometheusDatasource, + 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/cpu.libsonnet b/docs/node-mixin/lib/linux/targets/cpu.libsonnet new file mode 100644 index 0000000000..7fd6cf2d6b --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/cpu.libsonnet @@ -0,0 +1,53 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + cpuCount: + prometheusQuery.new( + prometheusDatasource, + 'count without (cpu) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"})' % variables + ) + + prometheusQuery.withLegendFormat('Cores'), + cpuUsage: + prometheusQuery.new( + prometheusDatasource, + ||| + (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s))) + - + avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, mode)(irate(node_cpu_seconds_total{mode='idle',%(queriesSelector)s}[$__rate_interval])))) * 100) + / + count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s)) + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ) + + 
prometheusQuery.withLegendFormat('CPU usage'), + cpuUsagePerCore: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(queriesSelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"}) + ) * 100 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('CPU {{cpu}}'), + cpuUsageByMode: + prometheusQuery.new( + prometheusDatasource, + ||| + sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval])) + / on(%(instanceLabels)s) + group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval]))) * 100 + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ) + + prometheusQuery.withLegendFormat('{{ mode }}'), + }, +} diff --git a/docs/node-mixin/lib/linux/targets/disk.libsonnet b/docs/node-mixin/lib/linux/targets/disk.libsonnet new file mode 100644 index 0000000000..036ecef63e --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/disk.libsonnet @@ -0,0 +1,147 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + uptimeQuery:: 'node_boot_time_seconds', + + diskTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ), + diskTotalRoot: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_size_bytes{%(queriesSelector)s, mountpoint="/", fstype!="rootfs"}' % variables, 
+ ), + diskUsageRoot: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_avail_bytes{%(queriesSelector)s, mountpoint="/",fstype!="rootfs"}' % variables + ), + diskUsageRootPercent: + prometheusQuery.new( + prometheusDatasource, + '100 - node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}/node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}*100' % variables + ), + diskFree: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} free'), + diskUsagePercent: + prometheusQuery.new( + prometheusDatasource, + '100 - node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}/node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}*100' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} used, %'), + + diskInodesFree: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_files_free{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector }, + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes free'), + diskInodesTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_files{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes total'), + diskReadOnly: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_readonly{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % 
variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} read-only'), + diskDeviceError: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_device_error{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} device error'), + // descriptors + processMaxFds: + prometheusQuery.new( + prometheusDatasource, + 'process_max_fds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Maximum open file descriptors'), + processOpenFds: + prometheusQuery.new( + prometheusDatasource, + 'process_open_fds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Open file descriptors'), + + // disk(device) + diskIOreadBytesPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_read_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} read'), + diskIOwriteBytesPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_written_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} written'), + diskIOutilization: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_io_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} io util'), + diskAvgQueueSize: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_io_time_weighted_seconds_total{%(queriesSelector)s, 
%(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} avg queue'), + + diskIOWaitWriteTime: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_write_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} avg write time'), + diskIOWaitReadTime: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_read_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} avg read time'), + diskIOReads: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} reads'), + diskIOWrites: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} writes'), + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/events.libsonnet b/docs/node-mixin/lib/linux/targets/events.libsonnet new file mode 100644 index 0000000000..3e95d4a57c --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/events.libsonnet @@ -0,0 +1,61 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; 
+local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + uptimeQuery:: 'node_boot_time_seconds', + + reboot: + prometheusQuery.new( + prometheusDatasource, + self.uptimeQuery + '{%(queriesSelector)s}*1000 > $__from < $__to' % variables, + ), + + serviceFailed: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="init.scope"} |= "code=exited, status=1/FAILURE"' % variables + ), + // those events should be rare, so can be shown as annotations + criticalEvents: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, transport="kernel", level="emerg"}' % variables + ), + memoryOOMkiller: + prometheusQuery.new( + prometheusDatasource, + 'increase(node_vmstat_oom_kill{%(queriesSelector)s}[$__interval:] offset -$__interval)' % variables, + ) + + prometheusQuery.withLegendFormat('OOM killer invocations'), + + kernelUpdate: + prometheusQuery.new( + prometheusDatasource, + expr=||| + changes( + sum by (%(instanceLabels)s) ( + group by (%(instanceLabels)s,release) (node_uname_info{%(queriesSelector)s}) + ) + [$__interval:1m] offset -$__interval) > 1 + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + + // new interactive session in logs: + sessionOpened: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="systemd-logind.service"}|= "New session"' % variables + ), + sessionClosed: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="systemd-logind.service"} |= "logged out"' % variables + ), + }, +} diff --git a/docs/node-mixin/lib/linux/targets/hardware.libsonnet b/docs/node-mixin/lib/linux/targets/hardware.libsonnet new file mode 100644 index 0000000000..b87dc7976a --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/hardware.libsonnet @@ -0,0 +1,20 @@ +local g = import 
'../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + hardwareTemperature: + prometheusQuery.new( + prometheusDatasource, + 'node_hwmon_temp_celsius{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('{{chip}}/{{sensor}}'), + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/main.libsonnet b/docs/node-mixin/lib/linux/targets/main.libsonnet new file mode 100644 index 0000000000..6b30b26e36 --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/main.libsonnet @@ -0,0 +1,15 @@ +{ + new(config):: { + alerts: (import './alerts.libsonnet').new(config), + cpu: (import './cpu.libsonnet').new(config), + disk: (import './disk.libsonnet').new(config), + events: (import './events.libsonnet').new(config), + hardware: (import './hardware.libsonnet').new(config), + memory: (import './memory.libsonnet').new(config), + network: (import './network.libsonnet').new(config), + system: (import './system.libsonnet').new(config), + use: (import './use.libsonnet').new(config), + useCluster: (import './useCluster.libsonnet').new(config), + useClusterMulti: (import './useClusterMulti.libsonnet').new(config), + }, +} diff --git a/docs/node-mixin/lib/linux/targets/memory.libsonnet b/docs/node-mixin/lib/linux/targets/memory.libsonnet new file mode 100644 index 0000000000..7f06662a23 --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/memory.libsonnet @@ -0,0 +1,316 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local 
lokiDatasource = '${' + variables.datasources.loki.name + '}', + + memoryTotalBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_MemTotal_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory total'), + memoryFreeBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_MemFree_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory free'), + memoryAvailableBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_MemAvailable_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory available'), + memoryCachedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Cached_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory cached'), + memoryBuffersBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Buffers_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory buffers'), + memoryUsedBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_MemTotal_bytes{%(queriesSelector)s} + - + node_memory_MemFree_bytes{%(queriesSelector)s} + - + node_memory_Buffers_bytes{%(queriesSelector)s} + - + node_memory_Cached_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + memoryUsagePercent: + prometheusQuery.new( + prometheusDatasource, + ||| + 100 - + ( + avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(queriesSelector)s}) / + avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(queriesSelector)s}) + * 100 + ) + ||| + % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + memorySwapTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SwapTotal_bytes{%(queriesSelector)s}' % variables + ), + memoryPagesIn: + prometheusQuery.new( + prometheusDatasource, + 
'irate(node_vmstat_pgpgin{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Page-In'), + memoryPagesOut: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pgpgout{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Page-Out'), + + memoryPagesSwapIn: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pswpin{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Pages swapped in'), + memoryPagesSwapOut: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pswpout{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Pages swapped out'), + + memoryPageMajorFaults: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Major page fault operations'), + memoryPageMinorFaults: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_vmstat_pgfault{%(queriesSelector)s}[$__rate_interval]) + - + irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval]) + ||| % variables, + ) + + prometheusQuery.withLegendFormat('Minor page fault operations'), + + memoryInactiveBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Inactive_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive'), + memoryActiveBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Active_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active'), + + memoryInactiveFile: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Inactive_file_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive_file'), + + memoryInactiveAnon: + prometheusQuery.new( + prometheusDatasource, + 
'node_memory_Inactive_anon_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive_anon'), + + memoryActiveFile: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Active_file_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active_file'), + + memoryActiveAnon: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Active_anon_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active_anon'), + + memoryCommitedAs: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Committed_AS_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Commited_AS'), + memoryCommitedLimit: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_CommitLimit_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('CommitLimit'), + + memoryMappedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Mapped_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Mapped'), + memoryShmemBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Shmem_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Shmem'), + memoryShmemHugePagesBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_ShmemHugePages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('ShmemHugePages'), + memoryShmemPmdMappedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_ShmemPmdMapped_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('ShmemPmdMapped'), + memoryWriteback: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Writeback_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Writeback'), + memoryWritebackTmp: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_WritebackTmp_bytes{%(queriesSelector)s}' % 
variables, + ) + + prometheusQuery.withLegendFormat('WritebackTmp'), + memoryDirty: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Dirty_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Dirty'), + + memoryVmallocChunk: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocChunk_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocChunk'), + memoryVmallocTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocTotal_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocTotal'), + memoryVmallocUsed: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocUsed_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocUsed'), + memorySlabSUnreclaim: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SUnreclaim_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('SUnreclaim'), + memorySlabSReclaimable: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SReclaimable_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('SReclaimable'), + + memoryAnonHugePages: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_AnonHugePages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('AnonHugePages'), + memoryAnonPages: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_AnonPages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('AnonPages'), + + memoryHugePages_Free: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Free{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('HugePages_Free'), + memoryHugePages_Rsvd: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Rsvd{%(queriesSelector)s}' % variables, + ) + + 
prometheusQuery.withLegendFormat('HugePages_Rsvd'), + memoryHugePages_Surp: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Surp{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('HugePages_Surp'), + memoryHugePagesTotalSize: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Total{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Huge pages total size'), + memoryHugePagesSize: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Hugepagesize_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Huge page size'), + memoryDirectMap1G: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap1G_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap1G'), + memoryDirectMap2M: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap2M_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap2M'), + memoryDirectMap4k: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap4k_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap4k'), + memoryBounce: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Bounce_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Bounce'), + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/network.libsonnet b/docs/node-mixin/lib/linux/targets/network.libsonnet new file mode 100644 index 0000000000..715bd8552c --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/network.libsonnet @@ -0,0 +1,502 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local 
lokiDatasource = '${' + variables.datasources.loki.name + '}', + + networkUp: + prometheusQuery.new( + prometheusDatasource, + 'node_network_up{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('{{device}}'), + networkCarrier: + prometheusQuery.new( + prometheusDatasource, + 'node_network_carrier{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('{{device}}'), + networkArpEntries: + prometheusQuery.new( + prometheusDatasource, + 'node_arp_entries{%(queriesSelector)s}' % variables, + ), + networkMtuBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_network_mtu_bytes{%(queriesSelector)s}' % variables, + ), + networkSpeedBitsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'node_network_speed_bytes{%(queriesSelector)s} * 8' % variables, + ), + networkTransmitQueueLength: + prometheusQuery.new( + prometheusDatasource, + 'node_network_transmit_queue_length{%(queriesSelector)s}' % variables, + ), + networkInfo: + prometheusQuery.new( + prometheusDatasource, + 'node_network_info{%(queriesSelector)s}' % variables, + ), + + networkOutBitPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + networkInBitPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutBitPerSecFiltered: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 + # only show interfaces that had traffic change at least once during selected dashboard interval: + and + increase( + node_network_transmit_bytes_total{%(queriesSelector)s}[$__range] + ) > 0 + ||| % variables + ) + + prometheusQuery.withLegendFormat('{{ 
device }} transmitted'), + networkInBitPerSecFiltered: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 + # only show interfaces that had traffic change at least once during selected dashboard interval: + and + increase( + node_network_receive_bytes_total{%(queriesSelector)s}[$__range] + ) > 0 + ||| % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + + + networkOutErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} errors transmitted'), + networkInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} errors received'), + networkOutDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted dropped'), + networkInDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received dropped'), + + networkInPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkInMulticastPacketsPerSec: + prometheusQuery.new( + 
prometheusDatasource, + 'irate(node_network_receive_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutMulticastPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + networkFifoInPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkFifoOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkCompressedInPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkCompressedOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkNFConntrackEntries: + prometheusQuery.new( + prometheusDatasource, + 'node_nf_conntrack_entries{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('NF conntrack entries'), + networkNFConntrackLimits: + prometheusQuery.new( + prometheusDatasource, + 'node_nf_conntrack_entries_limit{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('NF conntrack limits'), + + networkSoftnetProcessedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_processed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + 
prometheusQuery.withLegendFormat('CPU {{ cpu }} processed'), + networkSoftnetDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_dropped_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('CPU {{ cpu }} dropped'), + networkSoftnetSqueezedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_times_squeezed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('CPU {{ cpu }} out of quota'), + + networkSocketsUsed: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_sockets_used{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 sockets in use'), + networkSocketsTCPAllocated: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_alloc{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Allocated'), + networkSocketsTCPIPv6: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 in use'), + networkSocketsTCPIPv4: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 in use'), + networkSocketsTCPOrphans: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_orphan{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Orphan sockets'), + networkSocketsTCPTimeWait: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_tw{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Time wait'), + + networkSocketsUDPLiteInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDPLITE_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 UDPLITE in use'), + networkSocketsUDPInUse: + prometheusQuery.new( + prometheusDatasource, + 
'node_sockstat_UDP_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 UDP in use'), + networkSocketsUDPLiteIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDPLITE6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 UDPLITE in use'), + networkSocketsUDPIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 UDP in use'), + + networkSocketsFragInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_FRAG_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 Frag sockets in use'), + networkSocketsFragIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_FRAG6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 Frag sockets in use'), + networkSocketsRawInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_RAW_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 Raw sockets in use'), + networkSocketsIPv6RawInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_RAW6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 Raw sockets in use'), + + networkSocketsTCPMemoryPages: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_mem{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory pages allocated for TCP sockets'), + networkSocketsUDPMemoryPages: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP_mem{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory pages allocated for UDP sockets'), + + networkSocketsTCPMemoryBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_mem_bytes{%(queriesSelector)s}' % variables + ) + + 
prometheusQuery.withLegendFormat('Memory bytes allocated for TCP sockets'), + networkSocketsUDPMemoryBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP_mem_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory bytes allocated for UDP sockets'), + + networkNetstatIPInOctetsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_IpExt_InOctets{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('Octets received'), + networkNetstatIPOutOctetsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_IpExt_OutOctets{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('Octets transmitted'), + + networkNetstatTCPInSegmentsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_InSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP received'), + networkNetstatTCPOutSegmentsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_OutSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP transmitted'), + + networkNetstatTCPOverflowPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_ListenOverflows{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP overflow'), + + networkNetstatTCPListenDropsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_ListenDrops{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP ListenDrops - SYNs to LISTEN sockets ignored'), + + networkNetstatTCPRetransPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_TCPSynRetrans{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP SYN rentransmits'), + 
+ networkNetstatTCPRetransSegPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_RetransSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP retransmitted segments, containing one or more previously transmitted octets'), + networkNetstatTCPInWithErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_InErrs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP received with errors'), + + networkNetstatTCPOutWithRstPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_OutRsts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP segments sent with RST flag'), + + networkNetstatIPInUDPPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP received'), + + networkNetstatIPOutUDPPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP transmitted'), + + networkNetstatIPInUDP6PerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 received'), + + networkNetstatIPOutUDP6PerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 transmitted'), + + //UDP errors + networkNetstatUDPLiteInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_UdpLite_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDPLite InErrors'), + + 
networkNetstatUDPInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP InErrors'), + networkNetstatUDP6InErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 InErrors'), + networkNetstatUDPNoPortsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP NoPorts'), + networkNetstatUDP6NoPortsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 NoPorts'), + networkNetstatUDPRcvBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP receive buffer errors'), + networkNetstatUDP6RcvBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 receive buffer errors'), + networkNetstatUDPSndBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP transmit buffer errors'), + networkNetstatUDP6SndBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 transmit buffer errors'), + + //ICMP + networkNetstatICMPInPerSec: + prometheusQuery.new( + prometheusDatasource, + 
'irate(node_netstat_Icmp_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP received'), + networkNetstatICMPOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP transmitted'), + networkNetstatICMP6InPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 received'), + networkNetstatICMP6OutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 transmitted'), + + networkNetstatICMPInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 errors'), + networkNetstatICM6PInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 errors'), + }, +} diff --git a/docs/node-mixin/lib/linux/targets/system.libsonnet b/docs/node-mixin/lib/linux/targets/system.libsonnet new file mode 100644 index 0000000000..04f6104857 --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/system.libsonnet @@ -0,0 +1,114 @@ +local g = import '../../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + uptimeQuery:: 'node_boot_time_seconds', + uptime: + 
prometheusQuery.new( + prometheusDatasource, + 'time() - ' + self.uptimeQuery + '{%(queriesSelector)s}' % variables + ), + unameInfo: + prometheusQuery.new( + prometheusDatasource, + 'node_uname_info{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withFormat('table'), + osInfo: + prometheusQuery.new( + prometheusDatasource, + ||| + node_os_info{%(queriesSelector)s} + ||| % variables, + ) + + prometheusQuery.withFormat('table'), + osInfoCombined: + prometheusQuery.new( + prometheusDatasource, + ||| + node_uname_info{%(queriesSelector)s} + * on (%(groupLabels)s,%(instanceLabels)s) + group_left(pretty_name) + node_os_info{%(queriesSelector)s} + ||| % variables { + instanceLabels: std.join(',', this.config.instanceLabels), + groupLabels: std.join(',', this.config.groupLabels), + }, + ) + + prometheusQuery.withFormat('table'), + + osTimezone: //timezone label + prometheusQuery.new( + prometheusDatasource, + 'node_time_zone_offset_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withFormat('table'), + + systemLoad1: + prometheusQuery.new( + prometheusDatasource, + 'node_load1{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('1m'), + systemLoad5: + prometheusQuery.new( + prometheusDatasource, + 'node_load5{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('5m'), + systemLoad15: + prometheusQuery.new( + prometheusDatasource, + 'node_load15{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('15m'), + + systemContextSwitches: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_context_switches_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Context switches'), + + systemInterrupts: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_intr_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Interrupts'), + + timeNtpStatus: + prometheusQuery.new( + 
prometheusDatasource, + 'node_timex_sync_status{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('NTP status'), + + timeOffset: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_offset_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Time offset'), + + timeEstimatedError: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_estimated_error_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Estimated error in seconds'), + timeMaxError: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_maxerror_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Maximum error in seconds'), + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/use.libsonnet b/docs/node-mixin/lib/linux/targets/use.libsonnet new file mode 100644 index 0000000000..01d497b1ba --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/use.libsonnet @@ -0,0 +1,102 @@ +local g = import '../../g.libsonnet'; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.use, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + cpuUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + instance:node_cpu_utilisation:rate5m{%(queriesSelector)s} * 100 != 0 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % xtd.array.slice(this.config.instanceLabels, -1)), + + cpuSaturation: + prometheusQuery.new( + prometheusDatasource, + ||| + instance:node_load1_per_cpu:ratio{%(queriesSelector)s} * 100 != 0 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Saturation' % xtd.array.slice(this.config.instanceLabels, -1)), + 
memoryUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + instance:node_memory_utilisation:ratio{%(queriesSelector)s} * 100 != 0 + ||| + % variables + ) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % xtd.array.slice(this.config.instanceLabels, -1)), + + memorySaturation: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_vmstat_pgmajfault:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Major page fault operations' % xtd.array.slice(this.config.instanceLabels, -1)), + + networkUtilizationReceive: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_receive_bytes_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % xtd.array.slice(this.config.instanceLabels, -1)), + networkUtilizationTransmit: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_transmit_bytes_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % xtd.array.slice(this.config.instanceLabels, -1)), + networkSaturationReceive: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_receive_drop_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % xtd.array.slice(this.config.instanceLabels, -1)), + networkSaturationTransmit: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_transmit_drop_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % xtd.array.slice(this.config.instanceLabels, -1)), + + diskUtilization: + prometheusQuery.new( + prometheusDatasource, + 'instance_device:node_disk_io_time_seconds:rate5m{%(queriesSelector)s} * 100 != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + 
diskSaturation: + prometheusQuery.new( + prometheusDatasource, + 'instance_device:node_disk_io_time_weighted_seconds:rate5m{%(queriesSelector)s} * 100 != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + + filesystemUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + sort_desc(1 - + ( + max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(queriesSelector)s, %(fsMountpointSelector)s, %(fsSelector)s}) + / + max without (mountpoint, fstype) (node_filesystem_size_bytes{%(queriesSelector)s, %(fsMountpointSelector)s, %(fsSelector)s}) + ) != 0 + ) * 100 + ||| % variables { fsMountpointSelector: this.config.fsMountpointSelector, fsSelector: this.config.fsSelector }, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/useCluster.libsonnet b/docs/node-mixin/lib/linux/targets/useCluster.libsonnet new file mode 100644 index 0000000000..82e200808d --- /dev/null +++ b/docs/node-mixin/lib/linux/targets/useCluster.libsonnet @@ -0,0 +1,125 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'github.com/grafana/jsonnet-libs/common-lib/common/main.libsonnet'; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + + +{ + new(this): { + local variables = this.grafana.variables.useCluster, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + cpuUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + (( + instance:node_cpu_utilisation:rate5m{%(queriesSelector)s} + * + instance:node_num_cpu:sum{%(queriesSelector)s} + ) != 0 ) + / scalar(sum(instance:node_num_cpu:sum{%(queriesSelector)s})) * 100 + ||| % variables, 
+ ) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % xtd.array.slice(this.config.instanceLabels, -1)), + + cpuSaturation: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + instance:node_load1_per_cpu:ratio{%(queriesSelector)s} + / scalar(count(instance:node_load1_per_cpu:ratio{%(queriesSelector)s})) + ) * 100 != 0 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Saturation' % xtd.array.slice(this.config.instanceLabels, -1)), + memoryUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + instance:node_memory_utilisation:ratio{%(queriesSelector)s} + / scalar(count(instance:node_memory_utilisation:ratio{%(queriesSelector)s})) + ) * 100 != 0 + ||| + % variables + ) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % xtd.array.slice(this.config.instanceLabels, -1)), + + memorySaturation: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_vmstat_pgmajfault:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Major page fault operations' % xtd.array.slice(this.config.instanceLabels, -1)), + networkUtilizationReceive: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_receive_bytes_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % xtd.array.slice(this.config.instanceLabels, -1)), + networkUtilizationTransmit: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_transmit_bytes_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % xtd.array.slice(this.config.instanceLabels, -1)), + networkSaturationReceive: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_receive_drop_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % xtd.array.slice(this.config.instanceLabels, -1)), + 
networkSaturationTransmit: + prometheusQuery.new( + prometheusDatasource, + 'instance:node_network_transmit_drop_excluding_lo:rate5m{%(queriesSelector)s} != 0' % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % xtd.array.slice(this.config.instanceLabels, -1)), + + diskUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + instance_device:node_disk_io_time_seconds:rate5m{%(queriesSelector)s} + / scalar(count(instance_device:node_disk_io_time_seconds:rate5m{%(queriesSelector)s})) + ) * 100 != 0 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + diskSaturation: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + instance_device:node_disk_io_time_weighted_seconds:rate5m{%(queriesSelector)s} + / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{%(queriesSelector)s})) + ) * 100 != 0 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + + filesystemUtilization: + prometheusQuery.new( + prometheusDatasource, + ||| + sum without (device) ( + max without (fstype, mountpoint) (( + node_filesystem_size_bytes{%(queriesSelector)s, %(fsMountpointSelector)s, %(fsSelector)s} + - + node_filesystem_avail_bytes{%(queriesSelector)s, %(fsMountpointSelector)s, %(fsSelector)s} + ) != 0) + ) + / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(queriesSelector)s, %(fsMountpointSelector)s, %(fsSelector)s}))) * 100 + ||| % variables { fsMountpointSelector: this.config.fsMountpointSelector, fsSelector: this.config.fsSelector }, + ) + + prometheusQuery.withLegendFormat('{{%s}}: {{device}}' % xtd.array.slice(this.config.instanceLabels, -1)), + + + }, +} diff --git a/docs/node-mixin/lib/linux/targets/useClusterMulti.libsonnet b/docs/node-mixin/lib/linux/targets/useClusterMulti.libsonnet new file mode 100644 index 0000000000..0ea9e74420 --- 
/dev/null +++ b/docs/node-mixin/lib/linux/targets/useClusterMulti.libsonnet @@ -0,0 +1,64 @@ +local g = import '../../g.libsonnet'; +local commonlib = import 'github.com/grafana/jsonnet-libs/common-lib/common/main.libsonnet'; +local prometheusQuery = g.query.prometheus; + + +{ + new(this): { + local variables = this.grafana.variables.useCluster, + local config = this.config, + local baseTargets = this.grafana.targets.useCluster, + cpuUtilization: + baseTargets.cpuUtilization + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.cpuUtilization.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % this.config.clusterLabel), + + cpuSaturation: + baseTargets.cpuSaturation + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.cpuSaturation.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Saturation' % this.config.clusterLabel), + + memoryUtilization: + baseTargets.memoryUtilization + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.memoryUtilization.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Utilization' % this.config.clusterLabel), + + memorySaturation: + baseTargets.memorySaturation + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.memorySaturation.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Major page fault operations' % this.config.clusterLabel), + networkUtilizationReceive: + baseTargets.networkUtilizationReceive + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.networkUtilizationReceive.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % this.config.clusterLabel), + + networkUtilizationTransmit: + baseTargets.networkUtilizationTransmit + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.networkUtilizationTransmit.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % 
this.config.clusterLabel), + networkSaturationReceive: + baseTargets.networkSaturationReceive + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.networkSaturationReceive.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Receive' % this.config.clusterLabel), + networkSaturationTransmit: + baseTargets.networkSaturationTransmit + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.networkSaturationTransmit.expr]) + + prometheusQuery.withLegendFormat('{{%s}}: Transmit' % this.config.clusterLabel), + + diskUtilization: + baseTargets.diskUtilization + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.diskUtilization.expr]) + + prometheusQuery.withLegendFormat('{{%s}}' % this.config.clusterLabel), + diskSaturation: + baseTargets.diskSaturation + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.diskSaturation.expr]) + + prometheusQuery.withLegendFormat('{{%s}}' % this.config.clusterLabel), + + filesystemUtilization: + baseTargets.filesystemUtilization + + prometheusQuery.withExpr('sum by (%s) (%s)' % [this.config.clusterLabel, baseTargets.filesystemUtilization.expr]) + + prometheusQuery.withLegendFormat('{{%s}}' % this.config.clusterLabel), + + + }, +} diff --git a/docs/node-mixin/lib/linux/variables.libsonnet b/docs/node-mixin/lib/linux/variables.libsonnet new file mode 100644 index 0000000000..6f88bf3182 --- /dev/null +++ b/docs/node-mixin/lib/linux/variables.libsonnet @@ -0,0 +1,43 @@ +// variables.libsonnet +local g = import '../g.libsonnet'; +local var = g.dashboard.variable; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + new( + this + ): + { + main: + commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + 
instanceLabels=this.config.instanceLabels, + varMetric='node_uname_info', + customAllValue=this.config.customAllValue, + enableLokiLogs=this.config.enableLokiLogs, + ), + // used in USE cluster dashboard + use: + commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + // add clusterLabel to groupLabels: + groupLabels=std.uniq(this.config.groupLabels + [this.config.clusterLabel]), + instanceLabels=this.config.instanceLabels, + varMetric='instance:node_cpu_utilisation:rate5m', + customAllValue=this.config.customAllValue, + enableLokiLogs=this.config.enableLokiLogs, + ), + useCluster: + commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=std.uniq(this.config.groupLabels + [this.config.clusterLabel]), + instanceLabels=[], + varMetric='instance:node_cpu_utilisation:rate5m', + customAllValue=this.config.customAllValue, + enableLokiLogs=this.config.enableLokiLogs, + ), + }, + +} diff --git a/docs/node-observ-lib/macos/README.md b/docs/node-mixin/lib/macos/README.md similarity index 91% rename from docs/node-observ-lib/macos/README.md rename to docs/node-mixin/lib/macos/README.md index 815903ffc1..edc94a0d99 100644 --- a/docs/node-observ-lib/macos/README.md +++ b/docs/node-mixin/lib/macos/README.md @@ -6,7 +6,7 @@ This jsonnet observability lib can be used to generate observability package for ```sh jb init -jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +jb install https://github.com/grafana/node_exporter/docs/node-mixin/lib/macos ``` ## Examples @@ -17,7 +17,7 @@ You can use observ-lib to fill in monitoring-mixin structure: ```jsonnet // mixin.libsonnet file -local macoslib = import 'node-observ-lib/macos/main.libsonnet'; +local macoslib = import 'macos/main.libsonnet'; local mac = macoslib.new() @@ -39,7 +39,7 @@ local mac = } ``` -For more examples see [node-observ-lib/linux](../linux). +For more examples see [node-mixin/lib/linux](../linux). 
## Collectors used: diff --git a/docs/node-observ-lib/macos/alerts.libsonnet b/docs/node-mixin/lib/macos/alerts.libsonnet similarity index 100% rename from docs/node-observ-lib/macos/alerts.libsonnet rename to docs/node-mixin/lib/macos/alerts.libsonnet diff --git a/docs/node-mixin/lib/macos/config.libsonnet b/docs/node-mixin/lib/macos/config.libsonnet new file mode 100644 index 0000000000..370ab4d052 --- /dev/null +++ b/docs/node-mixin/lib/macos/config.libsonnet @@ -0,0 +1,29 @@ +{ + // Rest of the config is imported from linux + filteringSelector: 'job="macos"', + dashboardNamePrefix: 'MacOS / ', + //uid prefix + uid: 'darwin', + + dashboardTags: ['macos-mixin'], + + + // Alerts to keep from node-observ-lib: + alertsMacKeep: [ + 'NodeFilesystemAlmostOutOfSpace', + 'NodeNetworkReceiveErrs', + 'NodeNetworkTransmitErrs', + 'NodeTextFileCollectorScrapeError', + 'NodeFilesystemFilesFillingUp', + 'NodeFilesystemAlmostOutOfFiles', + ], + // logs lib related + enableLokiLogs: false, + extraLogLabels: ['filename', 'sender'], + logsVolumeGroupBy: 'sender', + showLogsVolume: true, + logsFilteringSelector: self.filteringSelector, + logsExtraFilters: '', + + +} diff --git a/docs/node-observ-lib/macos/main.libsonnet b/docs/node-mixin/lib/macos/main.libsonnet similarity index 78% rename from docs/node-observ-lib/macos/main.libsonnet rename to docs/node-mixin/lib/macos/main.libsonnet index ca898f9b3e..6b15d25453 100644 --- a/docs/node-observ-lib/macos/main.libsonnet +++ b/docs/node-mixin/lib/macos/main.libsonnet @@ -36,15 +36,20 @@ nodelib // keep only overview and logs(optionally) dashes dashboards: { - overview: parentGrafana.dashboards.overview, + 'nodes-darwin.json': + parentGrafana.dashboards['nodes.json'] + + g.dashboard.withUid( + (if this.config.uid == 'darwin' then std.md5('nodes-darwin.json') else this.config.uid + '-overview') + ), } + ( if this.config.enableLokiLogs then { - logs: parentGrafana.dashboards.logs, + 'logs-darwin.json': 
parentGrafana.dashboards['logs.json'], } + else {} ), }, prometheus+: { diff --git a/docs/node-mixin/lib/macos/panels.libsonnet b/docs/node-mixin/lib/macos/panels.libsonnet new file mode 100644 index 0000000000..454d769888 --- /dev/null +++ b/docs/node-mixin/lib/macos/panels.libsonnet @@ -0,0 +1,42 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + + // override description and targets + memory+: { + memoryUsageTsBytes+: + g.panel.timeSeries.panelOptions.withDescription( + ||| + - Physical memory: Total amount of memory installed in this computer; + - App memory: Physical memory allocated by apps and system processes; + - Wired memory: Physical memory, containing data that cannot be compressed or swapped to disk; + - Compressed memory: Physical memory used to store a compressed version of data that has not been used recently; + - Swap used: Amount of compressed data temporarily moved to disk to make room in memory for more recently used data. 
+ ||| + ) + + g.panel.timeSeries.queryOptions.withTargets([ + t.memory.memoryUsedBytes, + t.memory.memoryTotalBytes, + t.memory.memoryAppBytes, + t.memory.memoryWiredBytes, + t.memory.memoryCompressedBytes, + t.memory.memorySwapUsedBytes, + ]) + + commonlib.panels.generic.timeSeries.threshold.stylizeByRegexp('Physical memory'), + }, + + //override reduceOption field to version + system+: { + osInfo+: + g.panel.timeSeries.panelOptions.withTitle('OS version') + + { options+: { reduceOptions: { fields: '/^version$/' } } }, + }, + }, +} diff --git a/docs/node-mixin/lib/macos/targets.libsonnet b/docs/node-mixin/lib/macos/targets.libsonnet new file mode 100644 index 0000000000..7433b3974d --- /dev/null +++ b/docs/node-mixin/lib/macos/targets.libsonnet @@ -0,0 +1,90 @@ +local g = import '../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables.main, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + // override memory targets (other metrics in macos) + memory+: { + memoryTotalBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_total_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Physical memory'), + + memoryUsedBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_internal_bytes{%(queriesSelector)s} - + node_memory_purgeable_bytes{%(queriesSelector)s} + + node_memory_wired_bytes{%(queriesSelector)s} + + node_memory_compressed_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + memoryAppBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_internal_bytes{%(queriesSelector)s} - + node_memory_purgeable_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + 
prometheusQuery.withLegendFormat('App memory'), + memoryWiredBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_wired_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Wired memory'), + memoryCompressedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_compressed_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Compressed memory'), + + memoryUsagePercent: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + ( + avg(node_memory_internal_bytes{%(queriesSelector)s}) - + avg(node_memory_purgeable_bytes{%(queriesSelector)s}) + + avg(node_memory_wired_bytes{%(queriesSelector)s}) + + avg(node_memory_compressed_bytes{%(queriesSelector)s}) + ) / + avg(node_memory_total_bytes{%(queriesSelector)s}) + ) + * + 100 + ||| + % variables, + ), + memorySwapTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_total_bytes{%(queriesSelector)s}' % variables + ), + + memorySwapUsedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_used_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Swap used'), + }, + }, +} diff --git a/docs/node-mixin/lib/panels/common/info.libsonnet b/docs/node-mixin/lib/panels/common/info.libsonnet deleted file mode 100644 index 3b54d39382..0000000000 --- a/docs/node-mixin/lib/panels/common/info.libsonnet +++ /dev/null @@ -1,30 +0,0 @@ -// Info panel text (number or text) -local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withColor(color='text') - + self.withTextSize(value=20) - + self.withGraphMode('none') - + - { - options+: { - reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - graphMode: 'none', - }, - }, -} diff --git a/docs/node-mixin/lib/panels/common/networktraffic.libsonnet 
b/docs/node-mixin/lib/panels/common/networktraffic.libsonnet deleted file mode 100644 index 09f3370f67..0000000000 --- a/docs/node-mixin/lib/panels/common/networktraffic.libsonnet +++ /dev/null @@ -1,18 +0,0 @@ -// Panels to graph network traffic in and out -local timeseries = import '../timeseries.libsonnet'; -timeseries { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withUnits('bps') - + self.withNegativeYByRegex('transmit|tx|out') - + self.withAxisLabel('out(-) | in(+)'), -} diff --git a/docs/node-mixin/lib/panels/common/panels.libsonnet b/docs/node-mixin/lib/panels/common/panels.libsonnet deleted file mode 100644 index 88fea17a6e..0000000000 --- a/docs/node-mixin/lib/panels/common/panels.libsonnet +++ /dev/null @@ -1,6 +0,0 @@ -{ - uptimeStat:: import 'uptime.libsonnet', - infoStat:: import 'info.libsonnet', - percentUsageStat:: import 'percentusage.libsonnet', - networkTrafficGraph:: import 'networktraffic.libsonnet', -} diff --git a/docs/node-mixin/lib/panels/common/percentusage.libsonnet b/docs/node-mixin/lib/panels/common/percentusage.libsonnet deleted file mode 100644 index 884878f673..0000000000 --- a/docs/node-mixin/lib/panels/common/percentusage.libsonnet +++ /dev/null @@ -1,30 +0,0 @@ -// Panels to display metrics that can go from 0 to 100%. (cpu utilization, memory utilization etc). Full utilization is considered an issue. 
-local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withUnits('percent') - + self.withMax(100) - + self.withMin(0) - + self.withColor(mode='continuous-BlYlRd') - { - options+: { - reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - }, - }, -} diff --git a/docs/node-mixin/lib/panels/common/uptime.libsonnet b/docs/node-mixin/lib/panels/common/uptime.libsonnet deleted file mode 100644 index a64a179faa..0000000000 --- a/docs/node-mixin/lib/panels/common/uptime.libsonnet +++ /dev/null @@ -1,43 +0,0 @@ -local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title='Uptime', - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withGraphMode('none') - + self.withTextSize(value=20) - + self.withUnits('dtdurations') - + self.withThresholds( - mode='absolute', - steps=[ - { - color: 'orange', - value: null, - }, - { - color: 'text', - value: 300, - }, - ] - ) - + self.withColor(mode='thresholds') - + - { - options+: { - reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - }, - }, -} diff --git a/docs/node-mixin/lib/panels/panel.libsonnet b/docs/node-mixin/lib/panels/panel.libsonnet deleted file mode 100644 index 8ede6ffe87..0000000000 --- a/docs/node-mixin/lib/panels/panel.libsonnet +++ /dev/null @@ -1,129 +0,0 @@ -// generic grafana dashboard -{ - //feed grafonnet panel - new():: {}, - - withUnits(unit):: self { - - fieldConfig+: { - defaults+: { - unit: unit, - }, - }, - }, - - withLegend(show=true, mode='table', placement='bottom', calcs=['min', 'mean', 'max', 'lastNotNull']):: self { - options+: { - legend: { - showLegend: show, - displayMode: mode, - placement: placement, - calcs: calcs, - }, - }, - }, - withDecimals(decimals):: self { - - 
fieldConfig+: { - defaults+: { - decimals: decimals, - }, - }, - }, - - withThresholds(mode='absolute', steps=null):: self { - - fieldConfig+: { - defaults+: { - thresholds: { - mode: mode, - steps: steps, - }, - }, - }, - }, - withMin(value):: self { - fieldConfig+: { - defaults+: { - min: value, - }, - }, - }, - withMax(value):: self { - fieldConfig+: { - defaults+: { - max: value, - }, - }, - }, - withColor(color=null, mode='fixed'):: self { - fieldConfig+: { - defaults+: { - color: { - mode: mode, - fixedColor: if mode == 'fixed' then color else null, - }, - }, - }, - }, - withMaxDataPoints(value):: self { - maxDataPoints: value, - }, - withTransform():: self { - - merge():: self - { - transformations+: [ - { - id: 'merge', - options: {}, - }, - ], - }, - filterFieldsByName(pattern=null):: self - { - transformations+: [ - { - id: 'filterFieldsByName', - options: { - include: { - pattern: pattern, - }, - }, - }, - ], - }, - joinByField( - mode='outer', - field=null - ):: self { - transformations+: [ - { - id: 'joinByField', - options: { - byField: field, - mode: mode, - }, - }, - ], - }, - organize( - excludeByName={}, - indexByName={}, - renameByName={}, - - ):: self - { - transformations+: [ - { - id: 'organize', - options: { - excludeByName: excludeByName, - indexByName: indexByName, - renameByName: renameByName, - }, - }, - ], - }, - }, -} diff --git a/docs/node-mixin/lib/panels/panels.libsonnet b/docs/node-mixin/lib/panels/panels.libsonnet deleted file mode 100644 index 19c9a1d896..0000000000 --- a/docs/node-mixin/lib/panels/panels.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -{ - timeseries:: import 'timeseries.libsonnet', - stat:: import 'stat.libsonnet', - table:: import 'table.libsonnet', -} diff --git a/docs/node-mixin/lib/panels/stat.libsonnet b/docs/node-mixin/lib/panels/stat.libsonnet deleted file mode 100644 index e3fa4172f3..0000000000 --- a/docs/node-mixin/lib/panels/stat.libsonnet +++ /dev/null @@ -1,28 +0,0 @@ -local grafana = import 
'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -genericPanel { - new( - title=null, - description=null, - datasource=null, - ):: self + - grafana.statPanel.new( - title=title, - description=description, - datasource=datasource, - ), - withGraphMode(mode='none'):: self { - options+: - { - graphMode: mode, - }, - }, - withTextSize(value='auto', title='auto'):: self { - options+: - { text: { - valueSize: value, - titleSize: title, - } }, - }, - -} diff --git a/docs/node-mixin/lib/panels/table.libsonnet b/docs/node-mixin/lib/panels/table.libsonnet deleted file mode 100644 index 4a9c36cc66..0000000000 --- a/docs/node-mixin/lib/panels/table.libsonnet +++ /dev/null @@ -1,37 +0,0 @@ -local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -local table = grafana70.panel.table; -genericPanel -{ - new( - title=null, - description=null, - datasource=null, - ):: self + - table.new( - title=title, - description=description, - datasource=datasource, - ), - sortBy(field, desc=false):: self { - options+: { - sortBy: [ - { - displayName: field, - desc: desc, - }, - ], - }, - }, - withFooter(reducer=['mean'], fields=[]):: self { - - options+: { - footer: { - show: true, - reducer: reducer, - fields: fields, - }, - }, - }, - -} diff --git a/docs/node-mixin/lib/panels/timeseries.libsonnet b/docs/node-mixin/lib/panels/timeseries.libsonnet deleted file mode 100644 index 816ec49ad0..0000000000 --- a/docs/node-mixin/lib/panels/timeseries.libsonnet +++ /dev/null @@ -1,145 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -genericPanel -{ - new( - title=null, - description=null, - datasource=null, - ):: self + - grafana.graphPanel.new( - title=title, - description=description, - datasource=datasource, - ) - + - { - type: 'timeseries', - } - + 
self.withFillOpacity(10) - + self.withGradientMode('opacity') - + self.withLineInterpolation('smooth') - + self.withShowPoints('never') - + self.withTooltip(mode='multi', sort='desc') - + self.withLegend(mode='list', calcs=[]), - withDrawStyle(style):: self { - fieldConfig+: { - defaults+: { - custom+: { - drawStyle: style, - }, - }, - }, - }, - withPointsSize(size):: self { - fieldConfig+: { - defaults+: { - custom+: { - pointSize: size, - }, - }, - }, - }, - withTooltip(mode=null, sort='none'):: self { - options+: { - tooltip: { - mode: 'multi', - sort: sort, - }, - }, - }, - withLineInterpolation(value):: self { - fieldConfig+: { - defaults+: { - custom+: { - lineInterpolation: value, - }, - }, - }, - }, - withShowPoints(value):: self { - fieldConfig+: { - defaults+: { - custom+: { - showPoints: value, - }, - }, - }, - }, - withStacking(stack='normal'):: self { - fieldConfig+: { - defaults+: { - custom+: { - stacking: { - mode: stack, - group: 'A', - }, - }, - }, - }, - }, - withGradientMode(mode):: self { - fieldConfig+: { - defaults+: { - custom+: { - gradientMode: mode, - }, - }, - }, - }, - addDataLink(title, url):: self { - - fieldConfig+: { - defaults+: { - links: [ - { - title: title, - url: url, - }, - ], - }, - }, - }, - withFillOpacity(opacity):: self { - fieldConfig+: { - defaults+: { - custom+: { - fillOpacity: opacity, - }, - }, - }, - - }, - - withAxisLabel(label):: self { - fieldConfig+: { - defaults+: { - custom+: { - axisLabel: label, - }, - }, - }, - }, - - withNegativeYByRegex(regex):: self { - fieldConfig+: { - overrides+: [ - { - matcher: { - id: 'byRegexp', - options: '/' + regex + '/', - }, - properties: [ - { - id: 'custom.transform', - value: 'negative-Y', - }, - ], - }, - - ], - }, - - - }, -} diff --git a/docs/node-mixin/mixin-aix.libsonnet b/docs/node-mixin/mixin-aix.libsonnet new file mode 100644 index 0000000000..dd86f36809 --- /dev/null +++ b/docs/node-mixin/mixin-aix.libsonnet @@ -0,0 +1,10 @@ +local aixlib = import 
'./lib/aix/main.libsonnet'; +{ + _config:: {}, + _aixLib:: + aixlib.new() + + aixlib.withConfigMixin(self._config), + grafanaDashboards+:: self._aixLib.grafana.dashboards, + prometheusAlerts+:: self._aixLib.prometheus.alerts, + prometheusRules+:: self._aixLib.prometheus.recordingRules, +} diff --git a/docs/node-mixin/mixin-mac.libsonnet b/docs/node-mixin/mixin-mac.libsonnet new file mode 100644 index 0000000000..66f2ec8bbf --- /dev/null +++ b/docs/node-mixin/mixin-mac.libsonnet @@ -0,0 +1,10 @@ +local macoslib = import './lib/macos/main.libsonnet'; +{ + _config:: {}, + _macosLib:: + macoslib.new() + + macoslib.withConfigMixin(self._config), + grafanaDashboards+:: self._macosLib.grafana.dashboards, + prometheusAlerts+:: self._macosLib.prometheus.alerts, + prometheusRules+:: self._macosLib.prometheus.recordingRules, +} diff --git a/docs/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet index b9831f9380..1659faf9d3 100644 --- a/docs/node-mixin/mixin.libsonnet +++ b/docs/node-mixin/mixin.libsonnet @@ -1,4 +1,11 @@ -(import 'config.libsonnet') + -(import 'alerts/alerts.libsonnet') + -(import 'dashboards/dashboards.libsonnet') + -(import 'rules/rules.libsonnet') +local nodelib = import './lib/linux/main.libsonnet'; + +{ + _config:: {}, + _linuxLib:: + nodelib.new() + + nodelib.withConfigMixin(self._config), + grafanaDashboards+:: self._linuxLib.grafana.dashboards, + prometheusAlerts+:: self._linuxLib.prometheus.alerts, + prometheusRules+:: self._linuxLib.prometheus.recordingRules, +} diff --git a/docs/node-mixin/rules.jsonnet b/docs/node-mixin/rules.jsonnet deleted file mode 100644 index dbe13f417b..0000000000 --- a/docs/node-mixin/rules.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet deleted file mode 100644 index 9c8eb90dd1..0000000000 --- a/docs/node-mixin/rules/rules.libsonnet +++ /dev/null @@ -1,119 +0,0 @@ -{ - 
prometheusRules+:: { - groups+: [ - { - name: 'node-exporter.rules', - rules: [ - { - // This rule gives the number of CPUs per node. - record: 'instance:node_num_cpu:sum', - expr: ||| - count without (cpu, mode) ( - node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"} - ) - ||| % $._config, - }, - { - // CPU utilisation is % CPU without {idle,iowait,steal}. - record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % $._config, - expr: ||| - 1 - avg without (cpu) ( - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal"}[%(rateInterval)s])) - ) - ||| % $._config, - }, - { - // This is CPU saturation: 1min avg run queue length / number of CPUs. - // Can go over 1. - // TODO: There are situation where a run queue >1/core is just normal and fine. - // We need to clarify how to read this metric and if its usage is helpful at all. - record: 'instance:node_load1_per_cpu:ratio', - expr: ||| - ( - node_load1{%(nodeExporterSelector)s} - / - instance:node_num_cpu:sum{%(nodeExporterSelector)s} - ) - ||| % $._config, - }, - { - // Memory utilisation (ratio of used memory per instance). - record: 'instance:node_memory_utilisation:ratio', - expr: ||| - 1 - ( - ( - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} - or - ( - node_memory_Buffers_bytes{%(nodeExporterSelector)s} - + - node_memory_Cached_bytes{%(nodeExporterSelector)s} - + - node_memory_MemFree_bytes{%(nodeExporterSelector)s} - + - node_memory_Slab_bytes{%(nodeExporterSelector)s} - ) - ) - / - node_memory_MemTotal_bytes{%(nodeExporterSelector)s} - ) - ||| % $._config, - }, - { - record: 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - // Disk utilisation (seconds spent, 1 second rate). 
- record: 'instance_device:node_disk_io_time_seconds:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - // Disk saturation (weighted seconds spent, 1 second rate). - record: 'instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - record: 'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - { - record: 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - // TODO: Find out if those drops ever happen on modern switched networks. 
- { - record: 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - { - record: 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - ], - }, - ], - }, -} diff --git a/docs/node-observ-lib/.gitignore b/docs/node-observ-lib/.gitignore deleted file mode 100644 index f9bf6ba815..0000000000 --- a/docs/node-observ-lib/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -jsonnetfile.lock.json -vendor diff --git a/docs/node-observ-lib/g.libsonnet b/docs/node-observ-lib/g.libsonnet deleted file mode 100644 index 6da9f4eef9..0000000000 --- a/docs/node-observ-lib/g.libsonnet +++ /dev/null @@ -1 +0,0 @@ -import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet' diff --git a/docs/node-observ-lib/jsonnetfile.json b/docs/node-observ-lib/jsonnetfile.json deleted file mode 100644 index b12b5dc0af..0000000000 --- a/docs/node-observ-lib/jsonnetfile.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet.git", - "subdir": "gen/grafonnet-v10.0.0" - } - }, - "version": "main" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "common-lib" - } - }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "logs-lib" - } - }, - "version": "master" - } - ], - "legacyImports": true -} \ No newline at end of file diff --git a/docs/node-observ-lib/linux/dashboards.libsonnet b/docs/node-observ-lib/linux/dashboards.libsonnet deleted file mode 100644 index a76bf73908..0000000000 --- 
a/docs/node-observ-lib/linux/dashboards.libsonnet +++ /dev/null @@ -1,233 +0,0 @@ -local g = import '../g.libsonnet'; -local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; -{ - local root = self, - new(this): - local prefix = this.config.dashboardNamePrefix; - local links = this.grafana.links; - local tags = this.config.dashboardTags; - local uid = g.util.string.slugify(this.config.uid); - local vars = this.grafana.variables; - local annotations = this.grafana.annotations; - local refresh = this.config.dashboardRefresh; - local period = this.config.dashboardPeriod; - local timezone = this.config.dashboardTimezone; - local panels = this.grafana.panels; - local stat = g.panel.stat; - { - fleet: - local title = prefix + 'fleet overview'; - g.dashboard.new(title) - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - // g.panel.row.new("Overview"), - panels.fleetOverviewTable { gridPos+: { w: 24, h: 16 } }, - panels.cpuUsageTopk { gridPos+: { w: 24 } }, - panels.memotyUsageTopKPercent { gridPos+: { w: 24 } }, - panels.diskIOutilPercentTopK { gridPos+: { w: 12 } }, - panels.diskUsagePercentTopK { gridPos+: { w: 12 } }, - panels.networkErrorsAndDroppedPerSecTopK { gridPos+: { w: 24 } }, - ], 12, 7 - ) - ) - // hide link to self - + root.applyCommon(vars.multiInstance, uid + '-fleet', tags, links { backToFleet+:: {}, backToOverview+:: {} }, annotations, timezone, refresh, period), - overview: - g.dashboard.new(prefix + 'overview') - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - g.panel.row.new('Overview'), - panels.uptime, - panels.hostname, - panels.kernelVersion, - panels.osInfo, - panels.cpuCount, - panels.memoryTotalBytes, - panels.memorySwapTotalBytes, - panels.diskTotalRoot, - g.panel.row.new('CPU'), - panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, - panels.cpuUsageTsPerCore { gridPos+: { w: 12, h: 6 } }, - panels.systemLoad { gridPos+: { w: 6, h: 6 } }, - g.panel.row.new('Memory'), - 
panels.memoryUsageStatPercent { gridPos+: { w: 6, h: 6 } }, - panels.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, - g.panel.row.new('Disk'), - panels.diskIOBytesPerSec { gridPos+: { w: 12, h: 8 } }, - panels.diskUsage { gridPos+: { w: 12, h: 8 } }, - g.panel.row.new('Network'), - panels.networkUsagePerSec { gridPos+: { w: 12, h: 8 } }, - panels.networkErrorsAndDroppedPerSec { gridPos+: { w: 12, h: 8 } }, - - ] - + - if this.config.enableHardware then - [ - g.panel.row.new('Hardware'), - panels.hardwareTemperature { gridPos+: { w: 12, h: 8 } }, - ] else [] - , 6, 2 - ) - ) - // defaults to uid=nodes for backward compatibility with old node-mixins - + root.applyCommon(vars.singleInstance, (if uid == 'node' then 'nodes' else uid + '-overview'), tags, links { backToOverview+:: {} }, annotations, timezone, refresh, period), - network: - g.dashboard.new(prefix + 'network') - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - g.panel.row.new('Network'), - panels.networkOverviewTable { gridPos: { w: 24 } }, - panels.networkUsagePerSec, - panels.networkOperStatus, - panels.networkErrorsPerSec, - panels.networkDroppedPerSec, - panels.networkPacketsPerSec, - panels.networkMulticastPerSec, - panels.networkFifo, - panels.networkCompressedPerSec, - panels.networkNFConntrack, - panels.networkSoftnet, - panels.networkSoftnetSqueeze, - g.panel.row.new('Network sockets'), - panels.networkSockstatAll { gridPos: { w: 24 } }, - panels.networkSockstatTCP, - panels.networkSockstatUDP, - panels.networkSockstatMemory, - panels.networkSockstatOther, - g.panel.row.new('Network netstat'), - panels.networkNetstatIP { gridPos: { w: 24 } }, - panels.networkNetstatTCP, - panels.networkNetstatTCPerrors, - panels.networkNetstatUDP, - panels.networkNetstatUDPerrors, - panels.networkNetstatICMP, - panels.networkNetstatICMPerrors, - ], 12, 8 - ) - ) - + root.applyCommon(vars.singleInstance, uid + '-network', tags, links, annotations, timezone, refresh, period), - memory: - 
g.dashboard.new(prefix + 'memory') - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - panels.memoryUsageStatPercent { gridPos+: { w: 6, h: 6 } }, - panels.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, - g.panel.row.new('Vmstat'), - panels.memoryPagesInOut, - panels.memoryPagesSwapInOut, - panels.memoryPagesFaults, - panels.memoryOOMkiller, - g.panel.row.new('Memstat'), - panels.memoryActiveInactive, - panels.memoryActiveInactiveDetail, - panels.memoryCommited, - panels.memorySharedAndMapped, - panels.memoryWriteAndDirty, - panels.memoryVmalloc, - panels.memorySlab, - panels.memoryAnonymous, - panels.memoryHugePagesCounter, - panels.memoryHugePagesSize, - panels.memoryDirectMap, - panels.memoryBounce, - ], 12, 8 - ) - ) - + root.applyCommon(vars.singleInstance, uid + '-memory', tags, links, annotations, timezone, refresh, period), - - system: - g.dashboard.new(prefix + 'CPU and system') - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - g.panel.row.new('System'), - panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, - panels.cpuUsageTsPerCore { gridPos+: { w: 9, h: 6 } }, - panels.cpuUsageByMode { gridPos+: { w: 9, h: 6 } }, - panels.systemLoad, - panels.systemContextSwitchesAndInterrupts, - g.panel.row.new('Time'), - panels.osTimezone { gridPos+: { w: 3, h: 4 } }, - panels.timeNtpStatus { gridPos+: { x: 0, y: 0, w: 21, h: 4 } }, - panels.timeSyncDrift { gridPos+: { w: 24, h: 7 } }, - ], 12, 7 - ) - ) - + root.applyCommon(vars.singleInstance, uid + '-system', tags, links, annotations, timezone, refresh, period), - - disks: - g.dashboard.new(prefix + 'filesystem and disks') - + g.dashboard.withPanels( - g.util.grid.wrapPanels( - [ - g.panel.row.new('Filesystem'), - panels.diskFreeTs, - panels.diskUsage, - panels.diskInodesFree, - panels.diskInodesTotal, - panels.diskErrorsandRO, - panels.fileDescriptors, - g.panel.row.new('Disk'), - panels.diskIOBytesPerSec, - panels.diskIOps, - panels.diskIOWaitTime, - panels.diskQueue, - ], 12, 8 - ) - ) - + 
root.applyCommon(vars.singleInstance, uid + '-disk', tags, links, annotations, timezone, refresh, period), - } - + - if this.config.enableLokiLogs - then - { - logs: - logslib.new( - prefix + 'logs', - datasourceName=this.grafana.variables.datasources.loki.name, - datasourceRegex=this.grafana.variables.datasources.loki.regex, - filterSelector=this.config.logsFilteringSelector, - labels=this.config.groupLabels + this.config.instanceLabels + this.config.extraLogLabels, - formatParser=null, - showLogsVolume=this.config.showLogsVolume, - logsVolumeGroupBy=this.config.logsVolumeGroupBy, - extraFilters=this.config.logsExtraFilters - ) - { - dashboards+: - { - logs+: - // reference to self, already generated variables, to keep them, but apply other common data in applyCommon - root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links, annotations=annotations, timezone=timezone, refresh=refresh, period=period), - }, - panels+: - { - // modify log panel - logs+: - g.panel.logs.options.withEnableLogDetails(true) - + g.panel.logs.options.withShowTime(false) - + g.panel.logs.options.withWrapLogMessage(false), - }, - variables+: { - // add prometheus datasource for annotations processing - toArray+: [ - this.grafana.variables.datasources.prometheus { hide: 2 }, - ], - }, - }.dashboards.logs, - } - else {}, - applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): - g.dashboard.withTags(tags) - + g.dashboard.withUid(uid) - + g.dashboard.withLinks(std.objectValues(links)) - + g.dashboard.withTimezone(timezone) - + g.dashboard.withRefresh(refresh) - + g.dashboard.time.withFrom(period) - + g.dashboard.withVariables(vars) - + g.dashboard.withAnnotations(std.objectValues(annotations)), -} diff --git a/docs/node-observ-lib/linux/main.libsonnet b/docs/node-observ-lib/linux/main.libsonnet deleted file mode 100644 index 0fb13f70d2..0000000000 --- a/docs/node-observ-lib/linux/main.libsonnet +++ /dev/null @@ -1,39 +0,0 @@ -local alerts = 
import './alerts.libsonnet'; -local annotations = import './annotations.libsonnet'; -local config = import './config.libsonnet'; -local dashboards = import './dashboards.libsonnet'; -local datasources = import './datasources.libsonnet'; -local g = import './g.libsonnet'; -local links = import './links.libsonnet'; -local panels = import './panels.libsonnet'; -local rules = import './rules.libsonnet'; -local targets = import './targets.libsonnet'; -local variables = import './variables.libsonnet'; -local commonlib = import 'common-lib/common/main.libsonnet'; - -{ - withConfigMixin(config): { - config+: config, - }, - - new(): { - - local this = self, - config: config, - grafana: { - variables: variables.new(this), - targets: targets.new(this), - panels: panels.new(this), - annotations: annotations.new(this), - // common links here used across all dashboards - links: links.new(this), - dashboards: dashboards.new(this), - }, - - prometheus: { - alerts: alerts.new(this), - recordingRules: rules.new(this), - }, - - }, -} diff --git a/docs/node-observ-lib/linux/panels.libsonnet b/docs/node-observ-lib/linux/panels.libsonnet deleted file mode 100644 index cf185ca53b..0000000000 --- a/docs/node-observ-lib/linux/panels.libsonnet +++ /dev/null @@ -1,1148 +0,0 @@ -local g = import '../g.libsonnet'; -local commonlib = import 'common-lib/common/main.libsonnet'; -local utils = commonlib.utils; -{ - new(this): - { - local t = this.grafana.targets, - local table = g.panel.table, - local fieldOverride = g.panel.table.fieldOverride, - local instanceLabel = this.config.instanceLabels[0], - fleetOverviewTable: - commonlib.panels.generic.table.base.new( - 'Fleet overview', - targets= - [ - t.osInfoCombined - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('OS Info'), - t.uptime - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Uptime'), - t.systemLoad1 - 
+ g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Load 1'), - t.cpuCount - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Cores'), - t.cpuUsage - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('CPU usage'), - t.memoryTotalBytes - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Memory total'), - t.memoryUsagePercent - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Memory usage'), - t.diskTotalRoot - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Root mount size'), - t.diskUsageRootPercent - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Root mount used'), - t.alertsCritical - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('CRITICAL'), - t.alertsWarning - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('WARNING'), - ], - description="All nodes' perfomance at a glance." 
- ) - + g.panel.table.options.withFooter( - value={ - reducer: ['sum'], - show: true, - fields: [ - 'Value #Cores', - 'Value #Load 1', - 'Value #Memory total', - 'Value #Root mount size', - ], - } - ) - + commonlib.panels.system.table.uptime.stylizeByName('Uptime') - + table.standardOptions.withOverridesMixin([ - fieldOverride.byRegexp.new('Product|^Hostname$') - + fieldOverride.byRegexp.withProperty('custom.filterable', true), - fieldOverride.byName.new('Instance') - + fieldOverride.byName.withProperty('custom.filterable', true) - + fieldOverride.byName.withProperty('links', [ - { - targetBlank: false, - title: 'Drill down to ${__field.name} ${__value.text}', - url: 'd/%s?var-%s=${__data.fields.%s}&${__url_time_range}&${datasource:queryparam}' % [this.grafana.dashboards.overview.uid, instanceLabel, instanceLabel], - }, - ]), - fieldOverride.byRegexp.new(std.join('|', std.map(utils.toSentenceCase, this.config.groupLabels))) - + fieldOverride.byRegexp.withProperty('custom.filterable', true) - + fieldOverride.byRegexp.withProperty('links', [ - { - targetBlank: false, - title: 'Filter by ${__field.name}', - url: 'd/%s?var-${__field.name}=${__value.text}&${__url_time_range}&${datasource:queryparam}' % [this.grafana.dashboards.fleet.uid], - }, - ]), - fieldOverride.byName.new('Cores') - + fieldOverride.byName.withProperty('custom.width', '120'), - fieldOverride.byName.new('CPU usage') - + fieldOverride.byName.withProperty('custom.width', '120') - + fieldOverride.byName.withProperty('custom.displayMode', 'basic') - + fieldOverride.byName.withPropertiesFromOptions( - commonlib.panels.cpu.timeSeries.utilization.stylize() - ), - fieldOverride.byName.new('Memory total') - + fieldOverride.byName.withProperty('custom.width', '120') - + fieldOverride.byName.withPropertiesFromOptions( - table.standardOptions.withUnit('bytes') - ), - fieldOverride.byName.new('Memory usage') - + fieldOverride.byName.withProperty('custom.width', '120') - + 
fieldOverride.byName.withProperty('custom.displayMode', 'basic') - + fieldOverride.byName.withPropertiesFromOptions( - commonlib.panels.cpu.timeSeries.utilization.stylize() - ), - fieldOverride.byName.new('Root mount size') - + fieldOverride.byName.withProperty('custom.width', '120') - + fieldOverride.byName.withPropertiesFromOptions( - table.standardOptions.withUnit('bytes') - ), - fieldOverride.byName.new('Root mount used') - + fieldOverride.byName.withProperty('custom.width', '120') - + fieldOverride.byName.withProperty('custom.displayMode', 'basic') - + fieldOverride.byName.withPropertiesFromOptions( - table.standardOptions.withUnit('percent') - ) - + fieldOverride.byName.withPropertiesFromOptions( - commonlib.panels.cpu.timeSeries.utilization.stylize() - ), - ]) - + table.queryOptions.withTransformationsMixin( - [ - { - id: 'joinByField', - options: { - byField: instanceLabel, - mode: 'outer', - }, - }, - { - id: 'filterFieldsByName', - options: { - include: { - //' 1' - would only match first occurence of group label, so no duplicates - pattern: instanceLabel + '|' - + - std.join( - '|', - std.map( - function(x) '%s 1' % x, this.config.instanceLabels - ) - ) - + '|' + - std.join( - '|', - std.map( - function(x) '%s 1' % x, this.config.groupLabels - ) - ) - + '|product|^hostname$|^nodename$|^pretty_name$|Value.+', - }, - }, - }, - { - id: 'organize', - options: { - excludeByName: { - 'Value #OS Info': true, - }, - indexByName: - { - [instanceLabel]: 0, - nodename: 1, - hostname: 1, - pretty_name: 2, - product: 2, - } - + - // group labels are named as 'job 1' and so on. 
- { - [label]: 3 - for label in this.config.groupLabels - }, - renameByName: - { - [label + ' 1']: utils.toSentenceCase(label) - for label in this.config.instanceLabels - } - { - [instanceLabel]: utils.toSentenceCase(instanceLabel), - product: 'OS', // windows - pretty_name: 'OS', // linux - hostname: 'Hostname', // windows - nodename: 'Hostname', // Linux - } - + - // group labels are named as 'job 1' and so on. - { - [label + ' 1']: utils.toSentenceCase(label) - for label in this.config.groupLabels - }, - - }, - }, - { - id: 'renameByRegex', - options: { - regex: 'Value #(.*)', - renamePattern: '$1', - }, - }, - ] - ), - uptime: commonlib.panels.system.stat.uptime.new(targets=[t.uptime]), - - systemLoad: - commonlib.panels.system.timeSeries.loadAverage.new( - loadTargets=[t.systemLoad1, t.systemLoad5, t.systemLoad15], - cpuCountTarget=t.cpuCount, - ), - - systemContextSwitchesAndInterrupts: - commonlib.panels.generic.timeSeries.base.new( - 'Context switches/Interrupts', - targets=[ - t.systemContextSwitches, - t.systemInterrupts, - ], - description=||| - Context switches occur when the operating system switches from running one process to another. Interrupts are signals sent to the CPU by external devices to request its attention. - - A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. - ||| - ), - - timeNtpStatus: - commonlib.panels.system.statusHistory.ntp.new( - 'NTP status', - targets=[t.timeNtpStatus], - description='Status of time synchronization.' 
- ) - + g.panel.timeSeries.standardOptions.withNoValue('No data.') - + g.panel.statusHistory.options.withLegend(false), - timeSyncDrift: - commonlib.panels.generic.timeSeries.base.new( - 'Time synchronized drift', - targets=[ - t.timeEstimatedError, - t.timeOffset, - t.timeMaxError, - ], - description=||| - Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. - ||| - ) - + g.panel.timeSeries.standardOptions.withUnit('seconds') - + g.panel.timeSeries.standardOptions.withNoValue('No data.'), - cpuCount: commonlib.panels.cpu.stat.count.new(targets=[t.cpuCount]), - cpuUsageTsPerCore: commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.cpuUsagePerCore]) - + g.panel.timeSeries.fieldConfig.defaults.custom.withStacking({ mode: 'normal' }), - - cpuUsageTopk: commonlib.panels.generic.timeSeries.topkPercentage.new( - title='CPU usage', - target=t.cpuUsage, - topk=25, - instanceLabels=this.config.instanceLabels, - drillDownDashboardUid=this.grafana.dashboards.overview.uid, - ), - cpuUsageStat: commonlib.panels.cpu.stat.usage.new(targets=[t.cpuUsage]), - cpuUsageByMode: commonlib.panels.cpu.timeSeries.utilizationByMode.new( - targets=[t.cpuUsageByMode], - description=||| - - System: Processes executing in kernel mode. - - User: Normal processes executing in user mode. - - Nice: Niced processes executing in user mode. - - Idle: Waiting for something to happen. - - Iowait: Waiting for I/O to complete. - - Irq: Servicing interrupts. - - Softirq: Servicing softirqs. - - Steal: Time spent in other operating systems when running in a virtualized environment. 
- ||| - ), - - memoryTotalBytes: commonlib.panels.memory.stat.total.new(targets=[t.memoryTotalBytes]), - memorySwapTotalBytes: - commonlib.panels.memory.stat.total.new( - 'Total swap', - targets=[t.memorySwapTotal], - description=||| - Total swap available. - - Swap is a space on a storage device (usually a dedicated swap partition or a swap file) - used as virtual memory when the physical RAM (random-access memory) is fully utilized. - Swap space helps prevent memory-related performance issues by temporarily transferring less-used data from RAM to disk, - freeing up physical memory for active processes and applications. - ||| - ), - memoryUsageStatPercent: commonlib.panels.memory.stat.usage.new(targets=[t.memoryUsagePercent]), - memotyUsageTopKPercent: commonlib.panels.generic.timeSeries.topkPercentage.new( - title='Memory usage', - target=t.memoryUsagePercent, - topk=25, - instanceLabels=this.config.instanceLabels, - drillDownDashboardUid=this.grafana.dashboards.overview.uid, - ), - memoryUsageTsBytes: - commonlib.panels.memory.timeSeries.usageBytes.new( - targets=[ - t.memoryUsedBytes, - t.memoryCachedBytes, - t.memoryAvailableBytes, - t.memoryBuffersBytes, - t.memoryFreeBytes, - t.memoryTotalBytes, - ], - description= - ||| - - Used: The amount of physical memory currently in use by the system. - - Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. - - Free: The amount of physical memory that is currently not in use. - - Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. - - Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
- ||| - ) - + g.panel.timeSeries.standardOptions.withOverridesMixin( - { - __systemRef: 'hideSeriesFrom', - matcher: { - id: 'byNames', - options: { - mode: 'exclude', - names: [ - t.memoryTotalBytes.legendFormat, - t.memoryUsedBytes.legendFormat, - ], - prefix: 'All except:', - readOnly: true, - }, - }, - properties: [ - { - id: 'custom.hideFrom', - value: { - viz: true, - legend: false, - tooltip: false, - }, - }, - ], - } - ), - - memoryPagesInOut: - commonlib.panels.memory.timeSeries.base.new( - 'Memory pages in / out', - targets=[t.memoryPagesIn, t.memoryPagesOut], - description=||| - Page-In - Return of pages to physical memory. This is a common and normal event. - - Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. - When the kernel detects low memory, it attempts to free memory by paging out. - While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. - Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. - ||| - ) - + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), - - memoryPagesSwapInOut: - commonlib.panels.memory.timeSeries.base.new( - 'Memory pages swapping in / out', - targets=[t.memoryPagesSwapIn, t.memoryPagesSwapOut], - description=||| - Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. - Nonetheless, it is a preferable option to crashing or killing off processes. - - The process of writing pages out to disk to free memory is known as swapping-out. - If a page fault occurs because the page is on disk, in the swap area, rather than in memory, - the kernel will read the page back in from the disk to satisfy the page fault. - This is known as swapping-in. 
- ||| - ) - + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), - - memoryPagesFaults: - commonlib.panels.memory.timeSeries.base.new( - 'Memory page faults', - targets=[t.memoryPageMajorFaults, t.memoryPageMinorFaults], - description=||| - A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, - requiring a mapping to be added to the process's virtual address space. - - The page contents may also need to be loaded from a backing store such as a disk. - While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. - Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. - |||, - ), - - memoryOOMkiller: - commonlib.panels.memory.timeSeries.base.new( - 'OOM Killer', - targets=[t.memoryOOMkiller], - description=||| - Out Of Memory killer is a process used by the Linux kernel when the system is running critically low on memory. - - This can happen when the kernel has allocated more memory than is available for its processes. - ||| - ), - - memoryActiveInactive: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory active / inactive', - targets=[t.memoryActiveBytes, t.memoryInactiveBytes], - description=||| - - Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. - - Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. - |||, - ), - - memoryActiveInactiveDetail: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory active / inactive details', - targets=[t.memoryInactiveFile, t.memoryInactiveAnon, t.memoryActiveFile, t.memoryActiveAnon], - description=||| - - Inactive_file: File-backed memory on inactive LRU list. 
- - Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). - - Active_file: File-backed memory on active LRU list. - - Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. - |||, - ), - - memoryCommited: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory commited', - targets=[t.memoryCommitedAs, t.memoryCommitedLimit], - description=||| - - Committed_AS - Amount of memory presently allocated on the system. - - CommitLimit - Amount of memory currently available to be allocated on the system. - ||| - ), - - memorySharedAndMapped: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory shared and mapped', - targets=[t.memoryMappedBytes, t.memoryShmemBytes, t.memoryShmemPmdMappedBytes, t.memoryShmemHugePagesBytes], - description=||| - - Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. - - Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. - - ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. - - ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. - ||| - ), - memoryWriteAndDirty: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory writeback and dirty', - targets=[t.memoryWriteback, t.memoryWritebackTmp, t.memoryDirty], - description=||| - - Writeback: This refers to the memory that is currently being actively written back to the disk. - - WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. - - Dirty: This type of memory is waiting to be written back to the disk. 
- ||| - ), - memoryVmalloc: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory Vmalloc', - targets=[t.memoryVmallocChunk, t.memoryVmallocTotal, t.memoryVmallocUsed], - description=||| - Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. - - - VmallocChunk: Largest contiguous block of vmalloc area which is free. - - VmallocTotal: Total size of vmalloc memory area. - - VmallocUsed: Amount of vmalloc area which is used. - ||| - ), - memorySlab: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory slab', - targets=[t.memorySlabSUnreclaim, t.memorySlabSReclaimable], - description=||| - Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. - - The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. - - - SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. - - SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
- ||| - ), - memoryAnonymous: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory slab', - targets=[t.memoryAnonHugePages, t.memoryAnonPages], - description=||| - Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. - - This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. - - Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, - and from Memory Shared with other processes, - which refers to memory regions that can be accessed and modified by multiple processes. - - - AnonHugePages: Memory in anonymous huge pages. - - AnonPages: Memory in user pages not backed by files. - ||| - ), - - memoryHugePagesCounter: - commonlib.panels.memory.timeSeries.base.new( - 'Memory HugePages counter', - targets=[t.memoryHugePages_Free, t.memoryHugePages_Rsvd, t.memoryHugePages_Surp], - description= - ||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. - - - HugePages_Free: Huge pages in the pool that are not yet allocated. - - HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. - - HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. 
- ||| - ), - memoryHugePagesSize: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory HugePages size', - targets=[t.memoryHugePagesTotalSize, t.memoryHugePagesSize], - - description=||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. - ||| - ), - - memoryDirectMap: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory direct map', - targets=[t.memoryDirectMap1G, t.memoryDirectMap2M, t.memoryDirectMap4k], - - description=||| - Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. - ||| - ), - memoryBounce: - commonlib.panels.memory.timeSeries.usageBytes.new( - 'Memory bounce', - targets=[t.memoryBounce], - description=||| - Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. - - To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. - - - Bounce: Memory used for block device bounce buffers. 
- ||| - ), - diskTotalRoot: - commonlib.panels.disk.stat.total.new( - 'Root mount size', - targets=[t.diskTotalRoot], - description=||| - Total capacity on the primary mount point /. - ||| - ), - diskUsage: - commonlib.panels.disk.table.usage.new( - totalTarget= - ( - t.diskTotal - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - ), - freeTarget= - t.diskFree - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true), - groupLabel='mountpoint' - , - description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.' - ), - diskFreeTs: - commonlib.panels.disk.timeSeries.available.new( - 'Filesystem space availabe', - targets=[ - t.diskFree, - ], - description='Filesystem space utilisation in bytes, by mountpoint.' - ), - diskInodesFree: - commonlib.panels.disk.timeSeries.base.new( - 'Free inodes', - targets=[t.diskInodesFree], - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.' - ) - + g.panel.timeSeries.standardOptions.withUnit('short'), - diskInodesTotal: - commonlib.panels.disk.timeSeries.base.new( - 'Total inodes', - targets=[t.diskInodesTotal], - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', - ) - + g.panel.timeSeries.standardOptions.withUnit('short'), - diskErrorsandRO: - commonlib.panels.disk.timeSeries.base.new( - 'Filesystems with errors / read-only', - targets=[ - t.diskDeviceError, - t.diskReadOnly, - ], - description='', - ) - + g.panel.timeSeries.standardOptions.withMax(1), - fileDescriptors: - commonlib.panels.disk.timeSeries.base.new( - 'File descriptors', - targets=[ - t.processMaxFds, - t.processOpenFds, - ], - description=||| - File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. 
- The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. - ||| - ), - diskUsagePercentTopK: commonlib.panels.generic.timeSeries.topkPercentage.new( - title='Disk space usage', - target=t.diskUsagePercent, - topk=25, - instanceLabels=this.config.instanceLabels + ['volume'], - drillDownDashboardUid=this.grafana.dashboards.overview.uid, - ), - diskIOBytesPerSec: commonlib.panels.disk.timeSeries.ioBytesPerSec.new( - targets=[t.diskIOreadBytesPerSec, t.diskIOwriteBytesPerSec, t.diskIOutilization] - ), - diskIOutilPercentTopK: - commonlib.panels.generic.timeSeries.topkPercentage.new( - title='Disk IO', - target=t.diskIOutilization, - topk=25, - instanceLabels=this.config.instanceLabels + ['volume'], - drillDownDashboardUid=this.grafana.dashboards.overview.uid, - ), - diskIOps: - commonlib.panels.disk.timeSeries.iops.new( - targets=[ - t.diskIOReads, - t.diskIOWrites, - ] - ), - - diskQueue: - commonlib.panels.disk.timeSeries.ioQueue.new( - 'Disk average queue', - targets= - [ - t.diskAvgQueueSize, - ] - ), - diskIOWaitTime: commonlib.panels.disk.timeSeries.ioWaitTime.new( - targets=[ - t.diskIOWaitReadTime, - t.diskIOWaitWriteTime, - ] - ), - osInfo: commonlib.panels.generic.stat.info.new( - 'OS', - targets=[t.osInfo], - description='Operating system' - ) - { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, - kernelVersion: - commonlib.panels.generic.stat.info.new('Kernel version', - targets=[t.unameInfo], - description='Kernel version of linux host.') - { options+: { reduceOptions+: { fields: '/^release$/' } } }, - osTimezone: - commonlib.panels.generic.stat.info.new( - 'Timezone', targets=[t.osTimezone], description='Current system timezone.' - ) - { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, - hostname: - commonlib.panels.generic.stat.info.new( - 'Hostname', - targets=[t.unameInfo], - description="System's hostname." 
- ) - { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, - networkErrorsAndDroppedPerSec: - commonlib.panels.network.timeSeries.errors.new( - 'Network errors and dropped packets', - targets=std.map( - function(t) t - { - expr: t.expr + '>0', - }, - [ - t.networkOutErrorsPerSec, - t.networkInErrorsPerSec, - t.networkOutDroppedPerSec, - t.networkInDroppedPerSec, - ] - ), - description=||| - **Network errors**: - - Network errors refer to issues that occur during the transmission of data across a network. - - These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. - - Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. - - **Dropped packets**: - - Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. - - Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. - - Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. - ||| - ) - + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), - networkErrorsAndDroppedPerSecTopK: - commonlib.panels.network.timeSeries.errors.new( - 'Network errors and dropped packets', - targets=std.map( - function(t) t - { - expr: 'topk(25, ' + t.expr + ')>0', - legendFormat: '{{' + this.config.instanceLabels[0] + '}}: ' + std.get(t, 'legendFormat', '{{ nic }}'), - }, - [ - t.networkOutErrorsPerSec, - t.networkInErrorsPerSec, - t.networkOutDroppedPerSec, - t.networkInDroppedPerSec, - ] - ), - description=||| - Top 25. 
- - **Network errors**: - - Network errors refer to issues that occur during the transmission of data across a network. - - These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. - - Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. - - **Dropped packets**: - - Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. - - Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. - - Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. - ||| - ) - + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('points') - + g.panel.timeSeries.fieldConfig.defaults.custom.withPointSize(5), - - networkErrorsPerSec: - commonlib.panels.network.timeSeries.errors.new( - 'Network errors', - targets=[t.networkInErrorsPerSec, t.networkOutErrorsPerSec] - ) - + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), - networkDroppedPerSec: - commonlib.panels.network.timeSeries.dropped.new( - targets=[t.networkInDroppedPerSec, t.networkOutDroppedPerSec] - ) - + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), - networkUsagePerSec: - commonlib.panels.network.timeSeries.traffic.new( - targets=[t.networkInBitPerSecFiltered, t.networkOutBitPerSecFiltered] - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), - networkPacketsPerSec: - commonlib.panels.network.timeSeries.packets.new( - targets=[t.networkInPacketsPerSec, t.networkOutPacketsPerSec] - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), - 
networkMulticastPerSec: - commonlib.panels.network.timeSeries.multicast.new( - 'Multicast packets', - targets=[t.networkInMulticastPacketsPerSec, t.networkOutMulticastPacketsPerSec], - description='Multicast packets received and transmitted.' - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), - - networkFifo: - commonlib.panels.network.timeSeries.packets.new( - 'Network FIFO', - targets=[t.networkFifoInPerSec, t.networkFifoOutPerSec], - description=||| - Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. - It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. - Packets are stored in the FIFO buffer until they can be transmitted or processed further. - ||| - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), - networkCompressedPerSec: - commonlib.panels.network.timeSeries.packets.new( - 'Compressed packets', - targets=[t.networkCompressedInPerSec, t.networkCompressedOutPerSec], - description=||| - - Compressed received: - Number of correctly received compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). - - - Compressed transmitted: - Number of transmitted compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). 
- - https://docs.kernel.org/networking/statistics.html - |||, - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), - networkNFConntrack: - commonlib.panels.generic.timeSeries.base.new( - 'NF conntrack', - targets=[t.networkNFConntrackEntries, t.networkNFConntrackLimits], - description=||| - NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, - enforce firewall rules, perform NAT, and manage network address/port translation. - ||| - ) - + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0), - - networkSoftnet: - commonlib.panels.network.timeSeries.packets.new( - 'Softnet packets', - targets=[t.networkSoftnetProcessedPerSec, t.networkSoftnetDroppedPerSec], - description=||| - Softnet packets are received by the network and queued for processing by the kernel's networking stack. - Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. - ||| - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets('/dropped/') - + g.panel.timeSeries.fieldConfig.defaults.custom.withAxisLabel('Dropped(-) | Processed(+)'), - networkSoftnetSqueeze: - commonlib.panels.network.timeSeries.packets.new( - 'Softnet out of quota', - targets=[t.networkSoftnetSqueezedPerSec], - description=||| - "Softnet out of quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity. - This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred. 
- ||| - ), - networkOperStatus: - commonlib.panels.network.statusHistory.interfaceStatus.new( - 'Network interfaces carrier status', - targets=[t.networkCarrier], - description='Network interfaces carrier status', - ), - networkOverviewTable: - commonlib.panels.generic.table.base.new( - 'Network interfaces overview', - targets= - [ - t.networkUp - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Up'), - t.networkCarrier - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Carrier'), - t.networkOutBitPerSec - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(false) - + g.query.prometheus.withRefId('Transmitted'), - t.networkInBitPerSec - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(false) - + g.query.prometheus.withRefId('Received'), - t.networkArpEntries - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('ARP entries'), - t.networkMtuBytes - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('MTU'), - t.networkSpeedBitsPerSec - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Speed'), - t.networkTransmitQueueLength - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Queue length'), - t.networkInfo - + g.query.prometheus.withFormat('table') - + g.query.prometheus.withInstant(true) - + g.query.prometheus.withRefId('Info'), - ], - description='Network interfaces overview.' 
- ) - + g.panel.table.standardOptions.withOverridesMixin([ - fieldOverride.byName.new('Speed') - + fieldOverride.byName.withPropertiesFromOptions( - table.standardOptions.withUnit('bps') - ), - ]) - + g.panel.table.standardOptions.withOverridesMixin([ - fieldOverride.byRegexp.new('Transmitted|Received') - + fieldOverride.byRegexp.withProperty('custom.displayMode', 'gradient-gauge') - + fieldOverride.byRegexp.withPropertiesFromOptions( - table.standardOptions.withUnit('bps') - + table.standardOptions.color.withMode('continuous-BlYlRd') - + table.standardOptions.withMax(1000 * 1000 * 100) - ), - ]) - + g.panel.table.standardOptions.withOverridesMixin([ - fieldOverride.byRegexp.new('Carrier|Up') - + fieldOverride.byRegexp.withProperty('custom.displayMode', 'color-text') - + fieldOverride.byRegexp.withPropertiesFromOptions( - table.standardOptions.withMappings( - { - type: 'value', - options: { - '0': { - text: 'Down', - color: 'light-red', - index: 0, - }, - '1': { - text: 'Up', - color: 'light-green', - index: 1, - }, - }, - } - ), - ), - ]) - + table.queryOptions.withTransformationsMixin( - [ - { - id: 'joinByField', - options: { - byField: 'device', - mode: 'outer', - }, - }, - { - id: 'filterFieldsByName', - options: { - include: { - pattern: 'device|duplex|address|Value.+', - }, - }, - }, - { - id: 'renameByRegex', - options: { - regex: '(Value) #(.*)', - renamePattern: '$2', - }, - }, - { - id: 'organize', - options: { - excludeByName: { - Info: true, - }, - renameByName: - { - device: 'Interface', - duplex: 'Duplex', - address: 'Address', - }, - }, - }, - { - id: 'organize', - options: { - indexByName: { - Interface: 0, - Up: 1, - Carrier: 2, - Received: 3, - Transmitted: 4, - }, - }, - }, - ] - ), - networkSockstatAll: - commonlib.panels.generic.timeSeries.base.new( - 'Sockets in use', - targets=[t.networkSocketsUsed], - description='Number of sockets currently in use.', - ), - - networkSockstatTCP: - commonlib.panels.generic.timeSeries.base.new( - 'Sockets 
TCP', - targets=[t.networkSocketsTCPAllocated, t.networkSocketsTCPIPv4, t.networkSocketsTCPIPv6, t.networkSocketsTCPOrphans, t.networkSocketsTCPTimeWait], - description=||| - TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. - - Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. - ||| - ), - networkSockstatUDP: - commonlib.panels.generic.timeSeries.base.new( - 'Sockets UDP', - targets=[t.networkSocketsUDPLiteInUse, t.networkSocketsUDPInUse, t.networkSocketsUDPLiteIPv6InUse, t.networkSocketsUDPIPv6InUse], - description=||| - UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. - Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. - ||| - ), - networkSockstatOther: - commonlib.panels.generic.timeSeries.base.new( - 'Sockets other', - targets=[t.networkSocketsFragInUse, t.networkSocketsFragIPv6InUse, t.networkSocketsRawInUse, t.networkSocketsIPv6RawInUse], - description=||| - FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. - - RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. - ||| - ), - networkSockstatMemory: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.generic.timeSeries.base.new( - title='Sockets memory', - targets=[t.networkSocketsTCPMemoryPages, t.networkSocketsUDPMemoryPages, t.networkSocketsTCPMemoryBytes, t.networkSocketsUDPMemoryBytes], - description=||| - Memory currently in use for sockets. 
- |||, - ) - + panel.queryOptions.withMaxDataPoints(100) - + panel.fieldConfig.defaults.custom.withAxisLabel('Pages') - + panel.standardOptions.withOverridesMixin( - panel.standardOptions.override.byRegexp.new('/bytes/') - + override.byType.withPropertiesFromOptions( - panel.standardOptions.withDecimals(2) - + panel.standardOptions.withUnit('bytes') - + panel.fieldConfig.defaults.custom.withDrawStyle('bars') - + panel.fieldConfig.defaults.custom.withStacking(value={ mode: 'normal', group: 'A' }) - + panel.fieldConfig.defaults.custom.withAxisLabel('Bytes') - ) - ), - - networkNetstatIP: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.packets.new( - 'IP octets', - targets=[t.networkNetstatIPInOctetsPerSec, t.networkNetstatIPOutOctetsPerSec], - description='Rate of IP octets received and transmitted.' - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() - + panel.standardOptions.withUnit('oct/s'), - - networkNetstatTCP: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.packets.new( - 'TCP segments', - targets=[t.networkNetstatTCPInSegmentsPerSec, t.networkNetstatTCPOutSegmentsPerSec], - description='Rate of TCP segments received and transmitted.' - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() - + panel.standardOptions.withUnit('seg/s'), - - networkNetstatTCPerrors: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.errors.new( - title='TCP errors rate', - targets=[ - t.networkNetstatTCPOverflowPerSec, - t.networkNetstatTCPListenDropsPerSec, - t.networkNetstatTCPRetransPerSec, - t.networkNetstatTCPRetransSegPerSec, - t.networkNetstatTCPInWithErrorsPerSec, - t.networkNetstatTCPOutWithRstPerSec, - ], - description='Rate of TCP errors.' 
- ) - + panel.standardOptions.withUnit('err/s'), - - networkNetstatUDP: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.packets.new( - 'UDP datagrams', - targets=[ - t.networkNetstatIPInUDPPerSec, - t.networkNetstatIPOutUDPPerSec, - t.networkNetstatIPInUDP6PerSec, - t.networkNetstatIPOutUDP6PerSec, - ], - description='Rate of UDP datagrams received and transmitted.' - ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() - + panel.standardOptions.withUnit('dat/s'), - - networkNetstatUDPerrors: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.errors.new( - title='UDP errors rate', - targets=[ - t.networkNetstatUDPLiteInErrorsPerSec, - t.networkNetstatUDPInErrorsPerSec, - t.networkNetstatUDP6InErrorsPerSec, - t.networkNetstatUDPNoPortsPerSec, - t.networkNetstatUDP6NoPortsPerSec, - t.networkNetstatUDPRcvBufErrsPerSec, - t.networkNetstatUDP6RcvBufErrsPerSec, - t.networkNetstatUDPSndBufErrsPerSec, - t.networkNetstatUDP6SndBufErrsPerSec, - ], - description='Rate of UDP errors.' - ) - + panel.standardOptions.withUnit('err/s'), - - networkNetstatICMP: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.packets.new( - 'ICMP messages', - targets=[ - t.networkNetstatICMPInPerSec, - t.networkNetstatICMPOutPerSec, - t.networkNetstatICMP6InPerSec, - t.networkNetstatICMP6OutPerSec, - ], - description="Rate of ICMP messages, like 'ping', received and transmitted." 
- ) - + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() - + panel.standardOptions.withUnit('msg/s'), - - networkNetstatICMPerrors: - local panel = g.panel.timeSeries; - local override = g.panel.timeSeries.standardOptions.override; - commonlib.panels.network.timeSeries.errors.new( - title='ICMP errors rate', - targets=[ - t.networkNetstatICMPInErrorsPerSec, - t.networkNetstatICM6PInErrorsPerSec, - ], - description='Rate of ICMP messages received and transmitted with errors.' - ) - + panel.standardOptions.withUnit('err/s'), - - hardwareTemperature: - commonlib.panels.hardware.timeSeries.temperature.new( - 'Temperature', - targets=[t.hardwareTemperature] - ), - }, -} diff --git a/docs/node-observ-lib/linux/targets.libsonnet b/docs/node-observ-lib/linux/targets.libsonnet deleted file mode 100644 index cc9dd8e94c..0000000000 --- a/docs/node-observ-lib/linux/targets.libsonnet +++ /dev/null @@ -1,1147 +0,0 @@ -local g = import '../g.libsonnet'; -local prometheusQuery = g.query.prometheus; -local lokiQuery = g.query.loki; - -{ - new(this): { - local variables = this.grafana.variables, - local config = this.config, - local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', - local lokiDatasource = '${' + variables.datasources.loki.name + '}', - uptimeQuery:: 'node_boot_time_seconds', - - reboot: - prometheusQuery.new( - prometheusDatasource, - self.uptimeQuery + '{%(queriesSelector)s}*1000 > $__from < $__to' % variables, - ), - - serviceFailed: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="init.scope"} |= "code=exited, status=1/FAILURE"' % variables - ), - // those events should be rare, so can be shown as annotations - criticalEvents: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, transport="kernel", level="emerg"}' % variables - ), - memoryOOMkiller: - prometheusQuery.new( - prometheusDatasource, - 'increase(node_vmstat_oom_kill{%(queriesSelector)s}[$__interval])' % variables, - ) - + 
prometheusQuery.withLegendFormat('OOM killer invocations'), - - kernelUpdate: - prometheusQuery.new( - prometheusDatasource, - expr=||| - changes( - sum by (%(instanceLabels)s) ( - group by (%(instanceLabels)s,release) (node_uname_info{%(queriesSelector)s}) - ) - [$__interval:1m] offset -$__interval) > 1 - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - - // new interactive session in logs: - sessionOpened: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="systemd-logind.service"}|= "New session"' % variables - ), - sessionClosed: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="systemd-logind.service"} |= "logged out"' % variables - ), - - alertsCritical: - prometheusQuery.new( - prometheusDatasource, - 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - alertsWarning: - prometheusQuery.new( - prometheusDatasource, - 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - - uptime: - prometheusQuery.new( - prometheusDatasource, - 'time() - ' + self.uptimeQuery + '{%(queriesSelector)s}' % variables - ), - cpuCount: - prometheusQuery.new( - prometheusDatasource, - 'count without (cpu) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"})' % variables - ) - + prometheusQuery.withLegendFormat('Cores'), - cpuUsage: - prometheusQuery.new( - prometheusDatasource, - ||| - (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s))) - - - avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, 
mode)(irate(node_cpu_seconds_total{mode='idle',%(queriesSelector)s}[$__rate_interval])))) * 100) - / - count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s)) - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ) - + prometheusQuery.withLegendFormat('CPU usage'), - cpuUsagePerCore: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(queriesSelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"}) - ) * 100 - ||| % variables, - ) - + prometheusQuery.withLegendFormat('CPU {{cpu}}'), - cpuUsageByMode: - prometheusQuery.new( - prometheusDatasource, - ||| - sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval])) - / on(%(instanceLabels)s) - group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval]))) * 100 - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ) - + prometheusQuery.withLegendFormat('{{ mode }}'), - memoryTotalBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemTotal_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory total'), - memoryFreeBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemFree_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory free'), - memoryAvailableBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemAvailable_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory available'), - memoryCachedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Cached_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory cached'), - memoryBuffersBytes: 
- prometheusQuery.new( - prometheusDatasource, - 'node_memory_Buffers_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory buffers'), - memoryUsedBytes: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - node_memory_MemTotal_bytes{%(queriesSelector)s} - - - node_memory_MemFree_bytes{%(queriesSelector)s} - - - node_memory_Buffers_bytes{%(queriesSelector)s} - - - node_memory_Cached_bytes{%(queriesSelector)s} - ) - ||| % variables - ) - + prometheusQuery.withLegendFormat('Memory used'), - memoryUsagePercent: - prometheusQuery.new( - prometheusDatasource, - ||| - 100 - - ( - avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(queriesSelector)s}) / - avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(queriesSelector)s}) - * 100 - ) - ||| - % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - memorySwapTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_SwapTotal_bytes{%(queriesSelector)s}' % variables - ), - memoryPagesIn: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgpgin{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Page-In'), - memoryPagesOut: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgpgout{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Page-Out'), - - memoryPagesSwapIn: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pswpin{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Pages swapped in'), - memoryPagesSwapOut: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pswpout{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Pages swapped out'), - - memoryPageMajorFaults: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval])' 
% variables, - ) - + prometheusQuery.withLegendFormat('Major page fault operations'), - memoryPageMinorFaults: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_vmstat_pgfault{%(queriesSelector)s}[$__rate_interval]) - - - irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval]) - ||| % variables, - ) - + prometheusQuery.withLegendFormat('Minor page fault operations'), - - memoryInactiveBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive'), - memoryActiveBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active'), - - memoryInactiveFile: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_file_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive_file'), - - memoryInactiveAnon: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_anon_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive_anon'), - - memoryActiveFile: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_file_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active_file'), - - memoryActiveAnon: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_anon_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active_anon'), - - memoryCommitedAs: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Committed_AS_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Commited_AS'), - memoryCommitedLimit: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_CommitLimit_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('CommitLimit'), - - memoryMappedBytes: - prometheusQuery.new( - 
prometheusDatasource, - 'node_memory_Mapped_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Mapped'), - memoryShmemBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Shmem_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Shmem'), - memoryShmemHugePagesBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_ShmemHugePages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('ShmemHugePages'), - memoryShmemPmdMappedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_ShmemPmdMapped_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('ShmemPmdMapped'), - memoryWriteback: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Writeback_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Writeback'), - memoryWritebackTmp: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_WritebackTmp_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('WritebackTmp'), - memoryDirty: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Dirty_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Dirty'), - - memoryVmallocChunk: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocChunk_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocChunk'), - memoryVmallocTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocTotal_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocTotal'), - memoryVmallocUsed: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocUsed_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocUsed'), - memorySlabSUnreclaim: - prometheusQuery.new( - prometheusDatasource, - 
'node_memory_SUnreclaim_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('SUnreclaim'), - memorySlabSReclaimable: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_SReclaimable_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('SReclaimable'), - - memoryAnonHugePages: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_AnonHugePages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('AnonHugePages'), - memoryAnonPages: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_AnonPages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('AnonPages'), - - memoryHugePages_Free: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Free{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Free'), - memoryHugePages_Rsvd: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Rsvd{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Rsvd'), - memoryHugePages_Surp: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Surp{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Surp'), - memoryHugePagesTotalSize: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Total{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Huge pages total size'), - memoryHugePagesSize: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Hugepagesize_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Huge page size'), - memoryDirectMap1G: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_DirectMap1G_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('DirectMap1G'), - memoryDirectMap2M: - prometheusQuery.new( - prometheusDatasource, - 
'node_memory_DirectMap2M_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('DirectMap2M'), - memoryDirectMap4k: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_DirectMap4k_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('DirectMap4k'), - memoryBounce: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Bounce_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Bounce'), - - diskTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ), - diskTotalRoot: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_size_bytes{%(queriesSelector)s, mountpoint="/", fstype!="rootfs"}' % variables, - ), - diskUsageRoot: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_avail_bytes{%(queriesSelector)s, mountpoint="/",fstype!="rootfs"}' % variables - ), - diskUsageRootPercent: - prometheusQuery.new( - prometheusDatasource, - '100 - node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}/node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}*100' % variables - ), - diskFree: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} free'), - diskUsagePercent: - prometheusQuery.new( - prometheusDatasource, - '100 - node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}/node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}*100' % variables { fsMountpointSelector: config.fsMountpointSelector, 
fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} used, %'), - - diskInodesFree: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_files_free{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector }, - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes free'), - diskInodesTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_files{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes total'), - diskReadOnly: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_readonly{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} read-only'), - diskDeviceError: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_device_error{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} device error'), - // descriptors - processMaxFds: - prometheusQuery.new( - prometheusDatasource, - 'process_max_fds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Maximum open file descriptors'), - processOpenFds: - prometheusQuery.new( - prometheusDatasource, - 'process_open_fds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Open file descriptors'), - - // disk(device) - diskIOreadBytesPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_read_bytes_total{%(queriesSelector)s, 
%(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} read'), - diskIOwriteBytesPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_written_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} written'), - diskIOutilization: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_io_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} io util'), - diskAvgQueueSize: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_io_time_weighted_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} avg queue'), - - diskIOWaitWriteTime: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_write_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} avg write time'), - diskIOWaitReadTime: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_read_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} avg read time'), - diskIOReads: - prometheusQuery.new( - 
prometheusDatasource, - ||| - irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} reads'), - diskIOWrites: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} writes'), - - unameInfo: - prometheusQuery.new( - prometheusDatasource, - 'node_uname_info{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withFormat('table'), - osInfo: - prometheusQuery.new( - prometheusDatasource, - ||| - node_os_info{%(queriesSelector)s} - ||| % variables, - ) - + prometheusQuery.withFormat('table'), - osInfoCombined: - prometheusQuery.new( - prometheusDatasource, - ||| - node_uname_info{%(queriesSelector)s} - * on (%(groupLabels)s,%(instanceLabels)s) - group_left(pretty_name) - node_os_info{%(queriesSelector)s} - ||| % variables { - instanceLabels: std.join(',', this.config.instanceLabels), - groupLabels: std.join(',', this.config.groupLabels), - }, - ) - + prometheusQuery.withFormat('table'), - - osTimezone: //timezone label - prometheusQuery.new( - prometheusDatasource, - 'node_time_zone_offset_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withFormat('table'), - - systemLoad1: - prometheusQuery.new( - prometheusDatasource, - 'node_load1{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('1m'), - systemLoad5: - prometheusQuery.new( - prometheusDatasource, - 'node_load5{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('5m'), - systemLoad15: - prometheusQuery.new( - prometheusDatasource, - 'node_load15{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('15m'), - - systemContextSwitches: 
- prometheusQuery.new( - prometheusDatasource, - 'irate(node_context_switches_total{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Context switches'), - - systemInterrupts: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_intr_total{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Interrupts'), - - timeNtpStatus: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_sync_status{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('NTP status'), - - timeOffset: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_offset_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Time offset'), - - timeEstimatedError: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_estimated_error_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Estimated error in seconds'), - timeMaxError: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_maxerror_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Maximum error in seconds'), - - networkUp: - prometheusQuery.new( - prometheusDatasource, - 'node_network_up{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('{{device}}'), - networkCarrier: - prometheusQuery.new( - prometheusDatasource, - 'node_network_carrier{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('{{device}}'), - networkArpEntries: - prometheusQuery.new( - prometheusDatasource, - 'node_arp_entries{%(queriesSelector)s}' % variables, - ), - networkMtuBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_network_mtu_bytes{%(queriesSelector)s}' % variables, - ), - networkSpeedBitsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'node_network_speed_bytes{%(queriesSelector)s} * 8' % variables, - ), - networkTransmitQueueLength: - prometheusQuery.new( - 
prometheusDatasource, - 'node_network_transmit_queue_length{%(queriesSelector)s}' % variables, - ), - networkInfo: - prometheusQuery.new( - prometheusDatasource, - 'node_network_info{%(queriesSelector)s}' % variables, - ), - - networkOutBitPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkInBitPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutBitPerSecFiltered: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 - # only show interfaces that had traffic change at least once during selected dashboard interval: - and - increase( - node_network_transmit_bytes_total{%(queriesSelector)s}[$__range] - ) > 0 - ||| % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkInBitPerSecFiltered: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 - # only show interfaces that had traffic change at least once during selected dashboard interval: - and - increase( - node_network_receive_bytes_total{%(queriesSelector)s}[$__range] - ) > 0 - ||| % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - - - networkOutErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} errors transmitted'), - networkInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_errs_total{%(queriesSelector)s}[$__rate_interval])' % 
variables - ) - + prometheusQuery.withLegendFormat('{{ device }} errors received'), - networkOutDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted dropped'), - networkInDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received dropped'), - - networkInPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - - networkInMulticastPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutMulticastPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkFifoInPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkFifoOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ 
device }} transmitted'), - - networkCompressedInPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkCompressedOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - - networkNFConntrackEntries: - prometheusQuery.new( - prometheusDatasource, - 'node_nf_conntrack_entries{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('NF conntrack entries'), - networkNFConntrackLimits: - prometheusQuery.new( - prometheusDatasource, - 'node_nf_conntrack_entries_limit{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('NF conntrack limits'), - - networkSoftnetProcessedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_processed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} processed'), - networkSoftnetDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_dropped_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} dropped'), - networkSoftnetSqueezedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_times_squeezed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} out of quota'), - - networkSocketsUsed: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_sockets_used{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 sockets in use'), - networkSocketsTCPAllocated: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_alloc{%(queriesSelector)s}' % variables - ) 
- + prometheusQuery.withLegendFormat('Allocated'), - networkSocketsTCPIPv6: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 in use'), - networkSocketsTCPIPv4: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 in use'), - networkSocketsTCPOrphans: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_orphan{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Orphan sockets'), - networkSocketsTCPTimeWait: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_tw{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Time wait'), - - networkSocketsUDPLiteInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDPLITE_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 UDPLITE in use'), - networkSocketsUDPInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 UDP in use'), - networkSocketsUDPLiteIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDPLITE6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 UDPLITE in use'), - networkSocketsUDPIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 UDP in use'), - - networkSocketsFragInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_FRAG_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 Frag sockets in use'), - networkSocketsFragIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_FRAG6_inuse{%(queriesSelector)s}' % variables - ) - + 
prometheusQuery.withLegendFormat('IPv6 Frag sockets in use'), - networkSocketsRawInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_RAW_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 Raw sockets in use'), - networkSocketsIPv6RawInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_RAW6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 Raw sockets in use'), - - networkSocketsTCPMemoryPages: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_mem{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory pages allocated for TCP sockets'), - networkSocketsUDPMemoryPages: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_mem{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory pages allocated for UDP sockets'), - - networkSocketsTCPMemoryBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_mem_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory bytes allocated for TCP sockets'), - networkSocketsUDPMemoryBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_mem_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory bytes allocated for UDP sockets'), - - networkNetstatIPInOctetsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_IpExt_InOctets{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('Octets received'), - networkNetstatIPOutOctetsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_IpExt_OutOctets{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('Octets transmitted'), - - networkNetstatTCPInSegmentsPerSec: - prometheusQuery.new( - prometheusDatasource, - 
'irate(node_netstat_Tcp_InSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP received'), - networkNetstatTCPOutSegmentsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_OutSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP transmitted'), - - networkNetstatTCPOverflowPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_ListenOverflows{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP overflow'), - - networkNetstatTCPListenDropsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_ListenDrops{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP ListenDrops - SYNs to LISTEN sockets ignored'), - - networkNetstatTCPRetransPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_TCPSynRetrans{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP SYN rentransmits'), - - networkNetstatTCPRetransSegPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_RetransSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP retransmitted segments, containing one or more previously transmitted octets'), - networkNetstatTCPInWithErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_InErrs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP received with errors'), - - networkNetstatTCPOutWithRstPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_OutRsts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP segments sent with RST flag'), - - networkNetstatIPInUDPPerSec: - prometheusQuery.new( - 
prometheusDatasource, - 'irate(node_netstat_Udp_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP received'), - - networkNetstatIPOutUDPPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP transmitted'), - - networkNetstatIPInUDP6PerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 received'), - - networkNetstatIPOutUDP6PerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 transmitted'), - - //UDP errors - networkNetstatUDPLiteInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_UdpLite_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDPLite InErrors'), - - networkNetstatUDPInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP InErrors'), - networkNetstatUDP6InErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 InErrors'), - networkNetstatUDPNoPortsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP NoPorts'), - networkNetstatUDP6NoPortsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + 
prometheusQuery.withLegendFormat('UDP6 NoPorts'), - networkNetstatUDPRcvBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP receive buffer errors'), - networkNetstatUDP6RcvBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 receive buffer errors'), - networkNetstatUDPSndBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP transmit buffer errors'), - networkNetstatUDP6SndBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 transmit buffer errors'), - - //ICMP - networkNetstatICMPInPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP received'), - networkNetstatICMPOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP transmitted'), - networkNetstatICMP6InPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 received'), - networkNetstatICMP6OutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 transmitted'), - - 
networkNetstatICMPInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 errors'), - networkNetstatICM6PInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 errors'), - - hardwareTemperature: - prometheusQuery.new( - prometheusDatasource, - 'node_hwmon_temp_celsius{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('{{chip}}/{{sensor}}'), - - }, -} diff --git a/docs/node-observ-lib/linux/variables.libsonnet b/docs/node-observ-lib/linux/variables.libsonnet deleted file mode 100644 index e2f1ace8c4..0000000000 --- a/docs/node-observ-lib/linux/variables.libsonnet +++ /dev/null @@ -1,71 +0,0 @@ -// variables.libsonnet -local g = import '../g.libsonnet'; -local var = g.dashboard.variable; -local commonlib = import 'common-lib/common/main.libsonnet'; -local utils = commonlib.utils; - -{ - new( - this - ): { - local filteringSelector = this.config.filteringSelector, - local groupLabels = this.config.groupLabels, - local instanceLabels = this.config.instanceLabels, - local root = self, - local varMetric = 'node_uname_info', - local variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=true) = - local chainVarProto(index, chainVar) = - var.query.new(chainVar.label) - + var.query.withDatasourceFromVariable(root.datasources.prometheus) - + var.query.queryTypes.withLabelValues( - chainVar.label, - '%s{%s}' % [varMetric, chainVar.chainSelector], - ) - + var.query.generalOptions.withLabel(utils.toSentenceCase(chainVar.label)) - + var.query.selectionOptions.withIncludeAll( - value=if (!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true, - customAllValue='.+' - ) - + var.query.selectionOptions.withMulti( - if 
(!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true, - ) - + var.query.refresh.onTime() - + var.query.withSort( - i=1, - type='alphabetical', - asc=true, - caseInsensitive=false - ); - std.mapWithIndex(chainVarProto, utils.chainLabels(groupLabels + instanceLabels, [filteringSelector])), - datasources: { - prometheus: - var.datasource.new('datasource', 'prometheus') - + var.datasource.generalOptions.withLabel('Data source') - + var.datasource.withRegex(''), - loki: - var.datasource.new('loki_datasource', 'loki') - + var.datasource.generalOptions.withLabel('Loki data source') - + var.datasource.withRegex('') - + var.datasource.generalOptions.showOnDashboard.withNothing(), - }, - // Use on dashboards where multiple entities can be selected, like fleet dashboards - multiInstance: - [root.datasources.prometheus] - + variablesFromLabels(groupLabels, instanceLabels, filteringSelector), - // Use on dashboards where only single entity can be selected - singleInstance: - [root.datasources.prometheus] - + variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=false), - - queriesSelector: - '%s,%s' % [ - filteringSelector, - utils.labelsToPromQLSelector(groupLabels + instanceLabels), - ], - } - + if this.config.enableLokiLogs then self.withLokiLogs(this) else {}, - withLokiLogs(this): { - multiInstance+: [this.grafana.variables.datasources.loki], - singleInstance+: [this.grafana.variables.datasources.loki], - }, -} diff --git a/docs/node-observ-lib/macos/config.libsonnet b/docs/node-observ-lib/macos/config.libsonnet deleted file mode 100644 index 49ea6ecc4a..0000000000 --- a/docs/node-observ-lib/macos/config.libsonnet +++ /dev/null @@ -1,59 +0,0 @@ -{ - - // any modular observability library should inlcude as inputs: - // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups - // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts 
and recording rules. - // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. - // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. - // 'uid' - UID to prefix all dashboards original uids - - filteringSelector: 'job="integrations/macos-node"', - groupLabels: ['job'], - instanceLabels: ['instance'], - dashboardNamePrefix: 'MacOS / ', - uid: 'darwin', - - dashboardTags: [self.uid], - - // Select the fstype for filesystem-related queries. If left - // empty, all filesystems are selected. If you have unusual - // filesystem you don't want to include in dashboards and - // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. - fsSelector: 'fstype!=""', - - // Select the mountpoint for filesystem-related queries. If left - // empty, all mountpoints are selected. For example if you have a - // special purpose tmpfs instance that has a fixed size and will - // always be 100% full, but you still want alerts and dashboards for - // other tmpfs instances, you can exclude those by mountpoint prefix - // like so: 'mountpoint!~"/var/lib/foo.*"'. - fsMountpointSelector: 'mountpoint!=""', - - // Select the device for disk-related queries. If left empty, all - // devices are selected. If you have unusual devices you don't - // want to include in dashboards and alerting, you can exclude - // them here, e.g. 'device!="tmpfs"'. 
- diskDeviceSelector: 'device!=""', - dashboardPeriod: 'now-1h', - dashboardTimezone: 'default', - dashboardRefresh: '1m', - - // Alerts to keep from node-observ-lib: - alertsMacKeep: [ - 'NodeFilesystemAlmostOutOfSpace', - 'NodeNetworkReceiveErrs', - 'NodeNetworkTransmitErrs', - 'NodeTextFileCollectorScrapeError', - 'NodeFilesystemFilesFillingUp', - 'NodeFilesystemAlmostOutOfFiles', - ], - // logs lib related - enableLokiLogs: true, - extraLogLabels: ['filename', 'sender'], - logsVolumeGroupBy: 'sender', - showLogsVolume: true, - logsFilteringSelector: self.filteringSelector, - logsExtraFilters: '', - - -} diff --git a/docs/node-observ-lib/macos/panels.libsonnet b/docs/node-observ-lib/macos/panels.libsonnet deleted file mode 100644 index e0cca3131d..0000000000 --- a/docs/node-observ-lib/macos/panels.libsonnet +++ /dev/null @@ -1,38 +0,0 @@ -local g = import '../g.libsonnet'; -local commonlib = import 'common-lib/common/main.libsonnet'; -local utils = commonlib.utils; -{ - new(this): - { - local t = this.grafana.targets, - local table = g.panel.table, - local fieldOverride = g.panel.table.fieldOverride, - local instanceLabel = this.config.instanceLabels[0], - - // override description and targets - memoryUsageTsBytes+: - g.panel.timeSeries.panelOptions.withDescription( - ||| - - Physical memory: Total amount of memory installed in this computer; - - App memory: Physical memory allocated by apps and system processes; - - Wired memory: Physical memory, containing data that cannot be compressed or swapped to disk; - - Compressed memory: Physical memory used to store a compressed version of data that has not been used recently; - - Swap used: Amount of compressed data temporarily moved to disk to make room in memory for more recently used data. 
- ||| - ) - + g.panel.timeSeries.queryOptions.withTargets([ - t.memoryUsedBytes, - t.memoryTotalBytes, - t.memoryAppBytes, - t.memoryWiredBytes, - t.memoryCompressedBytes, - t.memorySwapUsedBytes, - ]) - + commonlib.panels.generic.timeSeries.threshold.stylizeByRegexp('Physical memory'), - - //override reduceOption field to version - osInfo+: - g.panel.timeSeries.panelOptions.withTitle('OS version') - + { options+: { reduceOptions: { fields: '/^version$/' } } }, - }, -} diff --git a/docs/node-observ-lib/macos/targets.libsonnet b/docs/node-observ-lib/macos/targets.libsonnet deleted file mode 100644 index 25efc16572..0000000000 --- a/docs/node-observ-lib/macos/targets.libsonnet +++ /dev/null @@ -1,87 +0,0 @@ -local g = import '../g.libsonnet'; -local prometheusQuery = g.query.prometheus; -local lokiQuery = g.query.loki; - -{ - new(this): { - local variables = this.grafana.variables, - local config = this.config, - local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', - local lokiDatasource = '${' + variables.datasources.loki.name + '}', - - memoryTotalBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_total_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Physical memory'), - - memoryUsedBytes: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - node_memory_internal_bytes{%(queriesSelector)s} - - node_memory_purgeable_bytes{%(queriesSelector)s} + - node_memory_wired_bytes{%(queriesSelector)s} + - node_memory_compressed_bytes{%(queriesSelector)s} - ) - ||| % variables - ) - + prometheusQuery.withLegendFormat('Memory used'), - memoryAppBytes: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - node_memory_internal_bytes{%(queriesSelector)s} - - node_memory_purgeable_bytes{%(queriesSelector)s} - ) - ||| % variables - ) - + prometheusQuery.withLegendFormat('App memory'), - memoryWiredBytes: - prometheusQuery.new( - prometheusDatasource, - 
'node_memory_wired_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Wired memory'), - memoryCompressedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_compressed_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Compressed memory'), - - memoryUsagePercent: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - ( - avg(node_memory_internal_bytes{%(queriesSelector)s}) - - avg(node_memory_purgeable_bytes{%(queriesSelector)s}) + - avg(node_memory_wired_bytes{%(queriesSelector)s}) + - avg(node_memory_compressed_bytes{%(queriesSelector)s}) - ) / - avg(node_memory_total_bytes{%(queriesSelector)s}) - ) - * - 100 - ||| - % variables, - ), - memorySwapTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_swap_total_bytes{%(queriesSelector)s}' % variables - ), - - memorySwapUsedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_swap_used_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Swap used'), - }, -} diff --git a/docs/node-observ-lib/mixin-mac.libsonnet b/docs/node-observ-lib/mixin-mac.libsonnet deleted file mode 100644 index d0b56adf12..0000000000 --- a/docs/node-observ-lib/mixin-mac.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local macoslib = import './macos/main.libsonnet'; -local macos = macoslib.new(); - -{ - grafanaDashboards+:: macos.grafana.dashboards, - prometheusAlerts+:: macos.prometheus.alerts, - prometheusRules+:: macos.prometheus.recordingRules, -} diff --git a/docs/node-observ-lib/mixin.libsonnet b/docs/node-observ-lib/mixin.libsonnet deleted file mode 100644 index 284f307dd4..0000000000 --- a/docs/node-observ-lib/mixin.libsonnet +++ /dev/null @@ -1,16 +0,0 @@ -local nodelib = import './linux/main.libsonnet'; -local linux = - nodelib.new() - + nodelib.withConfigMixin({ - filteringSelector: 'job=~".*node.*"', - groupLabels: ['job'], - instanceLabels: ['instance'], - dashboardNamePrefix: 'Node exporter / ', 
- dashboardTags: ['node-exporter-mixin'], - uid: 'node', - }); -{ - grafanaDashboards+:: linux.grafana.dashboards, - prometheusAlerts+:: linux.prometheus.alerts, - prometheusRules+:: linux.prometheus.recordingRules, -}