diff --git a/stx-metrics/monitoring/README.md b/stx-metrics/monitoring/README.md new file mode 100644 index 0000000..e2cba56 --- /dev/null +++ b/stx-metrics/monitoring/README.md @@ -0,0 +1,44 @@ +# StarlingX metrics + +This project sets up the infrastructure to enable the monitoring of StarlingX systems. This is achieved using [Prometheus](https://prometheus.io) for data gathering, the [Prometheus node_exporter](https://github.com/prometheus/node_exporter) for monitoring and Grafana for data presentation. + +## Installation + +### Requirements + +This project assumes that docker-compose is installed on your system. For more details on how to install docker-compose, [see here](https://docs.docker.com/compose/install/). + +The following python packages are required: `pyyaml` and `paramiko`. + +Also, it is expected that a set of StarlingX systems is configured and reachable over the network. + +### Setup the infrastructure + +Three containers will be launched in this stage. + +1. _nginx_: This is used to serve files to the StarlingX systems. In some deployments these systems don't have Internet access, so all required files are downloaded and then exposed through this web service. +2. _prometheus_: This is the monitoring system that will retrieve data from remote data exporters. +3. _grafana_: The dashboard to present the data. Grafana will connect to Prometheus to get the data. + +Before starting the infrastructure you may want to configure Prometheus. The `config/prometheus.yml` file contains an example of how to configure Prometheus. + +The `setup-infra.sh` script downloads the `node_exporter` and executes `docker-compose` to launch the containers. + +## Setup StarlingX systems. + +The `deployer.py` script is in charge of executing commands on the remote StarlingX systems. The `config.yaml` file is used to provide configuration to this script; here the user name, password, local IP and remote IP addresses should be detailed. 
+ +To run this script just execute: + +``` +python3 deployer.py +``` + +## TODO + - [ ] Configure a storage backend for Prometheus. + - [ ] Find a way to tag data in the `node_exporter` using the `/etc/build.info` data. + - [ ] Identify needed queries for anomaly detection. + - [X] Create a basic docker compose file to launch prometheus and grafana. + - [X] Create a script to download the node exporter and install it in the target system. + - [X] Create a systemd file to configure the node exporter + - [ ] Find a way to auto configure grafana. diff --git a/stx-metrics/monitoring/config.yaml b/stx-metrics/monitoring/config.yaml new file mode 100644 index 0000000..825417d --- /dev/null +++ b/stx-metrics/monitoring/config.yaml @@ -0,0 +1,12 @@ +username: sysadmin +password: St4rlingX* +myip: 192.168.200.3 +hosts: + - 192.168.200.72 + - 192.168.200.76 + - 192.168.200.77 + - 192.168.200.82 + - 192.168.200.83 + - 192.168.200.84 + - 192.168.200.85 + diff --git a/stx-metrics/monitoring/config/nginx.conf b/stx-metrics/monitoring/config/nginx.conf new file mode 100644 index 0000000..11ad95f --- /dev/null +++ b/stx-metrics/monitoring/config/nginx.conf @@ -0,0 +1,29 @@ +master_process on; +pid /dev/null; +user root; +worker_processes 1; + +events { + worker_connections 1024; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + access_log /dev/stdout; + error_log /dev/stderr; + + server { + listen 80; + location / { + root /usr/share/nginx/html; + index index.html; + } + location /api { + alias /usr/share/nginx/html/files; + } + + autoindex on; + } +} + diff --git a/stx-metrics/monitoring/config/prometheus.yml b/stx-metrics/monitoring/config/prometheus.yml new file mode 100644 index 0000000..f24dd9c --- /dev/null +++ b/stx-metrics/monitoring/config/prometheus.yml @@ -0,0 +1,39 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + scrape_interval: 5s + static_configs: + - targets: ['localhost:9090'] +# From here 
add additional systems that you want to +# monitor. You'll need to set the job_name and target IP +# address. + - job_name: 'simplex-controller-0' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.72:9100'] + - job_name: 'duplex-controller-0' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.76:9100'] + - job_name: 'duplex-controller-1' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.77:9100'] + - job_name: 'standard-controller-0' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.82:9100'] + - job_name: 'standard-controller-1' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.83:9100'] + - job_name: 'standard-ext-controller-0' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.84:9100'] + - job_name: 'standard-ext-controller-1' + scrape_interval: 5s + static_configs: + - targets: ['192.168.200.85:9100'] \ No newline at end of file 
diff --git a/stx-metrics/monitoring/deployer.py b/stx-metrics/monitoring/deployer.py
new file mode 100644
index 0000000..ec50ac7
--- /dev/null
+++ b/stx-metrics/monitoring/deployer.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import sys
+import paramiko
+import yaml
+
+
+class SSHConnection:
+    """Run commands on a remote host over SSH, optionally through sudo."""
+
+    def __init__(self, host, username, password):
+        self.host = host
+        self.username = username
+        self.password = password
+        self.ssh = paramiko.SSHClient()
+        # NOTE(review): AutoAddPolicy trusts unknown host keys without
+        # verifying them; consider loading a known_hosts file instead.
+        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+
+    def open(self):
+        """Connect to the host; exit the program if authentication fails."""
+        try:
+            self.ssh.connect(self.host, username=self.username,
+                             password=self.password)
+        except paramiko.AuthenticationException:
+            print("Authentication failure.")
+            sys.exit(1)
+
+    def close(self):
+        """Close the SSH session."""
+        self.ssh.close()
+
+    def command(self, cmd, sudo=False):
+        """Run cmd on the remote host and wait for it to finish.
+
+        With sudo=True the command is wrapped in ``sudo -S`` and the login
+        password is written to its standard input.
+
+        Returns (out, err, retcode): the list of stdout lines, the complete
+        stderr text and the exit status of the remote command.
+        """
+        if sudo:
+            cmd = "sudo -k -S -p '' {}".format(cmd)
+
+        stdin, stdout, stderr = self.ssh.exec_command(cmd)
+        if sudo:
+            stdin.write(self.password + '\n')
+            stdin.flush()
+
+        # Drain both streams before asking for the exit status. Reading
+        # blocks until the remote command closes the stream, so no polling
+        # loop is needed, and paramiko documents that waiting for the
+        # status before reading can hang when the output is large.
+        out = stdout.readlines()
+        err = "".join(stderr.readlines())
+        retcode = stdout.channel.recv_exit_status()
+        return (out, err, retcode)
+
+
+class Config:
+    """Deployment settings read from config.yaml."""
+
+    def __init__(self):
+        # Defaults keep every attribute defined when load() finds no data.
+        self.username = None
+        self.password = None
+        self.hosts = []
+        self.myip = None
+
+    def load(self):
+        """Read config.yaml and fill in username, password, hosts and myip.
+
+        Raises FileNotFoundError if config.yaml is missing and KeyError if
+        one of the four settings is absent from it.
+        """
+        try:
+            with open('config.yaml') as f:
+                lines = f.read()
+        except FileNotFoundError:
+            print("Cannot find configuration file")
+            raise
+
+        # BaseLoader keeps every scalar as a plain string.
+        data = yaml.load(lines, Loader=yaml.BaseLoader)
+        if not data:
+            print("No data loaded from config file")
+            return
+
+        self.username = data['username']
+        self.password = data['password']
+        self.hosts = data['hosts']
+        self.myip = data['myip']
+
+
+def _deploy_host(ssh, host, myip):
+    """Install, enable and start node_exporter on one host.
+
+    Returns True when every step succeeded and False at the first failure.
+    """
+    url = "http://{}:8787".format(myip)
+    # Each step: (progress message or None, command, sudo, failure message).
+    # curl runs with -f so an HTTP error fails the step instead of saving
+    # the error page as the file, and with -sS so stderr carries only the
+    # error text rather than the progress meter.
+    steps = [
+        ("Downloading node_exporter in {}".format(host),
+         "curl -fsS -o /usr/bin/node_exporter {}/node_exporter".format(url),
+         True, "Cannot download node_exporter"),
+        (None, "chmod +x /usr/bin/node_exporter",
+         True, "Cannot make node_exporter executable"),
+        ("Downloading node_exporter.service in {}".format(host),
+         ("curl -fsS -o /etc/systemd/system/node_exporter.service "
+          "{}/node_exporter.service").format(url),
+         True, "Cannot download node_exporter.service"),
+        ("Downloading k8s policy",
+         "curl -fsS -o policy.yaml {}/policy.yaml".format(url),
+         False, "Cannot download k8s policy"),
+        (None, "systemctl daemon-reload",
+         True, "Cannot reload systemd units"),
+        ("Enabling service in {}".format(host),
+         "systemctl enable node_exporter.service",
+         True, "Cannot enable service"),
+        ("Starting service in {}".format(host),
+         "systemctl start node_exporter.service",
+         True, "Cannot start service"),
+        (None,
+         "KUBECONFIG=/etc/kubernetes/admin.conf kubectl apply -f policy.yaml",
+         False, "Cannot apply policy"),
+    ]
+    for message, cmd, sudo, failure in steps:
+        if message:
+            print(message)
+        _, err, retcode = ssh.command(cmd, sudo=sudo)
+        if retcode:
+            print("{}: {}".format(failure, err))
+            return False
+    return True
+
+
+def main():
+    """Deploy node_exporter to every host listed in the configuration.
+
+    Returns 0 when all hosts were set up and 1 otherwise.
+    """
+    c = Config()
+    c.load()
+    if not c.hosts:
+        print("No hosts to configure")
+        return 1
+
+    failed = []
+    for h in c.hosts:
+        ssh = SSHConnection(h, c.username, c.password)
+        ssh.open()
+        try:
+            if not _deploy_host(ssh, h, c.myip):
+                failed.append(h)
+        finally:
+            # Release the session even if a step raised.
+            ssh.close()
+
+    if failed:
+        print("Deployment failed on: {}".format(", ".join(failed)))
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/stx-metrics/monitoring/docker-compose.yml b/stx-metrics/monitoring/docker-compose.yml new file mode 100644 index 0000000..219ba8a --- /dev/null +++ b/stx-metrics/monitoring/docker-compose.yml @@ -0,0 +1,26 @@ +version: '3' + +services: + prometheus: + image: quay.io/prometheus/prometheus:v2.0.0 + volumes: + - ./config/prometheus.yml:/etc/prometheus/prometheus.yml + command: "--config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus" + ports: + - 9090:9090 + environment: + - http_proxy= + - https_proxy= + grafana: + image: grafana/grafana + ports: + - 3000:3000 + depends_on: + - prometheus + fileserver: + image: nginx:alpine + volumes: + - ./config/nginx.conf:/etc/nginx/nginx.conf + - ./downloads:/usr/share/nginx/html + ports: + - "8787:80" diff --git a/stx-metrics/monitoring/downloads/node_exporter.service b/stx-metrics/monitoring/downloads/node_exporter.service new file mode 100644 index 0000000..b6a4bbc --- /dev/null +++ b/stx-metrics/monitoring/downloads/node_exporter.service @@ -0,0 +1,10 @@ +[Unit] +Description=Node Exporter +Wants=network-online.target +After=network-online.target + +[Service] +ExecStart=/usr/bin/node_exporter + +[Install] +WantedBy=default.target \ No newline at end of file diff --git a/stx-metrics/monitoring/downloads/policy.yaml b/stx-metrics/monitoring/downloads/policy.yaml new file mode 100644 index 0000000..1a380ef --- /dev/null +++ b/stx-metrics/monitoring/downloads/policy.yaml @@ -0,0 +1,22 @@ +apiVersion: "crd.projectcalico.org/v1" +kind: GlobalNetworkPolicy +metadata: + name: prom-node-exporter +spec: + selector: "has(iftype) && 
iftype == 'oam'" + order: 100 + applyOnForward: false + types: + - Ingress + - Egress + ingress: + - action: Allow + ipVersion: 4 + protocol: TCP + destination: + ports: [9100] + egress: + - action: Allow + ipVersion: 4 + protocol: TCP + 
diff --git a/stx-metrics/monitoring/setup-infra.sh b/stx-metrics/monitoring/setup-infra.sh
new file mode 100755
index 0000000..c09408c
--- /dev/null
+++ b/stx-metrics/monitoring/setup-infra.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Download node_exporter into downloads/ and launch the monitoring
+# containers (Prometheus, Grafana and the nginx file server).
+#
+# The node_exporter release can be overridden through the
+# NODE_EXPORTER_VERSION environment variable.
+
+set -e
+
+# Relative paths below are resolved from the directory of this script.
+cd "$(dirname "$0")"
+
+VERSION="${NODE_EXPORTER_VERSION:-0.18.1}"
+PKG="node_exporter-${VERSION}.linux-amd64"
+
+echo "Downloading packages"
+
+pushd downloads
+# -f: fail on HTTP errors instead of saving the error page as the tarball.
+curl -f -L -# -o "${PKG}.tar.gz" \
+    "https://github.com/prometheus/node_exporter/releases/download/v${VERSION}/${PKG}.tar.gz"
+
+tar -zxf "${PKG}.tar.gz"
+mv "${PKG}/node_exporter" .
+rm "${PKG}.tar.gz"
+rm -rf "${PKG}"
+
+popd
+
+echo "Starting services"
+
+# "up" creates the containers before starting them; "start" only starts
+# containers that already exist, so it cannot launch them on a first run.
+sudo docker-compose -f docker-compose.yml up -d