From 58c52b605c88188f289dcfefbc7180c8b0fe81a0 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 3 Nov 2022 16:39:53 +1000 Subject: [PATCH] Initial implementation of basic checks (#2) * feat: implemented checks for cpu, memory, disk, systemd service and timer * feat: implement consul report Add github config files. Add tests. --- .github/CODEOWNERS | 2 + .github/ISSUE_TEMPLATE/bug-report.yml | 48 +++ .github/ISSUE_TEMPLATE/feature-request.yml | 31 ++ .github/dependabot.yml | 24 ++ .github/labeler.yml | 11 + .github/release.yml | 18 + .github/workflows/code-analysis.yml | 57 +++ .github/workflows/labeler.yml | 21 ++ .github/workflows/test-package.yml | 59 ++++ CONTRIBUTING.md | 84 +++++ MANIFEST.in | 3 + README.md | 15 +- VERSION | 1 + docs/DEVELOP.md | 17 - pyproject.toml | 101 +++++- requirements-dev.txt | 35 ++ requirements.txt | 5 + setup.cfg | 47 +++ src/server_monitor_agent/__init__.py | 0 src/server_monitor_agent/agent/__init__.py | 0 src/server_monitor_agent/agent/cli.py | 362 +++++++++++++++++++ src/server_monitor_agent/agent/common.py | 166 +++++++++ src/server_monitor_agent/agent/consul.py | 130 +++++++ src/server_monitor_agent/agent/instance.py | 114 ++++++ src/server_monitor_agent/agent/monitor.py | 142 ++++++++ src/server_monitor_agent/agent/service.py | 383 +++++++++++++++++++++ src/server_monitor_agent/agent/slack.py | 8 + src/server_monitor_agent/entry.py | 63 ++++ tests/conftest.py | 21 ++ tests/test_cli.py | 251 ++++++++++++++ 30 files changed, 2178 insertions(+), 41 deletions(-) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE/bug-report.yml create mode 100644 .github/ISSUE_TEMPLATE/feature-request.yml create mode 100644 .github/dependabot.yml create mode 100644 .github/labeler.yml create mode 100644 .github/release.yml create mode 100644 .github/workflows/code-analysis.yml create mode 100644 .github/workflows/labeler.yml create mode 100644 .github/workflows/test-package.yml create mode 100644 CONTRIBUTING.md create 
mode 100644 MANIFEST.in create mode 100644 VERSION delete mode 100644 docs/DEVELOP.md create mode 100644 requirements-dev.txt create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 src/server_monitor_agent/__init__.py create mode 100644 src/server_monitor_agent/agent/__init__.py create mode 100644 src/server_monitor_agent/agent/cli.py create mode 100644 src/server_monitor_agent/agent/common.py create mode 100644 src/server_monitor_agent/agent/consul.py create mode 100644 src/server_monitor_agent/agent/instance.py create mode 100644 src/server_monitor_agent/agent/monitor.py create mode 100644 src/server_monitor_agent/agent/service.py create mode 100644 src/server_monitor_agent/agent/slack.py create mode 100644 src/server_monitor_agent/entry.py create mode 100644 tests/conftest.py create mode 100644 tests/test_cli.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..c672d00 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# These owners will be the default owners for everything in the repo. +* @cofiem diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 0000000..a8dd4bc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,48 @@ +name: Bug Report +description: Create a report to help us improve +title: "[Bug]: " +labels: + - bug +assignees: + - cofiem +body: + - type: textarea + id: description + attributes: + label: Describe the problem + description: A summary of the problem you've seen. + placeholder: Tell us what you see! + value: "e.g. When the program is given these arguments, it does this action I don't want ..." + validations: + required: true + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce the behavior + description: Your step-by-step guide to help use reproduce the problem. + placeholder: Tell us what you see! + value: "e.g. Run using these arguments ... 
Look in at the output in this folder ..." + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected behavior + description: What do you expect to happen? + placeholder: Tell us what you want to see! + value: "e.g. Run using these arguments ... should do this ..." + validations: + required: true + - type: input + id: program-details + attributes: + label: Program Details + description: What operating system and what version of the program is being used? + placeholder: "e.g. OS: [e.g. iOS], Program version [e.g. 2.1]" + validations: + required: false + - type: markdown + attributes: + value: | + Please attach any screenshots or logging output to help explain your problem. + Thanks for taking the time to fill out this bug report! diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 0000000..7ce1ec1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,31 @@ +name: Feature Request +description: Suggest an idea for this project +title: "[Feature]: " +labels: + - enhancement +assignees: + - cofiem +body: + - type: textarea + id: description + attributes: + label: Describe the feature + description: A summary of the new functionality you'd like + placeholder: Tell us what you want to see! + value: "e.g. It would be nice to be able to do this and get this output ..." + validations: + required: true + - type: textarea + id: alternative + attributes: + label: Are there other approaches? + description: Any alternative solutions or features you've considered + placeholder: Tell us what else could be done + value: "e.g. Your program could interact with this other program like this ..." + validations: + required: true + - type: markdown + attributes: + value: | + Please attach any screenshots or other files to help explain your feature. + Thanks for taking the time to fill out this feature request! 
diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..5fd9439 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,24 @@ +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + target-branch: "main" + open-pull-requests-limit: 3 + + # Maintain dependencies for pip packages + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + target-branch: "main" + commit-message: + prefix: "pip stable" + prefix-development: "pip dev" + open-pull-requests-limit: 3 diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000..3997662 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,11 @@ +documentation: + - docs/**/* + +dependencies: + - requirements.txt + - requirements-dev.txt + - pyproject.toml + - MANIFEST.in + +tests: + - tests/**/* diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000..3714950 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,18 @@ +# Automatically generate release notes +# .github/release.yml +# see https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes#configuring-automatically-generated-release-notes + +changelog: + categories: + - title: New Features + labels: + - enhancement + - title: Bug Fixes + labels: + - bug + - title: Documentation changes + labels: + - documentation + - title: Dependency updates + labels: + - dependencies diff --git a/.github/workflows/code-analysis.yml b/.github/workflows/code-analysis.yml new file mode 100644 index 0000000..aba4a44 --- /dev/null +++ b/.github/workflows/code-analysis.yml @@ -0,0 +1,57 @@ +name: Code Analysis + +on: + schedule: + - cron: '04 2 * * 4' + 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ "python" ] + python-version: [ "3.9", "3.10", "3.11" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install dependencies + run: | + echo "::group::Pip dependencies" + python -m pip install --upgrade pip setuptools wheel + python -m pip install --upgrade -r requirements-dev.txt -r requirements.txt + python -m pip install . + echo "::endgroup::" + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + setup-python-dependencies: false + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + + - name: Run pip audit + uses: pypa/gh-action-pip-audit@v1.0.0 + continue-on-error: true + with: + inputs: requirements.txt requirements-dev.txt diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000..ec3e96e --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,21 @@ +name: Labeler + +on: + pull_request_target: + types: [ "opened" ] + branches: [ "main" ] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +jobs: + label: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v4 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml new file mode 100644 index 0000000..30a0f19 --- /dev/null +++ b/.github/workflows/test-package.yml @@ -0,0 +1,59 @@ +name: Test Package + +on: + push: + branches: [ 
"main" ] + pull_request: + branches: [ "main" ] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test_lint: + name: Test and lint + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ "3.9", "3.10", "3.11" ] + permissions: + contents: read + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install dependencies + run: | + echo "::group::Pip dependencies" + python -m pip install --upgrade pip setuptools wheel + python -m pip install --upgrade -r requirements-dev.txt -r requirements.txt + echo "::endgroup::" + + - name: Run pytest coverage + if: matrix.python-version == '3.9' + run: | + echo "::group::Tests - Run tests with coverage" + ( + set -o pipefail + python -X dev -m pytest --doctest-modules \ + --junitxml=artifact-pytest-coverage.xml \ + --cov-report=term-missing:skip-covered --cov=src/ tests/ | tee artifact-pytest-coverage.txt + ) + echo "::endgroup::" + + # run tests using tox + # https://tox.wiki/en/latest/config.html#conf-basepython + # https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#running-tests-with-tox + - name: Run tox + run: | + python -X dev -m tox -e py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..6b8ec72 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,84 @@ +# Server monitor agent contributing guide + +## Development + +Create a virtual environment: + +```bash +python -m venv .venv +``` + +Install runtime dependencies and development dependencies: + +```bash +# Windows +.venv\Scripts\activate.ps1 + +# Linux +source .venv/bin/activate + +# install dependencies +python -m pip install --upgrade pip setuptools wheel +python -m pip install --upgrade -r requirements-dev.txt -r 
requirements.txt + +# check for outdated packages +pip list --outdated +``` + +## Create and upload release + +Generate the distribution package archives. + +```bash +python -X dev -m build +``` + +Upload archives to Test PyPI first. + +```bash +python -X dev -m twine upload --repository testpypi dist/* +``` + +When uploading: + +- for username, use `__token__` +- for password, create a token at https://test.pypi.org/manage/account/#api-tokens + +Go to the [test project page](https://test.pypi.org/project/server-monitor-agent) and check that it looks ok. + +Then create a new virtual environment, install the dependencies, and install from Test PyPI. + +```bash +python -m venv .venv-test +source .venv-test/bin/activate +python -m pip install --upgrade pip setuptools wheel +python -m pip install --upgrade -r requirements.txt + +SERVER_MONITOR_AGENT_VERSION='0.1.0' +pip install --index-url https://test.pypi.org/simple/ --no-deps server-monitor-agent==$SERVER_MONITOR_AGENT_VERSION +``` + +Test the installed package. + +```bash +server-monitor-agent --version +server-monitor-agent --help +server-monitor-agent memory +server-monitor-agent cpu +server-monitor-agent systemd-service +``` + +If the package seems to work as expected, upload it to the live PyPI. + +```bash +python -X dev -m twine upload dist/* +``` + +When uploading: + +- for username, use `__token__` +- for password, create a token at https://pypi.org/manage/account/#api-tokens + +Go to the [live project page](https://pypi.org/project/server-monitor-agent) and check that it looks ok. + +Done! 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..4ba9177 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include VERSION +include requirements.txt +include requirements-dev.txt diff --git a/README.md b/README.md index c0ad47f..7e19e8d 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,5 @@ # server-monitor-agent -A simple Python application for running checks on a server and sending formatted notifications. +A Python application for running checks on a server. This program is an agent that can be run on server instances. -It provides a number of commands to do common tasks. - -## Commands - -### Check - -Gather information about the instance and report the result via -exit code and json-formatted output. - -### Notify - -Send a message to an alerting service. -One message contains details about one service. diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/docs/DEVELOP.md b/docs/DEVELOP.md deleted file mode 100644 index 0918723..0000000 --- a/docs/DEVELOP.md +++ /dev/null @@ -1,17 +0,0 @@ -# Development - -This document outlines the development process. - - -## Set up the local development environment - -TODO: -- Uses pipenv and Pipfile to manage Python packages? - - -## Build and publish a new release - -TODO: -- process? -- automated via github actions, circleci? -- Installed via a pip wheel or self-contained binary? diff --git a/pyproject.toml b/pyproject.toml index 43475c5..68dcfb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,19 +1,13 @@ [build-system] -requires = ["setuptools>=61.0"] +requires = [ + "setuptools>=63.0.0", +] build-backend = "setuptools.build_meta" [project] name = "server-monitor-agent" -version = "0.0.1" -authors = [ - { name="Mark" }, -] -maintainers = [ - { name="Mark" }, -] -description = "A simple Python application for running checks on a server and sending formatted notifications." 
+description = "Utility to run checks on a server and send notifications." readme = "README.md" -license = { file="LICENSE" } requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", @@ -23,7 +17,94 @@ classifiers = [ "Intended Audience :: System Administrators", "Topic :: System :: Monitoring", ] +dynamic = [ + "version", + "dependencies", + "optional-dependencies", +] [project.urls] "Homepage" = "https://github.com/qcif/server-monitor-agent" "Bug Tracker" = "https://github.com/qcif/server-monitor-agent/issues" + + +[project.scripts] +server-monitor-agent = 'server_monitor_agent.entry:main' + +[tool.setuptools.packages.find] +where = [ + "src", +] +# include and exclude accept strings representing glob patterns. +include = [ + "server_monitor_agent*", +] + +[tool.setuptools.dynamic] +version = { file = [ + "VERSION", +] } +dependencies = { file = [ + "requirements.txt", +] } + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = [ + "requirements-dev.txt", +] } + +[tool.pytest.ini_options] +minversion = "7.0" +addopts = "-ra --quiet" +pythonpath = [ + "src", +] +testpaths = [ + "tests", +] + +[tool.coverage.run] +# "Specifying the source option enables coverage.py to report on unexecuted files, +# since it can search the source tree for files that haven’t been measured at all." 
+source = [ + 'src', +] +omit = [ + '*/site-packages/*', + 'tests/*', +] + +[tool.coverage.report] +skip_empty = true + +[tool.coverage.html] +directory = "coverage-html" + +[tool.isort] +profile = "black" +src_paths = [ + "src", +] + +[tool.tox] +legacy_tox_ini = """ +[tox] +isolated_build = True +envlist = py39,py310,py311 + +[testenv] +#recreate = true +deps = + -r requirements.txt + -r requirements-dev.txt +commands = + server-monitor-agent --help + server-monitor-agent --version + coverage run -m pytest --tb=no --durations=5 +""" + +[tool.pydocstyle] +ignore = 'D104' + +[tool.mypy] +ignore_missing_imports = true diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..c23e74f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,35 @@ +# package management +pip==22.2.2 + +# build and upload package +setuptools==65.3.0 +wheel==0.37.1 +build==0.8.0 +twine==4.0.1 + +# testing +pytest==7.1.3 +pytest-mock==3.8.2 +requests-mock==1.10.0 +tblib==1.7.0 +coverage==6.4.4 +tox==3.26.0 + +# linters +black==22.8.0 +flake8==5.0.4 +flake8-annotations-coverage==0.0.6 +flake8-black==0.3.3 +flake8-bugbear==22.9.11 +flake8-comprehensions==3.10.0 +flake8-unused-arguments==0.0.11 +mypy==0.971 +pylint==2.15.2 +pydocstyle==6.1.1 +pyright==1.1.271 +types-dateparser==1.1.4 +types-PyYAML==6.0.11 +types-requests==2.28.10 +types-backports==0.1.3 +types-urllib3==1.26.24 +pyre-check==0.9.15; platform_system != "Windows" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..82b67f2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests==2.28.1 +psutil==5.9.2 +dateparser==1.1.1 +humanize==4.4.0 +tzdata==2022.6 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a8c3630 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,47 @@ +[flake8] +exclude = + .tox, + __pycache__, + .pytest_cache/, + .venv/, + tests/, + build/, + dist/ +max-line-length = 88 +max-complexity = 15 +ignore = +# C901 '' is too complex () + C901 
+# W503 line break before binary operator + W503 +# F841 local variable 'data' is assigned to but never used + F841 +# U100 Unused argument + U100 + +[pylama] +paths = src +skip = */__init__.py +linters = eradicate,mccabe,mypy,pycodestyle,pydocstyle,pyflakes,pylint,isort + +[pylama:pycodestyle] +ignore = D203,D202,D107 + +[pycodestyle] +ignore = +# 1 blank line required before class docstring + D203, +# No blank lines allowed after function docstring + D202, + D107 + +[pylama:isort] +profile = black +src_paths = src + +[isort] +profile = black +src_paths = src + +[pytype] +inputs = server_monitor_agent diff --git a/src/server_monitor_agent/__init__.py b/src/server_monitor_agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/server_monitor_agent/agent/__init__.py b/src/server_monitor_agent/agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/server_monitor_agent/agent/cli.py b/src/server_monitor_agent/agent/cli.py new file mode 100644 index 0000000..79fc5bf --- /dev/null +++ b/src/server_monitor_agent/agent/cli.py @@ -0,0 +1,362 @@ +import argparse +import os +import pathlib +import typing + +from server_monitor_agent.agent import consul, instance, monitor, service, common + + +def memory_usage_cli(args: argparse.Namespace): + time_zone = args.time_zone + threshold = args.threshold + return instance.memory_usage_detail(time_zone, threshold) + + +def cpu_usage_cli(args: argparse.Namespace): + time_zone = args.time_zone + threshold = args.threshold + interval = args.interval + return instance.cpu_usage_detail(time_zone, threshold, interval) + + +def disk_usage_cli(args: argparse.Namespace): + time_zone = args.time_zone + threshold = args.threshold + mount_path = args.mount_path + return instance.disk_usage_detail(time_zone, threshold, mount_path) + + +def systemd_service_cli(args: argparse.Namespace): + time_zone = args.time_zone + name = args.name + load_state = args.expected_load_state + active_state = 
args.expected_active_state + file_state = args.expected_file_state + sub_state = args.expected_sub_state + result_state = args.expected_result_state + max_age_hours = args.max_age_hours + return service.service_detail( + time_zone, + name, + load_state, + active_state, + file_state, + sub_state, + result_state, + max_age_hours, + ) + + +def systemd_timer_cli(args: argparse.Namespace): + time_zone = args.time_zone + name = args.name + load_state = args.expected_load_state + active_state = args.expected_active_state + file_state = args.expected_file_state + sub_state = args.expected_sub_state + result_state = args.expected_result_state + return service.timer_detail( + time_zone, name, load_state, active_state, file_state, sub_state, result_state + ) + + +def consul_report_cli(args: argparse.Namespace): + time_zone = args.time_zone + cloud_name = args.cloud_name + + # get consul connection info and slack url from env vars + http_addr = os.getenv("CONSUL_HTTP_ADDR") or None + http_ssl = os.getenv("CONSUL_HTTP_SSL") or None + http_ssl_verify = os.getenv("CONSUL_HTTP_SSL_VERIFY") or None + ca_cert = os.getenv("CONSUL_CACERT") or None + ca_path = os.getenv("CONSUL_CAPATH") or None + client_cert = os.getenv("CONSUL_CLIENT_CERT") or None + client_key = os.getenv("CONSUL_CLIENT_KEY") or None + conn = consul.ConsulConnection( + http_ssl_enabled=http_ssl == "true", + http_ssl_verify=http_ssl_verify == "true", + http_addr=http_addr, + ca_cert_file=pathlib.Path(ca_cert) if ca_cert else None, + ca_cert_dir=pathlib.Path(ca_path) if ca_path else None, + client_cert=pathlib.Path(client_cert) if client_cert else None, + client_key=pathlib.Path(client_key) if client_key else None, + ) + + slack_url = os.getenv("SLACK_WEBHOOK_URL_CONSUL") + return monitor.consul_checks_to_slack(time_zone, cloud_name, conn, slack_url) + + +def build(): + parser = argparse.ArgumentParser( + prog=common.APP_NAME_DASH, + description="Run a check on the local machine.", + allow_abbrev=False, + ) + + 
parser.add_argument( + "--version", action="version", version=f"%(prog)s {common.get_version()}" + ) + parser.add_argument( + "--debug", + action="store_true", + help="Turn on debug mode.", + ) + subparsers = parser.add_subparsers( + title="Available checks", + description="Specify the check command to run", + metavar="check_command", + dest="subparser_name", + ) + + # subparser: memory + parser_memory_usage = subparsers.add_parser( + "memory", + help="Check the current memory usage.", + ) + add_common_arguments(parser_memory_usage) + parser_memory_usage.add_argument( + "--threshold", + type=int, + default=80, + help="Warn over this percentage use (range 0 - 100, default 80).", + ) + parser_memory_usage.set_defaults(func=memory_usage_cli) + + # subparser: cpu + parser_cpu_usage = subparsers.add_parser( + "cpu", + help="Check the current cpu usage.", + ) + add_common_arguments(parser_cpu_usage) + parser_cpu_usage.add_argument( + "--threshold", + type=int, + default=80, + help="Warn over this percentage use (range 0 - 100, default 80).", + ) + parser_cpu_usage.add_argument( + "--interval", + type=float, + default=2, + help="The number of seconds to wait between CPU samples (range 0 - 10, default 2).", + ) + parser_cpu_usage.set_defaults(func=cpu_usage_cli) + + # subparser: disk + parser_disk_usage = subparsers.add_parser( + "disk", + help="Check the current free space for a disk.", + ) + add_common_arguments(parser_disk_usage) + parser_disk_usage.add_argument( + "--mount-path", + type=pathlib.Path, + default=pathlib.Path("/"), + help="The mount path of the disk to check (default '/').", + ) + parser_disk_usage.add_argument( + "--threshold", + type=int, + default=80, + help="Warn over this percentage used space (range 0 - 100, default 80).", + ) + parser_disk_usage.set_defaults(func=disk_usage_cli) + + # subparser: systemd service + parser_systemd_service = subparsers.add_parser( + "systemd-service", + help="Check the current status of a systemd service.", + ) + 
add_common_arguments(parser_systemd_service) + parser_systemd_service.add_argument( + "name", + help="The name of the service to check.", + ) + add_systemd_unit_arguments( + parser_systemd_service, + [ + "dead", + "condition", + "start-pre", + "start", + "start-post", + "running", + "exited", + "reload", + "stop", + "stop-watchdog", + "stop-sigterm", + "stop-sigkill", + "stop-post", + "final-sigterm", + "final-sigkill", + "failed", + "auto-restart", + "cleaning", + ], + [ + "start-pre", + "start", + "start-post", + "running", + "reload", + "auto-restart", + ], + ) + parser_systemd_service.add_argument( + "--expected-result-state", + action="extend", + nargs="+", + type=str, + default=list(["success"]), + choices=[ + "success", + "protocol", + "timeout", + "exit-code", + "signal", + "core-dump", + "watchdog", + "start-limit-hit", + "resources", + ], + help="The expected result status of the process controlled by this service " + "(default 'success').", + ) + parser_systemd_service.add_argument( + "--max-age-hours", + type=float, + help="The maximum allowed time since the service last changed state.", + ) + parser_systemd_service.set_defaults(func=systemd_service_cli) + + # subparser: systemd timer + parser_systemd_timer = subparsers.add_parser( + "systemd-timer", + help="Check the current status of a systemd timer.", + ) + add_common_arguments(parser_systemd_timer) + parser_systemd_timer.add_argument( + "name", + help="The name of the timer to check.", + ) + add_systemd_unit_arguments( + parser_systemd_timer, + [ + "dead", + "waiting", + "running", + "elapsed", + "failed", + ], + ["waiting", "running"], + ) + parser_systemd_timer.add_argument( + "--expected-result-state", + action="extend", + nargs="+", + type=str, + default=list(["success"]), + choices=[ + "success", + ], + help="The expected result status of the service controlled by this timer " + "(default 'success').", + ) + parser_systemd_timer.set_defaults(func=systemd_timer_cli) + + # subparser: consul 
report + parser_consul_report = subparsers.add_parser( + "consul-report", + help="Report the current state of all consul checks for this datacenter.", + ) + add_common_arguments(parser_consul_report) + parser_consul_report.add_argument( + "cloud_name", + help="The name of the cloud provider.", + ) + parser_consul_report.set_defaults(func=consul_report_cli) + + return parser + + +def add_common_arguments(parser): + parser.add_argument( + "--time_zone", + default="Australia/Brisbane", + help="The timezone to use for dates and times.", + ) + + +def add_systemd_unit_arguments( + parser, sub_state_options: typing.List[str], sub_state_default: typing.List[str] +): + # system stats - find the states using systemctl --state=help + + parser.add_argument( + "--expected-load-state", + action="extend", + nargs="+", + type=str, + default=list(["loaded"]), + choices=[ + "stub", + "loaded", + "not-found", + "bad-setting", + "error", + "merged", + "masked", + ], + help="The expected unit definition state " "(default 'loaded').", + ) + parser.add_argument( + "--expected-file-state", + action="extend", + nargs="+", + type=str, + default=list(["enabled", "enabled-runtime"]), + choices=[ + "enabled", + "enabled-runtime", + "linked", + "linked-runtime", + "masked", + "masked-runtime", + "static", + "disabled", + "indirect", + "generated", + "transient", + "bad", + ], + help="The expected unit file state " "(default 'enabled,enabled-runtime').", + ) + parser.add_argument( + "--expected-active-state", + action="extend", + nargs="+", + type=str, + default=list(["active", "reloading", "activating"]), + choices=[ + "active", + "reloading", + "inactive", + "failed", + "activating", + "deactivating", + "maintenance", + ], + help="The expected unit activation state " + "(default 'active,reloading,activating').", + ) + + parser.add_argument( + "--expected-sub-state", + action="extend", + nargs="+", + type=str, + default=list(sub_state_default), + choices=sub_state_options, + help=f"The expected 
sub-state (default '{sorted(sub_state_default)}').", + ) diff --git a/src/server_monitor_agent/agent/common.py b/src/server_monitor_agent/agent/common.py new file mode 100644 index 0000000..e20cb3b --- /dev/null +++ b/src/server_monitor_agent/agent/common.py @@ -0,0 +1,166 @@ +import abc +import dataclasses +import socket +import subprocess +import typing +from datetime import datetime +from importlib import metadata, resources +from zoneinfo import ZoneInfo + +APP_NAME_DASH = "server-monitor-agent" +APP_NAME_UNDER = "server_monitor_agent" + + +def get_hostname() -> str: + """Get the local hostname.""" + + result = socket.gethostname() + + # TODO: other options to get the hostname + # result = socket.getfqdn() + # result = platform.node() + + return result + + +def get_version() -> typing.Optional[str]: + """Get the version of this package.""" + try: + dist = metadata.distribution(APP_NAME_DASH) + return dist.version + except metadata.PackageNotFoundError: + # ignore error + pass + + try: + with resources.path(APP_NAME_UNDER, "entry.py") as p: + return (p.parent.parent.parent / "VERSION").read_text().strip() + except FileNotFoundError: + # ignore error + pass + + return "(version not available)" + + +def execute_process(args: typing.Sequence[str]): + """Execute a process using the given args.""" + try: + result = subprocess.run( + args, + capture_output=True, + shell=False, + timeout=10, + check=False, + text=True, + ) + # print(f"Process result: {result}", file=sys.stderr) + return result + except FileNotFoundError as e: + raise ValueError(f"Error running '{' '.join(args)}'") from e + + +@dataclasses.dataclass +class CheckReport(abc.ABC): + hostname: str + exit_code: int + time_zone: str + check_type: str + check_name: str + description: str + + def __post_init__(self): + self._timestamp_formatted = datetime.now(ZoneInfo(self.time_zone)).isoformat( + timespec="seconds" + ) + + @property + def timestamp_formatted(self) -> str: + return self._timestamp_formatted + + 
def report_problem(
    time_zone: str,
    check_type: str,
    check_name: str,
    description: str,
    impact: str,
    action: str,
) -> CheckReportProblem:
    """Build a problem-level check report (exit code 2) for the local host."""
    return CheckReportProblem(
        hostname=get_hostname(),
        exit_code=2,
        time_zone=time_zone,
        check_type=check_type,
        check_name=check_name,
        description=description,
        impact=impact,
        action=action,
    )
@dataclasses.dataclass
class ConsulConnection:
    """Connection settings for a consul agent, usable via HTTP API or cli."""

    # Whether the agent's HTTP endpoint uses TLS.
    http_ssl_enabled: bool = True
    # Verify the server certificate (superseded by ca_cert_file when set).
    http_ssl_verify: bool = True
    http_addr: str = "https://localhost:8501"
    data_centre: typing.Optional[str] = None
    ca_cert_file: typing.Optional[pathlib.Path] = None
    ca_cert_dir: typing.Optional[pathlib.Path] = None
    client_cert: typing.Optional[pathlib.Path] = None
    client_key: typing.Optional[pathlib.Path] = None

    @property
    def base_url(self):
        """The consul HTTP API v1 base url."""
        return f"{self.http_addr}/v1"

    def validate(self):
        """Raise ValueError when settings are missing or inconsistent."""
        if not self.http_addr:
            raise ValueError("Consul settings are invalid: must provide http_addr.")

        if self.http_ssl_enabled and not self.http_addr.startswith("https"):
            raise ValueError(
                "Consul settings are inconsistent: ssl is enabled but http_addr does not start with 'https'."
            )

        if not self.http_ssl_enabled and self.http_addr.startswith("https"):
            raise ValueError(
                "Consul settings are inconsistent: ssl is disabled but http_addr starts with 'https'."
            )

        if self.client_cert and not self.client_cert.exists():
            raise ValueError(
                f"Consul client cert file is specified but does not exist: {self.client_cert}."
            )

        if self.client_key and not self.client_key.exists():
            raise ValueError(
                f"Consul client key file is specified but does not exist: {self.client_key}."
            )

    def api(self, path: str) -> requests.Response:
        """GET the given API path; raise ValueError on transport error or non-200."""
        self.validate()

        if self.ca_cert_file and self.http_ssl_enabled:
            verify = str(self.ca_cert_file)
        else:
            verify = self.http_ssl_verify

        if self.client_cert and self.client_key:
            cert = (str(self.client_cert), str(self.client_key))
        else:
            cert = None

        try:
            # BUG FIX: requests has no default timeout, so a hung consul
            # agent blocked the check forever. 10s matches the subprocess
            # timeout used by common.execute_process. A Timeout is a
            # RequestException, so it follows the existing error path.
            req = requests.get(
                f"{self.base_url}/{path}", verify=verify, cert=cert, timeout=10
            )
        except requests.RequestException as e:
            raise ValueError(f"Consul http api {str(e)}") from e

        if req.status_code != 200:
            raise ValueError(f"Consul http api error {req.status_code}: {req.text}")

        return req

    def cli(self, args: typing.List[str]):
        """Run the consul cli with connection flags appended; raise on failure."""
        self.validate()

        cmd_args = [
            "consul",
            *args,
            f"-http-addr={self.http_addr}",
        ]

        if self.client_cert and self.client_key:
            cmd_args.extend(
                [
                    f"-client-cert={str(self.client_cert)}",
                    f"-client-key={str(self.client_key)}",
                ]
            )

        if self.ca_cert_dir:
            cmd_args.append(f"-ca-path={str(self.ca_cert_dir)}")
        if self.ca_cert_file:
            cmd_args.append(f"-ca-file={str(self.ca_cert_file)}")
        if self.data_centre:
            cmd_args.append(f"-datacenter={self.data_centre}")

        result = common.execute_process(cmd_args)

        # Any stderr output is treated as failure, not just a non-zero code.
        if result.returncode != 0 or result.stderr:
            raise ValueError(f"Consul cli error: {result}")

        return result
def memory_usage_detail(time_zone: str, threshold: int) -> common.CheckReport:
    """Get the memory information."""
    usage_percent = psutil.virtual_memory().percent

    # At or below the threshold: report ok (recovery wording for consul).
    if usage_percent <= threshold:
        return common.report_ok(
            time_zone=time_zone,
            check_type="memory",
            check_name="memory",
            description=f"Memory usage was too high (over {threshold}%, now {usage_percent}%).",
            resolution="Memory usage has reduced below the threshold.",
        )

    return common.report_problem(
        time_zone=time_zone,
        check_type="memory",
        check_name="memory",
        description=f"Memory usage is too high ({usage_percent}% is over {threshold}%).",
        impact="The server may become slow or unresponsive.",
        action="Check for processes using excessive memory and "
        "determine why the processes are behaving unexpectedly.",
    )
disk_usage(mount_path: pathlib.Path) -> float: + """Get the usage of the disk mounted at the given path.""" + keys = ["source", "fstype", "size", "used", "avail", "pcent", "file", "target"] + args = [ + "df", + "--exclude-type=devtmpfs", + "--exclude-type=tmpfs", + "--exclude-type=squashfs", + f"--output={','.join(keys)}", + ] + result = common.execute_process(args) + lines = result.stdout.splitlines()[1:] + data = [dict(zip(keys, i.split())) for i in lines] + match_data = [i for i in data if i["target"] == str(mount_path)] + + if len(match_data) != 1: + matched_targets = ", ".join(sorted([i["target"] for i in match_data])) + available_targets = ", ".join(sorted([i["target"] for i in data])) + raise ValueError( + f"Cannot match mount point '{str(mount_path)}' to one mount. \n" + f"It matched {len(match_data)}: '{matched_targets}'. \n" + f"Available mounts: '{available_targets}'." + ) + disk_size = int(match_data[0]["size"]) + disk_used = int(match_data[0]["used"]) + percent = round((disk_used / disk_size) * 100.0, 2) + return percent + + +def disk_usage_detail( + time_zone: str, threshold: int, mount_path: pathlib.Path +) -> common.CheckReport: + result = disk_usage(mount_path=mount_path) + + if result > threshold: + return common.report_problem( + time_zone=time_zone, + check_type="disk", + check_name="disk", + description=f"Disk used space for '{mount_path}' is too high " + f"({result}% is over {threshold}%).", + impact="There may not be enough space for normal operation. 
" + "The disk may fill and cause the server serious issues.", + action="Check for unexpected files, such as logs or exception records, " + "and archive some files to create space.", + ) + else: + return common.report_ok( + time_zone=time_zone, + check_type="disk", + check_name="disk", + description=f"Disk used space for '{mount_path}' was too high " + f"(over {threshold}%, now {result}%).", + resolution="There is now enough disk free space.", + ) diff --git a/src/server_monitor_agent/agent/monitor.py b/src/server_monitor_agent/agent/monitor.py new file mode 100644 index 0000000..020a169 --- /dev/null +++ b/src/server_monitor_agent/agent/monitor.py @@ -0,0 +1,142 @@ +from datetime import datetime +from zoneinfo import ZoneInfo + +from server_monitor_agent.agent import common, consul, slack + + +def consul_checks_to_slack( + time_zone: str, cloud_name: str, conn: consul.ConsulConnection, slack_url: str +): + slack_items, entries = consul_check_report(time_zone, cloud_name, conn) + slack_text = "\n".join(slack_items) + + consul_leader_ipv4_port = consul.consul_api_status_leader(conn) + instance_ipv4 = consul.aws_instance_private_ipv4() + + is_leader = consul_leader_ipv4_port.startswith(instance_ipv4) + + if is_leader: + if slack_url and slack_text: + slack.slack_webhook(slack_url, slack_text) + else: + raise ValueError(f"Invalid slack url '{slack_url or ''}' or text.") + + return common.report_ok( + time_zone=time_zone, + check_type="consul-report", + check_name="consul-report", + description=slack_text, + resolution="Consul report is successfully generated.", + ) + + +def consul_check_report(time_zone: str, cloud_name: str, conn: consul.ConsulConnection): + # checks_cli = consul.consul_cli_watch_checks_any(conn) + checks_api = consul.consul_api_health_checks_any(conn) + checks_sorted = sorted( + checks_api, + key=lambda x: sort_checks(x.get("Node"), x.get("ServiceName"), x.get("Name")), + ) + + report_date = datetime.now(ZoneInfo(time_zone)).strftime( + "%a, %d %b %Y 
at %H:%M:%S %z" + ) + + passing = "passing" + ok = "ok" + error = "error" + + ok_nodes = 0 + error_nodes = 0 + + checks = {} + for check in checks_sorted: + node = check.get("Node") + # check_id = check.get("CheckID") + name = check.get("Name") + status = check.get("Status") + # notes = check.get("Notes") + # output = check.get("Output") + # service_id = check.get("ServiceID") + service_name = check.get("ServiceName") + + if node not in checks: + checks[node] = {ok: {}, error: {}} + + check_status = ok if status == passing else error + + if service_name not in checks[node][check_status]: + checks[node][check_status][service_name] = [] + + checks[node][check_status][service_name].append(name) + + entries = [] + for node, node_data in checks.items(): + ok_items = node_data[ok] + ok_count = len(ok_items) + + error_items = node_data[error] + error_count = len(error_items) + + if error_count < 1: + ok_nodes += 1 + continue + + error_nodes += 1 + + node1, node2 = node.split(".", maxsplit=1) + entries.append( + f"-> *{node1}*.{node2} ({error}: {error_count}, {ok}: {ok_count})" + ) + + services_checks = sorted(error_items.items()) + if len(services_checks) > 5: + rest = services_checks[5:] + rest_service_count = len([k for k, v in rest]) + rest_checks = [i for k, v in rest for i in v] + services_checks = services_checks[0:5] + services_checks.append( + (f"...and {rest_service_count} more services", rest_checks) + ) + + for service, service_checks in services_checks: + service_check = ( + ",".join(sorted(service_checks)) + if len(service_checks) < 4 + else f"{len(service_checks)} checks" + ) + entries.append(f" - {service or '(instance)'}: {service_check}") + + total_nodes = ok_nodes + error_nodes + percent_error = round((error_nodes / total_nodes) * 100.0, 1) + + slack_items = [ + f"*{cloud_name}* Consul Daily Error Report {report_date}", + f"There are {total_nodes} instances, " + f"{error_nodes} have errors ({percent_error}%).", + "", + "These service checks are in a 
_critical_ or _warning_ state:", + "```", + *entries, + "---", + "```", + ] + return slack_items, entries + + +def sort_checks(node: str, service: str, check: str): + key = [] + + if "consul" in node: + key.append("01") + elif "prod" in node: + key.append("02") + elif "test" in node: + key.append("03") + else: + key.append("04") + + key.append(node) + key.append(service) + key.append(check) + return "-".join(key) diff --git a/src/server_monitor_agent/agent/service.py b/src/server_monitor_agent/agent/service.py new file mode 100644 index 0000000..0e69e1e --- /dev/null +++ b/src/server_monitor_agent/agent/service.py @@ -0,0 +1,383 @@ +import dataclasses +import pathlib +import typing +from datetime import datetime, timedelta +from zoneinfo import ZoneInfo + +import dateparser +import humanize + +from server_monitor_agent.agent import common + +# TODO: put consul user into 'systemd-journal' group. + + +def service_detail( + time_zone: str, + name: str, + load_state: list[str], + active_state: list[str], + file_state: list[str], + sub_state: list[str], + result_state: list[str], + max_age_hours: int, +): + info = get_systemd_service_status( + time_zone, + name, + ".service", + load_state, + active_state, + file_state, + sub_state, + result_state, + max_age_hours, + ) + + if info.check_status: + return common.report_ok( + time_zone=time_zone, + check_type="systemd-service", + check_name=info.unit_name, + description=" \n".join( + [f"Service '{info.unit_name}' is as expected."] + info.ok_lines + [""] + ), + resolution="The service has been set to the expected state.", + ) + else: + return common.report_problem( + time_zone=time_zone, + check_type="systemd-service", + check_name=info.unit_name, + description=" \n".join( + [f"Service '{info.unit_name}' is in an unexpected state."] + + info.problem_lines + + [""], + ), + impact="A service that is not in the expected state " + "might cause degraded service. 
" + "For example, backups might not be created, " + "or database maintenance may not be not performed.", + action="Check the service status using " + f"'sudo systemctl status {info.unit_name}' " + "and 'sudo journalctl --no-hostname --no-pager " + f"-u {info.unit_name} | less +G'. " + "Determine the problem with the service process and fix it.", + ) + + +def timer_detail( + time_zone: str, + name: str, + load_state: list[str], + active_state: list[str], + file_state: list[str], + sub_state: list[str], + result_state: list[str], +): + info = get_systemd_service_status( + time_zone, + name, + ".timer", + load_state, + active_state, + file_state, + sub_state, + result_state, + None, + ) + + time_prev_diff_str, timestamp_prev_str = timestamp_info(info, "LastTriggerUSec") + prev_trigger = ( + f"The previous run was at '{timestamp_prev_str}' ({time_prev_diff_str})." + ) + + time_next_diff_str, timestamp_next_str = timestamp_info( + info, "NextElapseUSecRealtime" + ) + next_trigger = f"The next run is at '{timestamp_next_str}' ({time_next_diff_str})." + + if info.check_status: + return common.report_ok( + time_zone=time_zone, + check_type="systemd-timer", + check_name=info.unit_name, + description=" \n".join( + [ + f"Timer '{info.unit_name}' is as expected.", + prev_trigger, + next_trigger, + ] + + info.ok_lines + + [""] + ), + resolution="The timer has been set to the expected state.", + ) + else: + return common.report_problem( + time_zone=time_zone, + check_type="systemd-timer", + check_name=info.unit_name, + description=" \n".join( + [ + f"Timer '{info.unit_name}' is in an unexpected state.", + prev_trigger, + next_trigger, + ] + + info.problem_lines + + [""], + ), + impact="A timer not in the expected state could mean that scheduled tasks do not run.", + action=f"Check the timer status using 'systemctl status {info.unit_name}'. 
" + f"Determine whether the time should be running or not.", + ) + + +def timestamp_unit( + data: dict, +) -> typing.Tuple[typing.Optional[str], typing.Optional[datetime]]: + actual_active_state = data.get("ActiveState") + + if actual_active_state in ["active", "reloading"]: + timestamp = data.get("ActiveEnterTimestamp") + elif actual_active_state in ["inactive", "failed"]: + timestamp = data.get("InactiveEnterTimestamp") + elif actual_active_state in ["activating"]: + timestamp = data.get("InactiveExitTimestamp") + else: + timestamp = data.get("ActiveExitTimestamp") + + if timestamp: + timestamp_date = dateparser.parse(timestamp.strip()) + return timestamp, timestamp_date + else: + return None, None + + +def timestamp_diff( + timestamp: datetime, now: datetime +) -> typing.Tuple[str, typing.Optional[timedelta]]: + if timestamp: + time_diff = now - timestamp + time_diff_str = humanize.naturaltime(time_diff) + else: + time_diff = None + time_diff_str = None + + return time_diff_str, time_diff + + +def timestamp_info(info: "SystemdServiceInfo", key: str) -> typing.Tuple[str, str]: + ts_str = info.data.get(key) + if ts_str: + ts = dateparser.parse(ts_str.strip()) + diff_str, diff_ts = timestamp_diff(ts, info.timestamp_now) + else: + ts = SystemdServiceInfo.not_avail() + diff_str = SystemdServiceInfo.not_avail() + + return diff_str, ts_str + + +@dataclasses.dataclass +class SystemdServiceInfo: + time_zone: str + name: str + load_state: list[str] + active_state: list[str] + file_state: list[str] + sub_state: list[str] + result_state: list[str] + max_age_hours: int + + unit_name: str + data: dict + expected_load_state: list[str] + actual_load_state: str + match_load_state: bool + + expected_active_state: list[str] + actual_active_state: str + match_active_state: bool + + expected_file_state: list[str] + actual_file_state: str + match_file_state: bool + + expected_sub_state: list[str] + actual_sub_state: str + match_sub_state: bool + + expected_result_state: list[str] + 
def get_systemd_service_status(
    time_zone: str,
    name: str,
    expected_suffix: str,
    load_state: list[str],
    active_state: list[str],
    file_state: list[str],
    sub_state: list[str],
    result_state: list[str],
    max_age_hours: typing.Optional[int],
):
    """Query `systemctl show` for a unit and compare it to expected states.

    Args:
        time_zone: IANA time zone used for 'now' when ageing timestamps.
        name: Unit name, with or without ``expected_suffix``.
        expected_suffix: The unit suffix being checked ('.service' or '.timer').
        load_state: Acceptable values for the LoadState property.
        active_state: Acceptable values for the ActiveState property.
        file_state: Acceptable values for the UnitFileState property.
        sub_state: Acceptable values for the SubState property.
        result_state: Acceptable values for the Result property.
        max_age_hours: Stored on the result; not evaluated here.

    Returns:
        SystemdServiceInfo: actual vs expected states, the state-change
        timestamp, and the last few journal lines for the unit.

    Raises:
        ValueError: when ``name`` has a suffix other than ``expected_suffix``.
    """
    suffix = pathlib.Path(name).suffix
    if suffix and suffix != expected_suffix:
        # BUG FIX: the message hard-coded '.service' even when checking
        # '.timer' units; report the suffix that is actually expected.
        raise ValueError(
            f"Invalid systemd service name '{name}' "
            f"(must have no suffix or {expected_suffix} suffix)."
        )
    unit_name = name if name.endswith(expected_suffix) else f"{name}{expected_suffix}"

    args = ["systemctl", "show", "--no-pager", unit_name, "--all"]
    result = common.execute_process(args)
    # ROBUSTNESS: property values may span multiple lines, so not every
    # output line contains '='; parse only the lines that do, instead of
    # crashing on the continuation lines.
    data = dict(
        line.split("=", maxsplit=1)
        for line in result.stdout.splitlines()
        if "=" in line
    )

    actual_load_state = data.get("LoadState")
    match_load_state = actual_load_state in load_state

    actual_active_state = data.get("ActiveState")
    match_active_state = actual_active_state in active_state

    actual_file_state = data.get("UnitFileState")
    match_file_state = actual_file_state in file_state

    actual_sub_state = data.get("SubState")
    match_sub_state = actual_sub_state in sub_state

    # 'Result' is compared only when both an actual value and expected
    # values exist; otherwise it is treated as matching.
    actual_result_state = data.get("Result")
    match_result_state = (
        (actual_result_state in result_state)
        if (actual_result_state and result_state)
        else True
    )

    # Determine when the unit last changed state; timestamp_unit mirrors
    # the property selection used by `systemctl status` itself:
    # https://github.com/systemd/systemd/blob/main/src/systemctl/systemctl-show.c#L315
    datetime_now = datetime.now(ZoneInfo(time_zone))
    timestamp_str, timestamp_date = timestamp_unit(data)

    if timestamp_date:
        time_diff_str, time_diff = timestamp_diff(timestamp_date, datetime_now)
    else:
        timestamp_date = None
        timestamp_str = SystemdServiceInfo.not_avail()
        time_diff = None
        time_diff_str = SystemdServiceInfo.not_avail()

    # Recent journal lines give context when reporting a problem.
    log_args = [
        "journalctl",
        "--no-hostname",
        "--no-pager",
        "-u",
        unit_name,
        "-n",
        "7",
    ]
    log_result = common.execute_process(log_args)
    log_lines = log_result.stdout.splitlines()

    return SystemdServiceInfo(
        time_zone=time_zone,
        name=name,
        load_state=load_state,
        active_state=active_state,
        file_state=file_state,
        sub_state=sub_state,
        result_state=result_state,
        max_age_hours=max_age_hours,
        unit_name=unit_name,
        data=data,
        actual_load_state=actual_load_state,
        match_load_state=match_load_state,
        actual_active_state=actual_active_state,
        match_active_state=match_active_state,
        actual_file_state=actual_file_state,
        match_file_state=match_file_state,
        actual_sub_state=actual_sub_state,
        match_sub_state=match_sub_state,
        actual_result_state=actual_result_state,
        match_result_state=match_result_state,
        expected_load_state=load_state,
        expected_active_state=active_state,
        expected_file_state=file_state,
        expected_sub_state=sub_state,
        expected_result_state=result_state,
        timestamp_last_change_str=timestamp_str,
        timestamp_last_change=timestamp_date,
        timestamp_now=datetime_now,
        timespan_str=time_diff_str,
        timespan=time_diff,
        logs=log_lines,
    )
def main(args: typing.Optional[typing.List[str]] = None) -> int:
    """Run as a command line program.

    Args:
        args: The program arguments.
    Returns:
        int: Program exit code.
    """
    if args is None:
        args = sys.argv[1:]

    logging.basicConfig(
        format="%(asctime)s [%(levelname)-8s] %(message)s",
        datefmt="%a %d %b %H:%M:%S",
        level=logging.INFO,
    )
    # BUG FIX: this previously inspected sys.argv, which ignores the `args`
    # parameter (e.g. when main() is called from tests with explicit args).
    if "--debug" in args:
        logging.getLogger().setLevel(logging.DEBUG)

    cli_parser = cli.build()
    parsed = cli_parser.parse_args(args)

    # No sub-command selected: show help and exit with a warning code.
    if not hasattr(parsed, "func"):
        cli_parser.print_help(file=sys.stderr)
        return 1

    if parsed.debug:
        # In debug mode let exceptions propagate with a full traceback.
        result: common.CheckReport = parsed.func(parsed)
    else:
        try:
            result = parsed.func(parsed)
        except Exception as e:
            print(
                f"Error running check '{parsed.subparser_name}' - "
                f"'{e.__class__.__name__}': \"{str(e)}\"",
                file=sys.stderr,
            )
            # Exit code 1 is treated as 'warning' by consul.
            return 1

    # Passing reports go to stdout; problems go to stderr.
    if result.exit_code == 0:
        print(result.content, file=sys.stdout)
    else:
        print(result.content, file=sys.stderr)

    return result.exit_code
+ sys.exit(main()) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c4bcaa7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,21 @@ +import pytest + + +@pytest.fixture(autouse=True) +def no_run_cmd(monkeypatch): + """Remove method that allows running command for all tests.""" + + def run_cmd(self, args): + raise ValueError(f"Must set execute_process method for '{args}'.") + + monkeypatch.setattr("server_monitor_agent.agent.common.execute_process", run_cmd) + + +@pytest.fixture(autouse=True) +def no_requests(monkeypatch): + """Remove requests.sessions.Session.request for all tests.""" + + def run_cmd(self, args): + raise ValueError(f"Must set request method for '{args}'.") + + monkeypatch.setattr("requests.sessions.Session.request", run_cmd) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..ca125b3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,251 @@ +import io +import json +import os +import pathlib +import subprocess +import sys + +import pytest +import requests + +from server_monitor_agent.agent import common +from server_monitor_agent.entry import main + +expected_version = "0.1.0" + +if sys.version_info.minor >= 10: + help_phrase_options = "options:" +else: + help_phrase_options = "optional arguments:" + + +PROG_HELP = ( + "usage: server-monitor-agent [-h] [--version] [--debug] check_command ...\n" + "\n" + "Run a check on the local machine.\n" + "\n" + f"{help_phrase_options}\n" + " -h, --help show this help message and exit\n" + " --version show program's version number and exit\n" + " --debug Turn on debug mode.\n" + "\n" + "Available checks:\n" + " Specify the check command to run\n" + "\n" + " check_command\n" + " memory Check the current memory usage.\n" + " cpu Check the current cpu usage.\n" + " disk Check the current free space for a disk.\n" + " systemd-service\n" + " Check the current status of a systemd service.\n" + " systemd-timer Check the current status of a systemd 
timer.\n" + " consul-report Report the current state of all consul checks for this\n" + " datacenter.\n" +) + + +def test_cli_no_args(capsys, caplog): + actual_exit_code = main([]) + stdout, stderr = capsys.readouterr() + assert stdout == "" + assert stderr == PROG_HELP + assert caplog.record_tuples == [] + + assert actual_exit_code == 1 + + +def test_cli_help(capsys, caplog): + with pytest.raises(SystemExit, match="0"): + main(["--help"]) + + stdout, stderr = capsys.readouterr() + assert stdout == PROG_HELP + assert stderr == "" + assert caplog.record_tuples == [] + + +def test_cli_version(capsys, caplog): + with pytest.raises(SystemExit, match="0"): + main(["--version"]) + + stdout, stderr = capsys.readouterr() + assert stdout == f"{common.APP_NAME_DASH} {expected_version}\n" + assert stderr == "" + assert caplog.record_tuples == [] + + +def test_cli_memory(capsys, caplog): + actual_exit_code = main(["memory"]) + + stdout, stderr = capsys.readouterr() + + assert actual_exit_code in [0, 2] + if actual_exit_code == 0: + assert "*PASSING*: `memory` on `" in stdout + assert stderr == "" + else: + assert stdout == "" + assert "*PROBLEM*: `memory` on `" in stderr + + assert caplog.record_tuples == [] + + +def test_cli_cpu(capsys, caplog): + actual_exit_code = main(["cpu", "--interval", "0.5"]) + + stdout, stderr = capsys.readouterr() + + assert actual_exit_code in [0, 2] + if actual_exit_code == 0: + assert "*PASSING*: `cpu` on `" in stdout + assert stderr == "" + else: + assert stdout == "" + assert "*PROBLEM*: `cpu` on `" in stderr + + assert caplog.record_tuples == [] + + +def test_cli_systemd_service_help(capsys, caplog): + with pytest.raises(SystemExit, match="0"): + main(["systemd-service", "--help"]) + + stdout, stderr = capsys.readouterr() + assert "usage: server-monitor-agent systemd-service" in stdout + assert stderr == "" + assert caplog.record_tuples == [] + + +def test_cli_systemd_service_ssh_check(capsys, caplog, monkeypatch): + + data = [ + 
"StatusErrno=0", + "Result=success", + "ExecMainStartTimestamp=Mon 2022-10-31 15:30:10 AEST", + "ExecMainCode=0", + "ExecMainStatus=0", + "LoadState=loaded", + "ActiveState=active", + "SubState=running", + "UnitFileState=enabled", + "UnitFilePreset=enabled", + "StateChangeTimestamp=Mon 2022-10-31 15:30:10 AEST", + "InactiveExitTimestamp=Mon 2022-10-31 15:30:10 AEST", + "ActiveEnterTimestamp=Mon 2022-10-31 15:30:10 AEST", + "ConditionTimestamp=Mon 2022-10-31 15:30:10 AEST", + "AssertTimestamp=Mon 2022-10-31 15:30:10 AEST", + ] + + with monkeypatch.context() as m: + + def execute_process(*args, **kwargs): + return subprocess.CompletedProcess( + args=args, returncode=0, stdout="\n".join(data), stderr="" + ) + + m.setattr("server_monitor_agent.agent.common.execute_process", execute_process) + actual_exit_code = main(["systemd-service", "ssh"]) + + stdout, stderr = capsys.readouterr() + assert "*PASSING*: `ssh.service` on `" in stdout + assert stderr == "" + assert caplog.record_tuples == [] + assert actual_exit_code == 0 + + +def test_cli_consul_resport_aws(capsys, caplog, monkeypatch, tmp_path): + + os.environ["CONSUL_HTTP_ADDR"] = "https://localhost:8501" + os.environ["CONSUL_HTTP_SSL"] = "true" + os.environ["CONSUL_HTTP_SSL_VERIFY"] = "true" + os.environ["CONSUL_CACERT"] = str(tmp_path / "ca_cert") + os.environ["CONSUL_CAPATH"] = str(tmp_path / "ca_path") + os.environ["CONSUL_CLIENT_CERT"] = str(tmp_path / "client_cert") + os.environ["CONSUL_CLIENT_KEY"] = str(tmp_path / "client_key") + os.environ["SLACK_WEBHOOK_URL_CONSUL"] = "slack_webhook_url" + + for i in [ + "CONSUL_CACERT", + "CONSUL_CAPATH", + "CONSUL_CLIENT_CERT", + "CONSUL_CLIENT_KEY", + ]: + pathlib.Path(os.environ[i]).touch() + + with monkeypatch.context() as m: + + def request_url(*args, **kwargs): + if ( + kwargs["method"] == "get" + and kwargs["url"] == "https://localhost:8501/v1/health/state/any" + ): + resp = requests.Response() + resp.url = kwargs["url"] + resp.status_code = 200 + resp.encoding = 
"utf-8" + + resp.raw = io.BytesIO() + resp.raw.write( + json.dumps( + [ + { + "Node": "test.example.com", + "Name": "Check name", + "Status": "critical", + "ServiceName": "Service name", + } + ] + ).encode(resp.encoding) + ) + resp.raw.seek(0) + + return resp + + if ( + kwargs["method"] == "get" + and kwargs["url"] == "https://localhost:8501/v1/status/leader" + ): + resp = requests.Response() + resp.url = kwargs["url"] + resp.status_code = 200 + resp.encoding = "utf-8" + + resp.raw = io.BytesIO() + resp.raw.write("127.0.0.1:8300".encode(resp.encoding)) + resp.raw.seek(0) + return resp + if ( + kwargs["method"] == "get" + and kwargs["url"] + == "http://169.254.169.254/latest/meta-data/local-ipv4" + ): + resp = requests.Response() + resp.url = kwargs["url"] + resp.status_code = 200 + resp.encoding = "utf-8" + + resp.raw = io.BytesIO() + resp.raw.write("127.0.0.1".encode(resp.encoding)) + resp.raw.seek(0) + return resp + if kwargs["method"] == "post" and kwargs["url"] == "slack_webhook_url": + resp = requests.Response() + resp.url = kwargs["url"] + resp.status_code = 200 + resp.encoding = "utf-8" + + resp.raw = io.BytesIO() + resp.raw.seek(0) + return resp + + raise ValueError() + + m.setattr("requests.sessions.Session.request", request_url) + + actual_exit_code = main(["--debug", "consul-report", "aws"]) + + stdout, stderr = capsys.readouterr() + assert "*PASSING*: `consul-report` on `" in stdout + assert stderr == "" + assert caplog.record_tuples == [] + assert actual_exit_code == 0