From 7964b40b398b2e6c53fbbb9606723c547f6c38ba Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 22 Aug 2023 16:34:15 +0100 Subject: [PATCH 01/16] aufn/ceph: Update tenks.yml storage allocation --- etc/kayobe/environments/aufn-ceph/tenks.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/environments/aufn-ceph/tenks.yml b/etc/kayobe/environments/aufn-ceph/tenks.yml index 9b0e9e9f4..6e19fcf4c 100644 --- a/etc/kayobe/environments/aufn-ceph/tenks.yml +++ b/etc/kayobe/environments/aufn-ceph/tenks.yml @@ -21,9 +21,9 @@ node_types: volumes: # There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent: # https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290 - - capacity: 10GiB + - capacity: 20GiB # Ceph volume - - capacity: 10GiB + - capacity: 20GiB physical_networks: - provision-net - cloud-net @@ -34,7 +34,7 @@ node_types: volumes: # There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent: # https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290 - - capacity: 10GiB + - capacity: 20GiB physical_networks: - provision-net - cloud-net From f5db1eb5cc7cc3087d478a3f28ad02f3f2ac5463 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Thu, 24 Aug 2023 09:51:35 +0100 Subject: [PATCH 02/16] aufn/ceph: Change default storage capacities --- etc/kayobe/environments/aufn-ceph/tenks.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/environments/aufn-ceph/tenks.yml b/etc/kayobe/environments/aufn-ceph/tenks.yml index 6e19fcf4c..ea7e470cc 100644 --- a/etc/kayobe/environments/aufn-ceph/tenks.yml +++ b/etc/kayobe/environments/aufn-ceph/tenks.yml @@ -21,7 +21,7 @@ node_types: volumes: # There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent: # https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290 - - capacity: 20GiB + - capacity: 15GiB # Ceph volume - capacity: 20GiB physical_networks: @@ -34,7 +34,7 @@ node_types: volumes: # There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent: # https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290 - - capacity: 20GiB + - capacity: 15GiB physical_networks: - provision-net - cloud-net @@ -87,3 +87,5 @@ bridge_type: linuxbridge # No placement service. wait_for_placement: false + +libvirt_vm_trust_guest_rx_filters: false From bb9263c67c42c61a1649b8e23223da36fcd6bd55 Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 19 Aug 2024 09:54:26 +0100 Subject: [PATCH 03/16] fix: typo in playbook run command --- doc/source/configuration/monitoring.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index e40f12b11..0d83a2c8c 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -85,7 +85,7 @@ on the overcloud hosts: .. code-block:: console (kayobe) [stack@node ~]$ cd etc/kayobe - (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmontools.yml + (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmon-tools.yml SMART reporting should now be enabled along with a Prometheus alert for unhealthy disks and a Grafana dashboard called ``Hardware Overview``. From eba7674e29706dd60a8787959be5a57312ea4c23 Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Tue, 20 Aug 2024 14:31:06 +0100 Subject: [PATCH 04/16] Bump kayobe automation --- .automation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.automation b/.automation index 98e92aae8..7eefdb99f 160000 --- a/.automation +++ b/.automation @@ -1 +1 @@ -Subproject commit 98e92aae8460db84cd4bf9813e4ef1ba02c5e034 +Subproject commit 7eefdb99fe60df8eeac63004878ab4d44eb6d6ba From 5ee3d28fbf6af8d3c7591f509d09ad3bc7196f1d Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 27 Jun 2024 20:18:39 +0100 Subject: [PATCH 05/16] CI: Add multinode workflow This change adds a GitHub workflow to deploy a multinode test cluster using a workflow dispatch (manual) trigger. --- .github/workflows/stackhpc-multinode.yml | 70 ++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/stackhpc-multinode.yml diff --git a/.github/workflows/stackhpc-multinode.yml b/.github/workflows/stackhpc-multinode.yml new file mode 100644 index 000000000..59da95e13 --- /dev/null +++ b/.github/workflows/stackhpc-multinode.yml @@ -0,0 +1,70 @@ +--- +# This workflow provides a workflow_dispatch (manual) trigger to deploy a +# multi-node test cluster. + +name: Multinode +'on': + workflow_dispatch: + # NOTE: workflow_dispatch is limited to 10 inputs. + inputs: + multinode_name: + description: Multinode cluster name + type: string + required: true + os_distribution: + description: Host OS distribution + type: choice + default: rocky + options: + - rocky + - ubuntu + neutron_plugin: + description: Neutron ML2 plugin + type: choice + default: ovn + options: + - ovn + - ovs + upgrade: + description: Whether to perform an upgrade + type: boolean + default: false + break_on: + description: When to break execution for manual interaction + type: choice + default: never + options: + - always + - failure + - never + - success + break_duration: + description: How long to break execution for (minutes) + type: number + default: 60 + ssh_key: + description: SSH public key to authorise on Ansible control host + type: string + terraform_kayobe_multinode_version: + description: terraform-kayobe-multinode version + type: string + default: main +jobs: + multinode: + name: Multinode + uses: stackhpc/stackhpc-openstack-gh-workflows/.github/workflows/multinode.yml@1.0.0 + with: + multinode_name: ${{ inputs.multinode_name }} + os_distribution: ${{ inputs.os_distribution }} + os_release: ${{ inputs.os_distribution == 'rocky' && '9' || 'jammy' }} + ssh_username: ${{ inputs.os_distribution == 'rocky' && 'cloud-user' || 'ubuntu' }} + neutron_plugin: ${{ inputs.neutron_plugin }} + upgrade: ${{ inputs.upgrade }} + break_on: ${{ inputs.break_on }} + break_duration: ${{ inputs.break_duration }} + ssh_key: ${{ inputs.ssh_key }} + stackhpc_kayobe_config_version: ${{ github.ref_name }} + # NOTE(upgrade): Reference the PREVIOUS release here. + stackhpc_kayobe_config_previous_version: stackhpc/zed + terraform_kayobe_multinode_version: ${{ inputs.terraform_kayobe_multinode_version }} + secrets: inherit From e84bb74c6a60bb43a92826aeac91bd714c8bd6cd Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Aug 2024 09:21:38 +0100 Subject: [PATCH 06/16] CI: Fix loss of number type when calling reusable workflow The workflow_dispatch break_duration input has a type of 'number', but this somehow gets lost when calling the reusable workflow. We see the following error: The template is not valid. .github/workflows/stackhpc-multinode.yml (Line: 64, Col: 23): Unexpected value '60' This issue is described in https://github.com/orgs/community/discussions/67182. We use the fromJSON workaround in the thread. --- .github/workflows/stackhpc-multinode.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc-multinode.yml b/.github/workflows/stackhpc-multinode.yml index 59da95e13..353a07334 100644 --- a/.github/workflows/stackhpc-multinode.yml +++ b/.github/workflows/stackhpc-multinode.yml @@ -61,7 +61,8 @@ jobs: neutron_plugin: ${{ inputs.neutron_plugin }} upgrade: ${{ inputs.upgrade }} break_on: ${{ inputs.break_on }} - break_duration: ${{ inputs.break_duration }} + # Workaround loss of number type using fromJSON: https://github.com/orgs/community/discussions/67182 + break_duration: ${{ fromJSON(inputs.break_duration) }} ssh_key: ${{ inputs.ssh_key }} stackhpc_kayobe_config_version: ${{ github.ref_name }} # NOTE(upgrade): Reference the PREVIOUS release here. From 810d9875e41b299f473446e2fe0e0aadccb4f6c8 Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Tue, 20 Aug 2024 16:24:38 +0100 Subject: [PATCH 07/16] Make fix-hostname playbook more generic --- etc/kayobe/ansible/fix-hostname.yml | 4 ++-- etc/kayobe/environments/ci-aio/inventory/groups | 3 +++ etc/kayobe/environments/ci-multinode/inventory/groups | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/ansible/fix-hostname.yml b/etc/kayobe/ansible/fix-hostname.yml index 92730b7f0..dc3c92e32 100644 --- a/etc/kayobe/ansible/fix-hostname.yml +++ b/etc/kayobe/ansible/fix-hostname.yml @@ -1,6 +1,6 @@ --- -- name: Fix hostname on storage nodes for cephadm - hosts: storage +- name: Ensure hostnames match inventory hostnames + hosts: fix-hostname gather_facts: false vars: ansible_user: "{{ bootstrap_user }}" diff --git a/etc/kayobe/environments/ci-aio/inventory/groups b/etc/kayobe/environments/ci-aio/inventory/groups index 43967dac1..39b72601a 100644 --- a/etc/kayobe/environments/ci-aio/inventory/groups +++ b/etc/kayobe/environments/ci-aio/inventory/groups @@ -3,3 +3,6 @@ [container-image-builders:children] # Build container images on the all-in-one controller. controllers + +[fix-hostname:children] +controllers diff --git a/etc/kayobe/environments/ci-multinode/inventory/groups b/etc/kayobe/environments/ci-multinode/inventory/groups index 43967dac1..08018ca3a 100644 --- a/etc/kayobe/environments/ci-multinode/inventory/groups +++ b/etc/kayobe/environments/ci-multinode/inventory/groups @@ -3,3 +3,6 @@ [container-image-builders:children] # Build container images on the all-in-one controller. controllers + +[fix-hostname:children] +storage From ac9677a619fc2b710a7e0981dff0064c25791dc8 Mon Sep 17 00:00:00 2001 From: Michal Nasiadka Date: Wed, 21 Aug 2024 11:52:06 +0200 Subject: [PATCH 08/16] magnum-capi-helm: bump to 1.1.0 --- etc/kayobe/kolla-image-tags.yml | 3 +++ etc/kayobe/kolla.yml | 2 +- releasenotes/notes/magnum-capi-1-1-0-68f14759413316c4.yaml | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/magnum-capi-1-1-0-68f14759413316c4.yaml diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index 4b0143c2e..cf8f74ada 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -18,6 +18,9 @@ kolla_image_tags: rocky-9: 2023.1-rocky-9-20240809T102431 letsencrypt: ubuntu-jammy: 2023.1-ubuntu-jammy-20240509T102329 + magnum: + rocky-9: 2023.1-rocky-9-20240821T102442 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240821T102442 manila: rocky-9: 2023.1-rocky-9-20240809T102431 neutron: diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index 2df7c4cac..45c322fcf 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -344,7 +344,7 @@ kolla_build_blocks: magnum_base_footer: | RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | head -n -1 | bash {% raw %} - {% set magnum_capi_packages = ['magnum-capi-helm==1.0.0'] %} + {% set magnum_capi_packages = ['magnum-capi-helm==1.1.0'] %} RUN {{ macros.install_pip(magnum_capi_packages | customizable("pip_packages")) }} {% endraw %} prometheus_alertmanager_repository_version: | # 2023.1 kolla has 0.24.0 diff --git a/releasenotes/notes/magnum-capi-1-1-0-68f14759413316c4.yaml b/releasenotes/notes/magnum-capi-1-1-0-68f14759413316c4.yaml new file mode 100644 index 000000000..8290ba39c --- /dev/null +++ b/releasenotes/notes/magnum-capi-1-1-0-68f14759413316c4.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + ``magnum-capi-helm`` driver has been updated to 1.1.0. + Please see `magnum-capi-helm release notes `_ + for changes. From c2218abed3e8c6fca9b1b32ea43deb457b4aa24f Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 17 May 2024 11:54:24 +0100 Subject: [PATCH 09/16] Add diagnostics.sh This is a copy of roles/kayobe-diagnostics/files/get_logs.sh in Kayobe. (cherry picked from commit 36dc6d10405f7181da29051230f0830d66992062) --- tools/diagnostics.sh | 147 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tools/diagnostics.sh diff --git a/tools/diagnostics.sh b/tools/diagnostics.sh new file mode 100644 index 000000000..639969575 --- /dev/null +++ b/tools/diagnostics.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# NOTE(mgoddard): This has been adapted from tests/get_logs.sh in Kolla +# Ansible. + +# Environment variables: +# $LOG_DIR is the directory to copy logs to. +# $CONFIG_DIR is the directory to copy configuration from. +# $PREVIOUS_CONFIG_DIR is the directory to copy previous configuration, prior +# to an upgrade, from. + +set +o errexit + +copy_logs() { + cp -rnL /var/lib/docker/volumes/kolla_logs/_data/* ${LOG_DIR}/kolla/ + if [[ -d ${CONFIG_DIR} ]]; then + cp -rnL ${CONFIG_DIR}/etc/kayobe/* ${LOG_DIR}/kayobe_configs + cp -rnL ${CONFIG_DIR}/etc/kolla/* ${LOG_DIR}/kolla_configs + cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_node_configs + # Don't save the IPA images. + rm ${LOG_DIR}/kayobe_configs/kolla/config/ironic/ironic-agent.{kernel,initramfs} + rm ${LOG_DIR}/kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} + rm ${LOG_DIR}/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs} + rm ${LOG_DIR}/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs} + fi + if [[ -n ${PREVIOUS_CONFIG_DIR} ]] && [[ -d ${PREVIOUS_CONFIG_DIR} ]]; then + mkdir -p ${LOG_DIR}/previous_{kayobe,kolla}_configs + cp -rnL ${PREVIOUS_CONFIG_DIR}/etc/kayobe/* ${LOG_DIR}/previous_kayobe_configs + cp -rnL ${PREVIOUS_CONFIG_DIR}/etc/kolla/* ${LOG_DIR}/previous_kolla_configs + # NOTE: we can't save node configs in /etc/kolla for the pervious + # release since they'll have been overwritten at this point. + # Don't save the IPA images. + rm ${LOG_DIR}/previous_kayobe_configs/kolla/config/ironic/ironic-agent.{kernel,initramfs} + rm ${LOG_DIR}/previous_kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} + fi + + if [[ -d /opt/kayobe/etc/kolla ]]; then + cp -rnL /opt/kayobe/etc/kolla/* ${LOG_DIR}/kolla_build_configs/ + fi + + cp -rvnL /var/log/* ${LOG_DIR}/system_logs/ + + if [[ -x "$(command -v journalctl)" ]]; then + journalctl --no-pager > ${LOG_DIR}/system_logs/syslog.txt + journalctl --no-pager -u docker.service > ${LOG_DIR}/system_logs/docker.log + journalctl --no-pager -u vbmcd.service > ${LOG_DIR}/system_logs/vbmcd.log + journalctl --no-pager -u NetworkManager.service > ${LOG_DIR}/system_logs/NetworkManager.log + else + cp /var/log/upstart/docker.log ${LOG_DIR}/system_logs/docker.log + fi + + if [[ -d /etc/sysconfig/network-scripts/ ]]; then + cp -r /etc/sysconfig/network-scripts/ ${LOG_DIR}/system_logs/ + fi + + if [[ -d /etc/NetworkManager/system-connections/ ]]; then + cp -r /etc/NetworkManager/system-connections/ ${LOG_DIR}/system_logs/ + fi + + if [[ -d /etc/yum.repos.d/ ]]; then + cp -r /etc/yum.repos.d/ ${LOG_DIR}/system_logs/ + fi + + if [[ -d /etc/apt/sources.list.d/ ]]; then + cp -r /etc/apt/sources.list.d/ ${LOG_DIR}/system_logs/ + fi + + if [[ -d /etc/systemd/ ]]; then + cp -rL /etc/systemd/ ${LOG_DIR}/system_logs/ + fi + + df -h > ${LOG_DIR}/system_logs/df.txt + # Gather disk usage statistics for files and directories larger than 1MB + du -d 5 -hx / | sort -hr | grep '^[0-9\.]*[MGT]' > ${LOG_DIR}/system_logs/du.txt + free > ${LOG_DIR}/system_logs/free.txt + cat /etc/hosts > ${LOG_DIR}/system_logs/hosts.txt + parted -l > ${LOG_DIR}/system_logs/parted-l.txt + mount > ${LOG_DIR}/system_logs/mount.txt + env > ${LOG_DIR}/system_logs/env.txt + ip address > ${LOG_DIR}/system_logs/ip-address.txt + ip route > ${LOG_DIR}/system_logs/ip-route.txt + ip route show table all > ${LOG_DIR}/system_logs/ip-route-all-tables.txt + ip rule list > ${LOG_DIR}/system_logs/ip-rule-list.txt + + iptables-save > ${LOG_DIR}/system_logs/iptables.txt + + if [ `command -v dpkg` ]; then + dpkg -l > ${LOG_DIR}/system_logs/dpkg-l.txt + fi + if [ `command -v rpm` ]; then + rpm -qa > ${LOG_DIR}/system_logs/rpm-qa.txt + fi + + # final memory usage and process list + ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > ${LOG_DIR}/system_logs/ps.txt + + # available entropy + cat /proc/sys/kernel/random/entropy_avail > ${LOG_DIR}/system_logs/entropy_avail.txt + + # docker related information + (docker info && docker images && docker ps -a) > ${LOG_DIR}/system_logs/docker-info.txt + + for container in $(docker ps -a --format "{{.Names}}"); do + docker logs --tail all ${container} &> ${LOG_DIR}/docker_logs/${container}.txt + done + + # Bifrost: grab config files and logs from the container. + if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then + for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do + mkdir -p ${LOG_DIR}/kolla/$service + docker exec bifrost_deploy \ + systemctl status $service -l -n 10000 > ${LOG_DIR}/kolla/$service/${service}-systemd-status.txt + docker exec bifrost_deploy \ + journalctl -u $service --no-pager > ${LOG_DIR}/kolla/$service/${service}-journal.txt + done + docker exec -it bifrost_deploy \ + journalctl --no-pager > ${LOG_DIR}/kolla/bifrost-journal.log + for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do + docker cp bifrost_deploy:/etc/$d ${LOG_DIR}/kolla_node_configs/bifrost/ + done + docker cp bifrost_deploy:/var/log/mariadb/mariadb.log ${LOG_DIR}/kolla/mariadb/ + fi + + # IPA build logs + if [[ -f /opt/kayobe/images/ipa/ipa.stderr ]] || [[ -f /opt/kayobe/images/ipa/ipa.stdout ]]; then + mkdir -p ${LOG_DIR}/kayobe + cp /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout ${LOG_DIR}/kayobe/ + fi + + # Overcloud host image build logs + if [[ -f /opt/kayobe/images/deployment_image/deployment_image.stderr ]] || [[ -f /opt/kayobe/images/deployment_image/deployment_image.stdout ]]; then + mkdir -p ${LOG_DIR}/kayobe + cp /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout ${LOG_DIR}/kayobe/ + fi + + # Rename files to .txt; this is so that when displayed via + # logs.openstack.org clicking results in the browser shows the + # files, rather than trying to send it to another app or make you + # download it, etc. + for f in $(find ${LOG_DIR}/{system_logs,kolla,docker_logs} -name "*.log"); do + mv $f ${f/.log/.txt} + done + + chmod -R 777 ${LOG_DIR} +} + +copy_logs From 66ce723c56a691190481b4e5fc76cd2aabd993ff Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 17 May 2024 11:56:07 +0100 Subject: [PATCH 10/16] Add diagnostics.yml playbook This playbook runs a script that collects diagnostic information from hosts. The diagnostics are aggregated to a directory (diagnostics_path_local/inventory_hostname) on localhost. NOTE: The diagnostic information contains sensitive information such as passwords in configuration files. (cherry picked from commit 03bf7f01f5f16a6735dcdac3ff5406b6a5cdd591) --- etc/kayobe/ansible/diagnostics.yml | 74 +++++++++++++++++ .../notes/diagnostics-378a6693a64d0b3c.yaml | 14 ++++ tools/diagnostics.sh | 81 +++++++------------ 3 files changed, 117 insertions(+), 52 deletions(-) create mode 100644 etc/kayobe/ansible/diagnostics.yml create mode 100644 releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml diff --git a/etc/kayobe/ansible/diagnostics.yml b/etc/kayobe/ansible/diagnostics.yml new file mode 100644 index 000000000..c737de2d7 --- /dev/null +++ b/etc/kayobe/ansible/diagnostics.yml @@ -0,0 +1,74 @@ +--- +# This playbook runs a script that collects diagnostic information from hosts. +# The diagnostics are aggregated to a directory +# (diagnostics_path_local/inventory_hostname) on localhost. +# +# NOTE: The diagnostic information contains sensitive information such as +# passwords in configuration files. + +- name: Collect diagnostic information + hosts: seed-hypervisor:seed:overcloud:infra-vms + vars: + diagnostics_path_local: "{{ lookup('env', 'PWD') }}/diagnostics" + tasks: + - block: + - name: Create a temporary directory for diagnostics + ansible.builtin.tempfile: + state: directory + suffix: diagnostics + register: diagnostics_tmpdir + + - name: Write host variables to a file + ansible.builtin.copy: + content: "{{ hostvars[inventory_hostname].ansible_facts | to_nice_json }}" + dest: "{{ diagnostics_tmpdir.path }}/facts.json" + + - name: Run diagnostics script + ansible.builtin.script: "{{ kayobe_config_path }}/../../tools/diagnostics.sh" + become: true + failed_when: diagnostics_result.rc is not defined + register: diagnostics_result + environment: + LOG_DIR: "{{ diagnostics_tmpdir.path }}" + CONFIG_DIR: "{{ kayobe_config_path }}/../.." + + - name: Download diagnostic logs to localhost + ansible.posix.synchronize: + src: "{{ diagnostics_tmpdir.path }}/" + dest: "{{ diagnostics_path_local }}/{{ inventory_hostname }}" + mode: pull + archive: no + recursive: true + copy_links: true + verify_host: true + # For jump host + use_ssh_args: true + vars: + # FIXME: The synchronize module fails on Yoga, due to not templating + # the SSH user. + ansible_user: stack + always: + - name: Clean up temporary directory + ansible.builtin.file: + path: "{{ diagnostics_tmpdir.path }}" + state: absent + + - name: Display diagnostics collection stdout + ansible.builtin.debug: + msg: "{{ diagnostics_result.stdout }}" + when: diagnostics_result.stdout is defined + + - name: Display diagnostics collection stderr + ansible.builtin.debug: + msg: "{{ diagnostics_result.stderr }}" + when: diagnostics_result.stderr is defined + + - name: Fail if diagnostics collection failed + ansible.builtin.fail: + msg: Diagnostics collection failed + when: diagnostics_result.rc != 0 + + - name: Display location of diagnostics archive + ansible.builtin.debug: + msg: >- + Wrote diagnostics to {{ diagnostics_path_local }} on localhost diff --git a/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml b/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml new file mode 100644 index 000000000..4e8d45ee7 --- /dev/null +++ b/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml @@ -0,0 +1,14 @@ +--- +features: + - | + Adds a new ``diagnostics.yml`` playbook that collects diagnostic + information from hosts. The diagnostics are aggregated to a directory + (``$PWD/diagnostics/`` by default) on localhost. The diagnostics include: + + * Docker container logs + * Kolla configuration files + * Log files + + *The collected diagnostic information contains sensitive information such + as passwords in configuration files.* + diff --git a/tools/diagnostics.sh b/tools/diagnostics.sh index 639969575..73d61775a 100644 --- a/tools/diagnostics.sh +++ b/tools/diagnostics.sh @@ -1,53 +1,34 @@ #!/bin/bash -# NOTE(mgoddard): This has been adapted from tests/get_logs.sh in Kolla -# Ansible. +# NOTE(mgoddard): This has been adapted from +# roles/kayobe-diagnostics/files/get_logs.sh in Kayobe. # Environment variables: # $LOG_DIR is the directory to copy logs to. -# $CONFIG_DIR is the directory to copy configuration from. -# $PREVIOUS_CONFIG_DIR is the directory to copy previous configuration, prior -# to an upgrade, from. +# TODO: Make this script more robust and use set -e. set +o errexit +set -u copy_logs() { - cp -rnL /var/lib/docker/volumes/kolla_logs/_data/* ${LOG_DIR}/kolla/ - if [[ -d ${CONFIG_DIR} ]]; then - cp -rnL ${CONFIG_DIR}/etc/kayobe/* ${LOG_DIR}/kayobe_configs - cp -rnL ${CONFIG_DIR}/etc/kolla/* ${LOG_DIR}/kolla_configs - cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_node_configs - # Don't save the IPA images. - rm ${LOG_DIR}/kayobe_configs/kolla/config/ironic/ironic-agent.{kernel,initramfs} - rm ${LOG_DIR}/kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} - rm ${LOG_DIR}/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs} - rm ${LOG_DIR}/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs} - fi - if [[ -n ${PREVIOUS_CONFIG_DIR} ]] && [[ -d ${PREVIOUS_CONFIG_DIR} ]]; then - mkdir -p ${LOG_DIR}/previous_{kayobe,kolla}_configs - cp -rnL ${PREVIOUS_CONFIG_DIR}/etc/kayobe/* ${LOG_DIR}/previous_kayobe_configs - cp -rnL ${PREVIOUS_CONFIG_DIR}/etc/kolla/* ${LOG_DIR}/previous_kolla_configs - # NOTE: we can't save node configs in /etc/kolla for the pervious - # release since they'll have been overwritten at this point. - # Don't save the IPA images. - rm ${LOG_DIR}/previous_kayobe_configs/kolla/config/ironic/ironic-agent.{kernel,initramfs} - rm ${LOG_DIR}/previous_kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} - fi + mkdir -p ${LOG_DIR}/{docker_logs,kolla_node_configs,system_logs} + + cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_node_configs + # Don't save the IPA images. + rm ${LOG_DIR}/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs} + rm ${LOG_DIR}/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs} if [[ -d /opt/kayobe/etc/kolla ]]; then + mkdir -p ${LOG_DIR}/kolla_build_configs cp -rnL /opt/kayobe/etc/kolla/* ${LOG_DIR}/kolla_build_configs/ fi cp -rvnL /var/log/* ${LOG_DIR}/system_logs/ - if [[ -x "$(command -v journalctl)" ]]; then - journalctl --no-pager > ${LOG_DIR}/system_logs/syslog.txt - journalctl --no-pager -u docker.service > ${LOG_DIR}/system_logs/docker.log - journalctl --no-pager -u vbmcd.service > ${LOG_DIR}/system_logs/vbmcd.log - journalctl --no-pager -u NetworkManager.service > ${LOG_DIR}/system_logs/NetworkManager.log - else - cp /var/log/upstart/docker.log ${LOG_DIR}/system_logs/docker.log - fi + journalctl --no-pager > ${LOG_DIR}/system_logs/syslog.log + journalctl --no-pager -u docker.service > ${LOG_DIR}/system_logs/docker.log + journalctl --no-pager -u vbmcd.service > ${LOG_DIR}/system_logs/vbmcd.log + journalctl --no-pager -u NetworkManager.service > ${LOG_DIR}/system_logs/NetworkManager.log if [[ -d /etc/sysconfig/network-scripts/ ]]; then cp -r /etc/sysconfig/network-scripts/ ${LOG_DIR}/system_logs/ @@ -81,6 +62,9 @@ copy_logs() { ip route > ${LOG_DIR}/system_logs/ip-route.txt ip route show table all > ${LOG_DIR}/system_logs/ip-route-all-tables.txt ip rule list > ${LOG_DIR}/system_logs/ip-rule-list.txt + pvs > ${LOG_DIR}/system_logs/pvs.txt + vgs > ${LOG_DIR}/system_logs/vgs.txt + lvs > ${LOG_DIR}/system_logs/lvs.txt iptables-save > ${LOG_DIR}/system_logs/iptables.txt @@ -106,42 +90,35 @@ copy_logs() { # Bifrost: grab config files and logs from the container. if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then + mkdir -p ${LOG_DIR}/bifrost for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do - mkdir -p ${LOG_DIR}/kolla/$service + mkdir -p ${LOG_DIR}/bifrost/$service docker exec bifrost_deploy \ - systemctl status $service -l -n 10000 > ${LOG_DIR}/kolla/$service/${service}-systemd-status.txt + systemctl status $service -l -n 10000 > ${LOG_DIR}/bifrost/$service/${service}-systemd-status.txt docker exec bifrost_deploy \ - journalctl -u $service --no-pager > ${LOG_DIR}/kolla/$service/${service}-journal.txt + journalctl -u $service --no-pager > ${LOG_DIR}/bifrost/$service/${service}-journal.txt done docker exec -it bifrost_deploy \ - journalctl --no-pager > ${LOG_DIR}/kolla/bifrost-journal.log + journalctl --no-pager > ${LOG_DIR}/bifrost/bifrost-journal.log for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do docker cp bifrost_deploy:/etc/$d ${LOG_DIR}/kolla_node_configs/bifrost/ done - docker cp bifrost_deploy:/var/log/mariadb/mariadb.log ${LOG_DIR}/kolla/mariadb/ + docker cp bifrost_deploy:/var/log/mariadb/mariadb.log ${LOG_DIR}/bifrost/mariadb/ fi # IPA build logs if [[ -f /opt/kayobe/images/ipa/ipa.stderr ]] || [[ -f /opt/kayobe/images/ipa/ipa.stdout ]]; then - mkdir -p ${LOG_DIR}/kayobe - cp /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout ${LOG_DIR}/kayobe/ + mkdir -p ${LOG_DIR}/ipa + cp /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout ${LOG_DIR}/ipa/ fi # Overcloud host image build logs if [[ -f /opt/kayobe/images/deployment_image/deployment_image.stderr ]] || [[ -f /opt/kayobe/images/deployment_image/deployment_image.stdout ]]; then - mkdir -p ${LOG_DIR}/kayobe - cp /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout ${LOG_DIR}/kayobe/ + mkdir -p ${LOG_DIR}/deployment_image + cp /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout ${LOG_DIR}/deployment_image/ fi - # Rename files to .txt; this is so that when displayed via - # logs.openstack.org clicking results in the browser shows the - # files, rather than trying to send it to another app or make you - # download it, etc. - for f in $(find ${LOG_DIR}/{system_logs,kolla,docker_logs} -name "*.log"); do - mv $f ${f/.log/.txt} - done - - chmod -R 777 ${LOG_DIR} + chown -R stack: ${LOG_DIR} } copy_logs From 011466cbbf2cce7af13e6d482db2747e141a681c Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Sat, 13 Jan 2024 14:05:50 +0000 Subject: [PATCH 11/16] CI: Collect diagnostic information at end of aio jobs Use the new diagnostics.yml playbook. (cherry picked from commit 834110b5fa320930d4dda65ffc56f1d72c7414b9) --- .github/workflows/stackhpc-all-in-one.yml | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index 82441b710..a3e39e32f 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -156,6 +156,7 @@ jobs: OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} - name: Terraform Apply + id: tf_apply run: | for attempt in $(seq 5); do if terraform apply -auto-approve; then @@ -290,6 +291,7 @@ jobs: KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }} - name: Tempest tests + id: tempest run: | mkdir -p tempest-artifacts docker run -t --rm \ @@ -301,11 +303,28 @@ jobs: env: KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }} + - name: Collect diagnostic information + id: diagnostics + run: | + mkdir -p diagnostics + sudo -E docker run -t --rm \ + -v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \ + -v $(pwd)/diagnostics:/stack/diagnostics \ + -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \ + $KAYOBE_IMAGE \ + /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml' + env: + KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }} + if: ${{ always() && steps.tf_apply.outcome == 'success' }} + - name: Upload test result artifacts uses: actions/upload-artifact@v4 with: - name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }} - path: tempest-artifacts/* + name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }} + path: | + diagnostics/ + tempest-artifacts/ + if: ${{ always() && (steps.tempest.outcome == 'success' || steps.diagnostics.outcome == 'success') }} - name: Fail if any Tempest tests failed run: | From 57e321c1481059d0e3887e2e2dc0816c95c56f73 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 5 Aug 2024 16:28:51 +0100 Subject: [PATCH 12/16] CI: Add SSH key to .ssh for synchronize module in diagnostics.yml --- .github/workflows/stackhpc-all-in-one.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index a3e39e32f..4526e1359 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -116,6 +116,17 @@ jobs: run: ssh-keygen -f id_rsa -N '' working-directory: ${{ github.workspace }}/terraform/aio + # TODO: Remove the following step in Antelope. + # NOTE: In Ansible 2.10 and lower the synchronize module used in the + # ansible/diagnostics.yml playbook does not respect SSH connection + # variables. This may result in Permission Denied issues if using an SSH + # key that is not in ~/.ssh. + - name: Copy SSH keypair to .ssh/ + run: | + install -d ~/.ssh -m 700 && + cp id_rsa* ~/.ssh/ + working-directory: ${{ github.workspace }}/terraform/aio + - name: Generate clouds.yaml run: | cat << EOF > clouds.yaml From 87c3562e201cd466919525664bb79beab50136a2 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 22 Aug 2024 11:04:17 +0100 Subject: [PATCH 13/16] CI: Fix previous version in multinode.yml for Caracal --- .github/workflows/stackhpc-multinode.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc-multinode.yml b/.github/workflows/stackhpc-multinode.yml index 353a07334..22f497a7d 100644 --- a/.github/workflows/stackhpc-multinode.yml +++ b/.github/workflows/stackhpc-multinode.yml @@ -66,6 +66,6 @@ jobs: ssh_key: ${{ inputs.ssh_key }} stackhpc_kayobe_config_version: ${{ github.ref_name }} # NOTE(upgrade): Reference the PREVIOUS release here. - stackhpc_kayobe_config_previous_version: stackhpc/zed + stackhpc_kayobe_config_previous_version: stackhpc/2023.1 terraform_kayobe_multinode_version: ${{ inputs.terraform_kayobe_multinode_version }} secrets: inherit From dd32fce050f087b066e89dca3f5e3010d7d176cf Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 22 Aug 2024 11:09:06 +0100 Subject: [PATCH 14/16] CI: Remove unnecessary step in aio --- .github/workflows/stackhpc-all-in-one.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index e8c14223e..3f5b0fd73 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -137,17 +137,6 @@ jobs: run: ssh-keygen -f id_rsa -N '' working-directory: ${{ github.workspace }}/terraform/aio - # TODO: Remove the following step in Antelope. - # NOTE: In Ansible 2.10 and lower the synchronize module used in the - # ansible/diagnostics.yml playbook does not respect SSH connection - # variables. This may result in Permission Denied issues if using an SSH - # key that is not in ~/.ssh. - - name: Copy SSH keypair to .ssh/ - run: | - install -d ~/.ssh -m 700 && - cp id_rsa* ~/.ssh/ - working-directory: ${{ github.workspace }}/terraform/aio - - name: Generate clouds.yaml run: | cat << EOF > clouds.yaml From 5beed7070185af04c13f0670126d65d83b82bc28 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 22 Aug 2024 11:17:16 +0100 Subject: [PATCH 15/16] Add a default for kolla_build_args It is not an official Kayobe variable, so doesn't have a default. --- etc/kayobe/kolla/kolla-build.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/kolla-build.conf b/etc/kayobe/kolla/kolla-build.conf index b444eae17..9f78e1125 100644 --- a/etc/kayobe/kolla/kolla-build.conf +++ b/etc/kayobe/kolla/kolla-build.conf @@ -8,7 +8,7 @@ base_tag = jammy-20231004 {% elif kolla_base_distro == 'rocky' %} base_tag = 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} {% endif %} -build_args = {{ kolla_build_args.items() | map('join', ':') | join(',') }} +build_args = {{ (kolla_build_args | default({})).items() | map('join', ':') | join(',') }} [openstack-base] type = git From fc0122103f7595775b2403e4c5fbef1944352109 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 22 Aug 2024 11:09:06 +0100 Subject: [PATCH 16/16] CI: Remove unnecessary step in aio --- .github/workflows/stackhpc-all-in-one.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index 056818b83..922515586 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -137,17 +137,6 @@ jobs: run: ssh-keygen -f id_rsa -N '' working-directory: ${{ github.workspace }}/terraform/aio - # TODO: Remove the following step in Antelope. - # NOTE: In Ansible 2.10 and lower the synchronize module used in the - # ansible/diagnostics.yml playbook does not respect SSH connection - # variables. This may result in Permission Denied issues if using an SSH - # key that is not in ~/.ssh. - - name: Copy SSH keypair to .ssh/ - run: | - install -d ~/.ssh -m 700 && - cp id_rsa* ~/.ssh/ - working-directory: ${{ github.workspace }}/terraform/aio - - name: Generate clouds.yaml run: | cat << EOF > clouds.yaml