feat(cd): deploy instances with attached cached states #8868

Merged
5 commits merged on Sep 19, 2024
Changes from all commits
107 changes: 69 additions & 38 deletions .github/workflows/cd-deploy-nodes-gcp.yml
@@ -42,27 +42,26 @@ on:
type: boolean
default: false

# TODO: Temporarily disabled to reduce network load, see #6894.
#push:
# # Skip main branch updates where Rust code and dependencies aren't modified.
# branches:
# - main
# paths:
# # code and tests
# - '**/*.rs'
# # hard-coded checkpoints and proptest regressions
# - '**/*.txt'
# # dependencies
# - '**/Cargo.toml'
# - '**/Cargo.lock'
# # configuration files
# - '.cargo/config.toml'
# - '**/clippy.toml'
# # workflow definitions
# - 'docker/**'
# - '.dockerignore'
# - '.github/workflows/cd-deploy-nodes-gcp.yml'
# - '.github/workflows/sub-build-docker-image.yml'
push:
# Skip main branch updates where Rust code and dependencies aren't modified.
branches:
- main
paths:
# code and tests
- '**/*.rs'
# hard-coded checkpoints and proptest regressions
- '**/*.txt'
# dependencies
- '**/Cargo.toml'
- '**/Cargo.lock'
# configuration files
- '.cargo/config.toml'
- '**/clippy.toml'
# workflow definitions
- 'docker/**'
- '.dockerignore'
- '.github/workflows/cd-deploy-nodes-gcp.yml'
- '.github/workflows/sub-build-docker-image.yml'

# Only runs the Docker image tests, doesn't deploy any instances
pull_request:
@@ -176,6 +175,19 @@ jobs:
test_variables: '-e NETWORK -e ZEBRA_CONF_PATH="zebrad/tests/common/configs/v1.0.0-rc.2.toml"'
network: ${{ inputs.network || vars.ZCASH_NETWORK }}

# Finds a `tip` cached state disk for zebra from the main branch
#
# Passes the disk name to subsequent jobs using `cached_disk_name` output
#
get-disk-name:
name: Get disk name
uses: ./.github/workflows/sub-find-cached-disks.yml
with:
network: ${{ inputs.network || vars.ZCASH_NETWORK }}
disk_prefix: zebrad-cache
disk_suffix: tip
prefer_main_cached_state: true
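As a rough guide, the image names these inputs select on follow the cache naming scheme used in gcp-get-cached-disks.sh further down; a minimal sketch of that shape, with a made-up state version and commit hash:

# Hedged sketch: shows the image-name shape only; the state version and
# commit hash below are illustrative, not real values.
DISK_PREFIX="zebrad-cache"   # from `disk_prefix` above
DISK_SUFFIX="tip"            # from `disk_suffix` above
NETWORK="Mainnet"
LOCAL_STATE_VERSION="26"     # illustrative
SHORT_SHA="abc1234"          # illustrative
echo "${DISK_PREFIX}-main-${SHORT_SHA}-v${LOCAL_STATE_VERSION}-${NETWORK}-${DISK_SUFFIX}"
# -> zebrad-cache-main-abc1234-v26-Mainnet-tip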

# Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet,
# with one node in the configured GCP region.
#
@@ -196,9 +208,11 @@ jobs:
matrix:
network: [Mainnet, Testnet]
name: Deploy ${{ matrix.network }} nodes
needs: [ build, versioning, test-configuration-file, test-zebra-conf-path ]
needs: [ build, versioning, test-configuration-file, test-zebra-conf-path, get-disk-name ]
runs-on: ubuntu-latest
timeout-minutes: 60
env:
CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
@@ -240,24 +254,31 @@ jobs:
# but the implementation is failing as it's requiring the disk names, contrary to what is stated in the official documentation
- name: Create instance template for ${{ matrix.network }}
run: |
NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
DISK_PARAMS="name=${NAME},device-name=${NAME},size=400GB,type=pd-ssd"
if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
else
echo "No cached disk found for ${{ matrix.network }} in main branch"
exit 1
fi
gcloud compute instance-templates create-with-container zebrad-${{ needs.versioning.outputs.major_version || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK} \
--boot-disk-size 300GB \
--machine-type ${{ vars.GCP_SMALL_MACHINE }} \
--boot-disk-size 50GB \
--boot-disk-type=pd-ssd \
--image-project=cos-cloud \
--image-family=cos-stable \
--user-output-enabled \
--metadata google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
--network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
--create-disk="${DISK_PARAMS}" \
--container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${NAME},mode=rw \
--container-stdin \
--container-tty \
--container-image ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} \
--container-env "NETWORK=${{ matrix.network }},LOG_FILE=${{ vars.CD_LOG_FILE }},LOG_COLOR=false,SENTRY_DSN=${{ vars.SENTRY_DSN }}" \
--create-disk=name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},device-name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},auto-delete=yes,size=300GB,type=pd-ssd,mode=rw \
--container-mount-disk=mount-path='/var/cache/zebrad-cache',name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},mode=rw \
--machine-type ${{ vars.GCP_SMALL_MACHINE }} \
--network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
--service-account ${{ vars.GCP_DEPLOYMENTS_SA }} \
--scopes cloud-platform \
--labels=app=zebrad,environment=prod,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \
--metadata google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
--labels=app=zebrad,environment=staging,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \
--tags zebrad
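
In short, attaching the cached state comes down to appending an image= key to the --create-disk parameter string; a minimal sketch with placeholder names (the real ones are built from the branch slug, short SHA and network):

# Sketch only: placeholder names, not the values generated in CI.
NAME="zebrad-cache-my-branch-abc1234-Mainnet"
CACHED_DISK_NAME="zebrad-cache-main-def5678-v26-Mainnet-tip"   # output of get-disk-name
DISK_PARAMS="name=${NAME},device-name=${NAME},size=400GB,type=pd-ssd"
DISK_PARAMS+=",image=${CACHED_DISK_NAME}"
echo "${DISK_PARAMS}"
# Passed as --create-disk="${DISK_PARAMS}", gcloud seeds the data disk from
# that image instead of creating an empty 400GB disk.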

# Check if our destination instance group exists already
@@ -297,9 +318,11 @@ jobs:
# Note: these instances are not automatically replaced or deleted
deploy-instance:
name: Deploy single ${{ inputs.network }} instance
needs: [ build, test-configuration-file, test-zebra-conf-path ]
needs: [ build, test-configuration-file, test-zebra-conf-path, get-disk-name ]
runs-on: ubuntu-latest
timeout-minutes: 30
env:
CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
@@ -340,22 +363,30 @@ jobs:
# Create instance template from container image
- name: Manual deploy of a single ${{ inputs.network }} instance running zebrad
run: |
NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
DISK_PARAMS="name=${NAME},device-name=${NAME},size=400GB,type=pd-ssd"
if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
else
echo "No cached disk found for ${{ matrix.network }} in main branch"
exit 1
fi
gcloud compute instances create-with-container "zebrad-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}" \
--boot-disk-size 300GB \
--machine-type ${{ vars.GCP_SMALL_MACHINE }} \
--boot-disk-size 50GB \
--boot-disk-type=pd-ssd \
--image-project=cos-cloud \
--image-family=cos-stable \
--user-output-enabled \
--metadata google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
--network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
--create-disk="${DISK_PARAMS}" \
--container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${NAME},mode=rw \
--container-stdin \
--container-tty \
--container-image ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} \
--container-env "NETWORK=${{ inputs.network }},LOG_FILE=${{ inputs.log_file }},LOG_COLOR=false,SENTRY_DSN=${{ vars.SENTRY_DSN }}" \
--create-disk=name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},device-name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},auto-delete=yes,size=300GB,type=pd-ssd,mode=rw \
--container-mount-disk=mount-path='/var/cache/zebrad-cache',name=zebrad-cache-${{ env.GITHUB_SHA_SHORT }}-${NETWORK},mode=rw \
--machine-type ${{ vars.GCP_SMALL_MACHINE }} \
--network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
--service-account ${{ vars.GCP_DEPLOYMENTS_SA }} \
--scopes cloud-platform \
--metadata google-logging-enabled=true,google-monitoring-enabled=true \
--labels=app=zebrad,environment=qa,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \
--tags zebrad \
--zone ${{ vars.GCP_ZONE }}
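
Because this job is driven by workflow inputs, a run can also be started from the CLI; a hedged sketch, assuming network is exposed as a workflow_dispatch input and the remaining inputs keep their defaults:

# Assumption: cd-deploy-nodes-gcp.yml accepts a `network` dispatch input;
# other inputs are left at their defaults.
gh workflow run cd-deploy-nodes-gcp.yml -f network=Testnet
# Optionally follow the run that was just started:
gh run watch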
42 changes: 0 additions & 42 deletions .github/workflows/scripts/gcp-get-available-disks.sh

This file was deleted.

114 changes: 76 additions & 38 deletions .github/workflows/scripts/gcp-get-cached-disks.sh
@@ -1,20 +1,33 @@
#!/usr/bin/env bash

# Description:
# This script finds a cached Google Cloud Compute image based on specific criteria.
# It prioritizes images from the current commit, falls back to the main branch,
# and finally checks other branches if needed. The selected image is used for
# setting up the environment in a CI/CD pipeline.
#
# If there are multiple disks:
# - prefer images generated from the same commit, then
# - if prefer_main_cached_state is true, prefer images from the `main` branch, then
# - use any images from any other branch or commit.
#
# Within each of these categories:
# - prefer newer images to older images
#
# The selected image is used for setting up the environment in a CI/CD pipeline.
# It also checks if specific disk types are available for subsequent jobs.

set -eo pipefail

# Function to find and report a cached disk image
# Extract local state version
echo "Extracting local state version..."
LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "${GITHUB_WORKSPACE}/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1)
echo "STATE_VERSION: ${LOCAL_STATE_VERSION}"

# Function to find a cached disk image based on the git pattern (commit, main, or any branch)
find_cached_disk_image() {
local search_pattern="${1}"
local git_pattern="${1}"
local git_source="${2}"
local disk_name
local disk_search_pattern="${DISK_PREFIX}-${git_pattern}-v${LOCAL_STATE_VERSION}-${NETWORK}-${DISK_SUFFIX}"

disk_name=$(gcloud compute images list --filter="status=READY AND name~${search_pattern}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
disk_name=$(gcloud compute images list --filter="status=READY AND name~${disk_search_pattern}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)

# Use >&2 to redirect to stderr and avoid sending wrong assignments to stdout
if [[ -n "${disk_name}" ]]; then
@@ -27,46 +40,71 @@ find_cached_disk_image() {
fi
}

# Extract local state version
echo "Extracting local state version..."
LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "${GITHUB_WORKSPACE}/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1)
echo "STATE_VERSION: ${LOCAL_STATE_VERSION}"
# Check if both $DISK_PREFIX and $DISK_SUFFIX are set, as they are required to find a cached disk image
if [[ -n "${DISK_PREFIX}" && -n "${DISK_SUFFIX}" ]]; then
# Find the most suitable cached disk image
echo "Finding the most suitable cached disk image..."
CACHED_DISK_NAME=""

# First, try to find a cached disk image from the current commit
CACHED_DISK_NAME=$(find_cached_disk_image ".+-${GITHUB_SHA_SHORT}" "commit")

# Define DISK_PREFIX based on the requiring state directory
if [[ "${NEEDS_LWD_STATE}" == "true" ]]; then
DISK_PREFIX="${LWD_STATE_DIR}"
# If no cached disk image is found
if [[ -z "${CACHED_DISK_NAME}" ]]; then
# Check if main branch images are preferred
if [[ "${PREFER_MAIN_CACHED_STATE}" == "true" ]]; then
CACHED_DISK_NAME=$(find_cached_disk_image "main-[0-9a-f]+" "main branch")
# Else, try to find one from any branch
else
CACHED_DISK_NAME=$(find_cached_disk_image ".+-[0-9a-f]+" "any branch")
fi
fi

# Handle case where no suitable disk image is found
if [[ -z "${CACHED_DISK_NAME}" ]]; then
echo "No suitable cached state disk available."
echo "Cached state test jobs must depend on the cached state rebuild job."
exit 1
fi

echo "Selected Disk: ${CACHED_DISK_NAME}"
else
DISK_PREFIX="${ZEBRA_STATE_DIR:-${DISK_PREFIX}}"
echo "DISK_PREFIX or DISK_SUFFIX is not set. Skipping disk image search."
fi

# Find the most suitable cached disk image
echo "Finding the most suitable cached disk image..."
if [[ -z "${CACHED_DISK_NAME}" ]]; then
# Try to find a cached disk image from the current commit
COMMIT_DISK_PREFIX="${DISK_PREFIX}-.+-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${DISK_SUFFIX}"
CACHED_DISK_NAME=$(find_cached_disk_image "${COMMIT_DISK_PREFIX}" "commit")
# If no cached disk image is found, try to find one from the main branch
if [[ "${PREFER_MAIN_CACHED_STATE}" == "true" ]]; then
MAIN_DISK_PREFIX="${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${DISK_SUFFIX}"
CACHED_DISK_NAME=$(find_cached_disk_image "${MAIN_DISK_PREFIX}" "main branch")
# Else, try to find one from any branch
# Function to find and output available disk image types (e.g., lwd_tip_disk, zebra_tip_disk, zebra_checkpoint_disk)
find_available_disk_type() {
local base_name="${1}"
local disk_type="${2}"
local disk_pattern="${base_name}-cache"
local output_var="${base_name}_${disk_type}_disk"
local disk_name

disk_name=$(gcloud compute images list --filter="status=READY AND name~${disk_pattern}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${disk_type}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)

# Use >&2 to redirect to stderr and avoid sending wrong assignments to stdout
if [[ -n "${disk_name}" ]]; then
echo "Found ${disk_type^^} disk: ${disk_name} for ${base_name^^} on network: ${NETWORK}" >&2
disk_description=$(gcloud compute images describe "${disk_name}" --format="value(DESCRIPTION)")
echo "Description: ${disk_description}" >&2
echo "true" # This is the actual return value when a disk is found
else
ANY_DISK_PREFIX="${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${DISK_SUFFIX}"
CACHED_DISK_NAME=$(find_cached_disk_image "${ANY_DISK_PREFIX}" "any branch")
echo "No ${disk_type^^} disk found for ${base_name^^} on network: ${NETWORK}" >&2
echo "false" # This is the actual return value when no disk is found
fi
}
if [[ -n "${NETWORK}" ]]; then
# Check for specific disk images (lwd_tip_disk, zebra_tip_disk, zebra_checkpoint_disk)
echo "Checking for specific disk images..."
LWD_TIP_DISK=$(find_available_disk_type "lwd" "tip")
ZEBRA_TIP_DISK=$(find_available_disk_type "zebrad" "tip")
ZEBRA_CHECKPOINT_DISK=$(find_available_disk_type "zebrad" "checkpoint")
fi

# Handle case where no suitable disk image is found
if [[ -z "${CACHED_DISK_NAME}" ]]; then
echo "No suitable cached state disk available."
echo "Expected pattern: ${COMMIT_DISK_PREFIX}"
echo "Cached state test jobs must depend on the cached state rebuild job."
exit 1
fi

echo "Selected Disk: ${CACHED_DISK_NAME}"

# Exporting variables for subsequent steps
echo "Exporting variables for subsequent steps..."
export CACHED_DISK_NAME="${CACHED_DISK_NAME}"
export LOCAL_STATE_VERSION="${LOCAL_STATE_VERSION}"
export LWD_TIP_DISK="${LWD_TIP_DISK}"
export ZEBRA_TIP_DISK="${ZEBRA_TIP_DISK}"
export ZEBRA_CHECKPOINT_DISK="${ZEBRA_CHECKPOINT_DISK}"
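
Putting the selection logic above together: the script first looks for an image built from the current commit, then falls back to either the main branch or any branch depending on PREFER_MAIN_CACHED_STATE, always taking the newest READY image that matches. A sketch of the resulting queries, with made-up values for the state version, network and commit:

# Illustrative only: the state version, network and short SHA are example values.
STATE=26; NET=Mainnet; SHA=abc1234
# 1. Images built from the current commit:
gcloud compute images list \
  --filter="status=READY AND name~zebrad-cache-.+-${SHA}-v${STATE}-${NET}-tip" \
  --sort-by=~creationTimestamp --limit=1 --format="value(NAME)"
# 2. If nothing matched and PREFER_MAIN_CACHED_STATE=true, main-branch images:
#      name~zebrad-cache-main-[0-9a-f]+-v${STATE}-${NET}-tip
# 3. Otherwise, images from any branch or commit:
#      name~zebrad-cache-.+-[0-9a-f]+-v${STATE}-${NET}-tip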