diff --git a/.github/workflows/firewood-chaos-test.yml b/.github/workflows/firewood-chaos-test.yml new file mode 100644 index 000000000000..5b2e615512bd --- /dev/null +++ b/.github/workflows/firewood-chaos-test.yml @@ -0,0 +1,111 @@ +name: Firewood Chaos Test + +on: + workflow_dispatch: + inputs: + test: + description: 'Test name to run (e.g., firewood-101-250k). Leave empty to use custom inputs below.' + default: '' + # Custom inputs (used when test is not provided) + start-block: + description: 'The start block for the benchmark.' + default: '' + end-block: + description: 'The end block for the benchmark.' + default: '' + block-dir-src: + description: 'The source block directory. Supports S3 directory/zip and local directories.' + default: '' + current-state-dir-src: + description: 'The current state directory. Supports S3 directory/zip and local directories.' + default: '' + # Chaos test specific inputs + config: + description: 'VM config preset (firewood, firewood-archive). Required for custom tests.' + default: 'firewood' + min-wait-time: + description: 'Minimum wait time before killing the process (e.g., 120s, 2m).' + default: '120s' + max-wait-time: + description: 'Maximum wait time before killing the process (e.g., 150s, 3m).' + default: '150s' + runner: + description: 'Runner to execute the chaos test. Input to the runs-on field of the job.' + required: true + timeout-minutes: + description: 'Timeout in minutes for the job.' 
+ default: '60' + # XXX: remove this before merging + pull_request: + schedule: + - cron: '0 9 * * *' # Runs every day at 09:00 UTC (04:00 EST) + +jobs: + define-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.define-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + - name: Define Matrix + id: define-matrix + shell: bash -x {0} + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + { + echo "matrix<> "$GITHUB_OUTPUT" + else + { + echo "matrix<> "$GITHUB_OUTPUT" + fi + + firewood-chaos-test: + needs: define-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.define-matrix.outputs.matrix) }} + timeout-minutes: ${{ matrix.timeout-minutes || 60 }} + runs-on: ${{ matrix.runner || 'ubuntu-latest' }} + permissions: + id-token: write + contents: read + steps: + - uses: cachix/install-nix-action@02a151ada4993995686f9ed4f1be7cfbb229e56f #v31 + with: + github_access_token: ${{ secrets.GITHUB_TOKEN }} + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_S3_READ_ONLY_ROLE }} + aws-region: 'us-east-2' + role-duration-seconds: '43200' + - uses: actions/checkout@v4 + - name: Run chaos test with Firewood + shell: nix develop --impure --command bash -x {0} + run: ./scripts/run_task.sh test-cchain-reexecution -- "${{ matrix.test || '' }}" + env: + CHAOS_MODE: '1' + START_BLOCK: ${{ matrix.start-block }} + END_BLOCK: ${{ matrix.end-block }} + BLOCK_DIR_SRC: ${{ matrix.block-dir-src }} + CURRENT_STATE_DIR_SRC: ${{ matrix.current-state-dir-src }} + CONFIG: ${{ matrix.config }} + MIN_WAIT_TIME: ${{ matrix.min-wait-time }} + MAX_WAIT_TIME: ${{ matrix.max-wait-time }} + diff --git a/scripts/benchmark_cchain_range.sh b/scripts/benchmark_cchain_range.sh index df37f8a01fb5..1ac92cee635a 100755 --- a/scripts/benchmark_cchain_range.sh +++ b/scripts/benchmark_cchain_range.sh @@ -2,7 +2,7 @@ set -euo pipefail -# C-Chain Re-execution Benchmark Script +# 
C-Chain Re-execution Benchmark and Chaos Test Script # # Usage: # ./benchmark_cchain_range.sh [test-name] @@ -11,8 +11,12 @@ set -euo pipefail # without a test name and without required env vars. # # Test names configure defaults for S3 sources and block ranges. +# If running in chaos mode, test names also configure defaults for the VM Config +# and min/max wait times. # All defaults can be overridden via environment variables. # +# Note: chaos tests can only be run with firewood VM configs. +# # Environment variables: # Data sources (provide S3 sources OR local paths): # BLOCK_DIR_SRC: S3 object key for blocks (triggers S3 import). @@ -24,14 +28,20 @@ set -euo pipefail # START_BLOCK: The starting block height (inclusive). # END_BLOCK: The ending block height (inclusive). # -# Optional: -# CONFIG: VM config preset (default, archive, firewood). +# Optional (reexecution tests): +# CONFIG: VM config preset (default, archive, firewood, firewood-archive). # LABELS: Comma-separated key=value pairs for metric labels. # BENCHMARK_OUTPUT_FILE: If set, benchmark output is also written to this file. # METRICS_SERVER_ENABLED: If set, enables the metrics server. # METRICS_SERVER_PORT: If set, determines the port the metrics server will listen to. # METRICS_COLLECTOR_ENABLED: If set, enables the metrics collector. # PUSH_POST_STATE: S3 destination to push current-state after execution. +# +# Required (chaos tests): +# CHAOS_MODE: Set to enable chaos test mode (e.g., CHAOS_MODE=1). +# CONFIG: VM config preset (firewood or firewood-archive only). +# MIN_WAIT_TIME: Minimum wait before crash (e.g., 120s). +# MAX_WAIT_TIME: Maximum wait before crash (e.g., 150s). 
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -51,11 +61,13 @@ Usage: $0 [test-name] Available tests: help - Show this help message + default - Quick test run (blocks 101-200, hashdb) hashdb-101-250k - Blocks 101-250k with hashdb hashdb-archive-101-250k - Blocks 101-250k with hashdb archive hashdb-33m-33m500k - Blocks 33m-33.5m with hashdb firewood-101-250k - Blocks 101-250k with firewood + firewood-archive-101-250k - Blocks 101-250k with firewood archive firewood-33m-33m500k - Blocks 33m-33.5m with firewood firewood-33m-40m - Blocks 33m-40m with firewood EOF @@ -102,6 +114,13 @@ if [[ -n "$TEST_NAME" ]]; then END_BLOCK="${END_BLOCK:-250000}" CONFIG="${CONFIG:-firewood}" ;; + firewood-archive-101-250k) + BLOCK_DIR_SRC="${BLOCK_DIR_SRC:-cchain-mainnet-blocks-1m-ldb}" + CURRENT_STATE_DIR_SRC="${CURRENT_STATE_DIR_SRC:-cchain-current-state-firewood-archive-100}" + START_BLOCK="${START_BLOCK:-101}" + END_BLOCK="${END_BLOCK:-250000}" + CONFIG="${CONFIG:-firewood-archive}" + ;; firewood-33m-33m500k) BLOCK_DIR_SRC="${BLOCK_DIR_SRC:-cchain-mainnet-blocks-30m-40m-ldb}" CURRENT_STATE_DIR_SRC="${CURRENT_STATE_DIR_SRC:-cchain-current-state-firewood-33m}" @@ -122,6 +141,12 @@ if [[ -n "$TEST_NAME" ]]; then esac fi +# Set chaos test defaults when using a defined test with CHAOS_MODE +if [[ -n "${CHAOS_MODE:-}" && -n "${TEST_NAME:-}" ]]; then + MIN_WAIT_TIME="${MIN_WAIT_TIME:-120s}" + MAX_WAIT_TIME="${MAX_WAIT_TIME:-150s}" +fi + # Determine data source: S3 import or local paths if [[ -n "${BLOCK_DIR_SRC:-}" && -n "${CURRENT_STATE_DIR_SRC:-}" ]]; then # S3 mode - import data @@ -150,6 +175,11 @@ elif [[ -z "${BLOCK_DIR:-}" || -z "${CURRENT_STATE_DIR:-}" ]]; then echo " Block range:" [[ -n "${START_BLOCK:-}" ]] && echo " START_BLOCK: ${START_BLOCK}" || echo " START_BLOCK: (not set)" [[ -n "${END_BLOCK:-}" ]] && echo " END_BLOCK: ${END_BLOCK}" || echo " END_BLOCK: (not set)" + if [[ -n "${CHAOS_MODE:-}" ]]; then + echo " Timeouts (chaos tests):" + [[ -n 
"${MIN_WAIT_TIME:-}" ]] && echo " MIN_WAIT_TIME: ${MIN_WAIT_TIME}" || echo " MIN_WAIT_TIME: (not set)" + [[ -n "${MAX_WAIT_TIME:-}" ]] && echo " MAX_WAIT_TIME: ${MAX_WAIT_TIME}" || echo " MAX_WAIT_TIME: (not set)" + fi exit 1 fi @@ -158,25 +188,47 @@ if [[ -z "${START_BLOCK:-}" || -z "${END_BLOCK:-}" ]]; then error "START_BLOCK and END_BLOCK are required" fi -echo "=== C-Chain Re-execution: ${TEST_NAME:-custom} ===" +if [[ -n "${CHAOS_MODE:-}" ]]; then + # Chaos tests require additional validation + if [[ -z "${MIN_WAIT_TIME:-}" || -z "${MAX_WAIT_TIME:-}" || -z "${CONFIG:-}" ]]; then + error "MIN_WAIT_TIME and MAX_WAIT_TIME and CONFIG are required for chaos tests" + fi + + echo "=== Firewood Chaos Test: ${TEST_NAME:-custom} ===" + echo "Crashing between ${MIN_WAIT_TIME} and ${MAX_WAIT_TIME}" +else + echo "=== C-Chain Re-execution Test: ${TEST_NAME:-custom} ===" +fi + echo "Blocks: ${START_BLOCK} - ${END_BLOCK}" -echo "Config: ${CONFIG:-default}" - -echo "=== Running re-execution ===" -go run github.com/ava-labs/avalanchego/tests/reexecute/c \ - --block-dir="${BLOCK_DIR}" \ - --current-state-dir="${CURRENT_STATE_DIR}" \ - ${RUNNER_TYPE:+--runner="${RUNNER_TYPE}"} \ - ${CONFIG:+--config="${CONFIG}"} \ - --start-block="${START_BLOCK}" \ - --end-block="${END_BLOCK}" \ - ${LABELS:+--labels="${LABELS}"} \ - ${BENCHMARK_OUTPUT_FILE:+--benchmark-output-file="${BENCHMARK_OUTPUT_FILE}"} \ - ${METRICS_SERVER_ENABLED:+--metrics-server-enabled="${METRICS_SERVER_ENABLED}"} \ - ${METRICS_SERVER_PORT:+--metrics-server-port="${METRICS_SERVER_PORT}"} \ - ${METRICS_COLLECTOR_ENABLED:+--metrics-collector-enabled="${METRICS_COLLECTOR_ENABLED}"} - -if [[ -n "${PUSH_POST_STATE:-}" ]]; then - echo "=== Pushing post-state to S3 ===" - "${SCRIPT_DIR}/copy_dir.sh" "${CURRENT_STATE_DIR}/" "${PUSH_POST_STATE}" +echo "CONFIG: ${CONFIG:-default}" + +echo "=== Running Test ===" +if [[ -n "${CHAOS_MODE:-}" ]]; then + go run ./tests/reexecute/chaos \ + --start-block="${START_BLOCK}" \ + 
--end-block="${END_BLOCK}" \ + --current-state-dir="${CURRENT_STATE_DIR}" \ + --block-dir="${BLOCK_DIR}" \ + --min-wait-time="${MIN_WAIT_TIME}" \ + --max-wait-time="${MAX_WAIT_TIME}" \ + --config="${CONFIG}" +else + go run github.com/ava-labs/avalanchego/tests/reexecute/c \ + --block-dir="${BLOCK_DIR}" \ + --current-state-dir="${CURRENT_STATE_DIR}" \ + ${RUNNER_TYPE:+--runner="${RUNNER_TYPE}"} \ + ${CONFIG:+--config="${CONFIG}"} \ + --start-block="${START_BLOCK}" \ + --end-block="${END_BLOCK}" \ + ${LABELS:+--labels="${LABELS}"} \ + ${BENCHMARK_OUTPUT_FILE:+--benchmark-output-file="${BENCHMARK_OUTPUT_FILE}"} \ + ${METRICS_SERVER_ENABLED:+--metrics-server-enabled="${METRICS_SERVER_ENABLED}"} \ + ${METRICS_SERVER_PORT:+--metrics-server-port="${METRICS_SERVER_PORT}"} \ + ${METRICS_COLLECTOR_ENABLED:+--metrics-collector-enabled="${METRICS_COLLECTOR_ENABLED}"} + + if [[ -n "${PUSH_POST_STATE:-}" ]]; then + echo "=== Pushing post-state to S3 ===" + "${SCRIPT_DIR}/copy_dir.sh" "${CURRENT_STATE_DIR}/" "${PUSH_POST_STATE}" + fi fi diff --git a/tests/reexecute/chaos/main.go b/tests/reexecute/chaos/main.go new file mode 100644 index 000000000000..5ed575563664 --- /dev/null +++ b/tests/reexecute/chaos/main.go @@ -0,0 +1,257 @@ +// Copyright (C) 2019, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. 
+ +package main + +import ( + "context" + "flag" + "fmt" + "maps" + "math/rand" + "os" + "os/exec" + "path/filepath" + "slices" + "strconv" + "strings" + "syscall" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/api/metrics" + "github.com/ava-labs/avalanchego/database" + "github.com/ava-labs/avalanchego/database/leveldb" + "github.com/ava-labs/avalanchego/graft/coreth/plugin/evm" + "github.com/ava-labs/avalanchego/tests" + "github.com/ava-labs/avalanchego/tests/reexecute" + "github.com/ava-labs/avalanchego/utils/logging" +) + +var ( + blockDirArg string + currentStateDirArg string + startBlockArg uint64 + endBlockArg uint64 + minWaitTimeArg time.Duration + maxWaitTimeArg time.Duration + configNameArg string + configBytesArg []byte + + predefinedConfigs = map[string]string{ + "firewood": `{ + "state-scheme": "firewood", + "snapshot-cache": 0, + "pruning-enabled": true, + "state-sync-enabled": false + }`, + "firewood-archive": `{ + "state-scheme": "firewood", + "snapshot-cache": 0, + "pruning-enabled": false, + "state-sync-enabled": false + }`, + } +) + +func init() { + evm.RegisterAllLibEVMExtras() + + flag.StringVar(&blockDirArg, "block-dir", blockDirArg, "Block DB directory to read from during re-execution.") + flag.StringVar(¤tStateDirArg, "current-state-dir", currentStateDirArg, "Current state directory including VM DB and Chain Data Directory for re-execution.") + flag.Uint64Var(&startBlockArg, "start-block", 101, "Start block to begin execution (exclusive).") + flag.Uint64Var(&endBlockArg, "end-block", 200, "End block to end execution (inclusive).") + flag.DurationVar(&minWaitTimeArg, "min-wait-time", 20*time.Second, "Minimum amount of time to wait before crashing.") + flag.DurationVar(&maxWaitTimeArg, "max-wait-time", 30*time.Second, "Maximum amount of time to wait before crashing.") + + predefinedConfigKeys := 
slices.Collect(maps.Keys(predefinedConfigs)) + predefinedConfigOptionsStr := fmt.Sprintf("[%s]", strings.Join(predefinedConfigKeys, ", ")) + flag.StringVar(&configNameArg, "config", configNameArg, fmt.Sprintf("Specifies the predefined config to use for the VM. Options include %s.", predefinedConfigOptionsStr)) + + flag.Parse() + + predefinedConfigStr, ok := predefinedConfigs[configNameArg] + if !ok { + fmt.Fprintf(os.Stderr, "invalid config name %q. Valid options include %s.\n", configNameArg, predefinedConfigOptionsStr) + os.Exit(1) + } + configBytesArg = []byte(predefinedConfigStr) +} + +func main() { + tc := tests.NewTestContext(tests.NewDefaultLogger("chaos-test")) + tc.SetDefaultContextParent(context.Background()) + tc.RecoverAndExit() + + run( + tc, + minWaitTimeArg, + maxWaitTimeArg, + blockDirArg, + currentStateDirArg, + startBlockArg, + endBlockArg, + configNameArg, + configBytesArg, + ) +} + +// run executes a chaos test that simulates an application crash during C-Chain +// block reexecution that uses Firewood. It verifies that the VM can recover from +// an unexpected termination and resume processing from the correct block height +// using persisted state. +// +// Running the chaos test involves a few steps: +// 1. Start a reexecution test process using the Firewood state scheme +// 2. Allow the reexecution test to run for the specified wait duration +// 3. Forcefully terminate the process with SIGKILL to simulate a crash +// 4. Open the VM database to read the last accepted block height from persisted state +// 5. 
Restart the reexecution test from the recovered height to verify state consistency +func run( + tc tests.TestContext, + minWaitTime time.Duration, + maxWaitTime time.Duration, + blockDir string, + currentStateDir string, + startBlock uint64, + endBlock uint64, + configName string, + configBytes []byte, +) { + r := require.New(tc) + log := tc.Log() + + cmd := createReexecutionCmd(blockDir, currentStateDir, startBlock, endBlock, configName) + // Set process group ID so we can kill all child processes + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + // 1. Start a reexecution test process using the Firewood state scheme + r.NoError(cmd.Start()) + + done := make(chan error, 1) + go func() { + done <- cmd.Wait() + }() + + // 2. Allow the reexecution test to run for the specified wait duration + waitTime := time.Duration(rand.Int63n(int64(maxWaitTime-minWaitTime)+1)) + minWaitTime + log.Debug("started reexecution test", zap.Duration("wait time", waitTime)) + + time.Sleep(waitTime) + + // 3. Forcefully terminate the process with SIGKILL to simulate a crash + select { + case waitErr := <-done: + r.FailNow("reexecution test terminated prior to crash test", zap.Error(waitErr)) + default: + pgid, err := syscall.Getpgid(cmd.Process.Pid) + r.NoError(err) + + log.Debug("killing reexecution test") + + r.NoError(syscall.Kill(-pgid, syscall.SIGKILL)) + + waitCtx := tc.DefaultContext() + + var waitErr error + select { + case err := <-done: + waitErr = err + case <-waitCtx.Done(): + r.FailNow("timed out waiting for killed process to terminate") + } + + exitErr, ok := waitErr.(*exec.ExitError) + r.True(ok) + + // ExitCode() returns -1 when killed by signal + r.Equal(-1, exitErr.ProcessState.ExitCode(), "unexpected exit code after kill") + } + + var ( + vmDBDir = filepath.Join(currentStateDir, "db") + chainDataDir = filepath.Join(currentStateDir, "chain-data-dir") + ) + + // 4. 
Open the VM database to read the last accepted block height from persisted state + db, err := openDB(vmDBDir, 10) + r.NoError(err) + + ctx := tc.GetDefaultContextParent() + vm, err := reexecute.NewMainnetCChainVM( + ctx, + db, + chainDataDir, + configBytes, + metrics.NewPrefixGatherer(), + prometheus.NewRegistry(), + ) + r.NoError(err) + + lastAcceptedID, err := vm.LastAccepted(ctx) + r.NoError(err) + + lastAcceptedBlock, err := vm.GetBlock(ctx, lastAcceptedID) + r.NoError(err) + + r.NoError(vm.Shutdown(ctx)) + r.NoError(db.Close()) + + log.Debug("read VM", zap.Uint64("latest height", lastAcceptedBlock.Height())) + + cmd = createReexecutionCmd(blockDir, currentStateDir, lastAcceptedBlock.Height()+1, endBlock, configName) + + // 5. Restart the reexecution test from the recovered height to verify state consistency + r.NoError(cmd.Run()) +} + +// openDB attempts to open a LevelDB database with retry logic and linear backoff. +// This is necessary after killing a process that held the database open, as the OS may +// need time to release file locks even after the process terminates. +// +// The backoff strategy increases by 500ms per attempt (500ms, 1s, 1.5s, 2s, ...). +func openDB(dbDir string, maxAttempts int) (database.Database, error) { + attempt := 0 + for { + db, err := leveldb.New(dbDir, nil, logging.NoLog{}, prometheus.NewRegistry()) + if err == nil { + return db, nil + } + + attempt += 1 + if attempt == maxAttempts { + return nil, fmt.Errorf("failed to reopen db after %d attempts: %w", maxAttempts, err) + } + + backoff := time.Duration(attempt) * 500 * time.Millisecond + time.Sleep(backoff) + } +} + +// createReexecutionCmd constructs a command to run the C-Chain reexecution test. 
//
// It invokes `go run github.com/ava-labs/avalanchego/tests/reexecute/c`,
// forwarding the block directory, current-state directory, block range, and
// VM config name as flags. The returned command has not been started; its
// stdout and stderr are wired to this process's stdout and stderr.
func createReexecutionCmd(
	blockDir string,
	currentStateDir string,
	startBlock uint64,
	endBlock uint64,
	configName string,
) *exec.Cmd {
	cmd := exec.Command("go",
		"run",
		"github.com/ava-labs/avalanchego/tests/reexecute/c",
		"--block-dir="+blockDir,
		"--current-state-dir="+currentStateDir,
		// FormatUint covers the full uint64 range; converting through int
		// would wrap for heights above math.MaxInt.
		"--start-block="+strconv.FormatUint(startBlock, 10),
		"--end-block="+strconv.FormatUint(endBlock, 10),
		// configName is the only --config passed, so the caller's choice
		// (e.g. firewood-archive) is never shadowed by a hard-coded value.
		"--config="+configName,
	)

	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	return cmd
}