From c7672d5cf667348d729cad454325172fe47effc8 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 8 Oct 2025 10:32:23 +0300 Subject: [PATCH 1/6] Sync prod base to staging Signed-off-by: Radoslav Dimitrov --- .github/workflows/sync-db.yml | 219 ++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 .github/workflows/sync-db.yml diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml new file mode 100644 index 00000000..b6006bfa --- /dev/null +++ b/.github/workflows/sync-db.yml @@ -0,0 +1,219 @@ +name: Sync Production DB to Staging + +on: + schedule: + # Run daily at 2 AM UTC (during low-traffic hours) + - cron: '0 2 * * *' + workflow_dispatch: # Allow manual triggering + +permissions: + contents: read + +jobs: + sync-database: + name: Sync Prod DB to Staging + runs-on: ubuntu-latest + environment: staging + concurrency: + group: sync-staging-database + cancel-in-progress: false + steps: + - name: Authenticate to Google Cloud (Production) + uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 + with: + credentials_json: ${{ secrets.GCP_PROD_SERVICE_ACCOUNT_KEY }} + + - name: Setup Google Cloud SDK + uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db + with: + project_id: mcp-registry-prod + install_components: gke-gcloud-auth-plugin + + - name: Get prod GKE credentials + run: | + gcloud container clusters get-credentials mcp-registry-prod \ + --zone=us-central1-b \ + --project=mcp-registry-prod + + - name: Dump production database + run: | + # Create a job to dump the database from within the prod cluster + kubectl apply -f - < pg_backend_pid();" || true + + echo "Dropping and recreating database..." + PGPASSWORD=\$POSTGRES_PASSWORD psql -h registry-pg-rw -U postgres -d postgres \ + -c "DROP DATABASE IF EXISTS app;" + PGPASSWORD=\$POSTGRES_PASSWORD psql -h registry-pg-rw -U postgres -d postgres \ + -c "CREATE DATABASE app;" + + echo "Restoring database from backup..." + PGPASSWORD=\$POSTGRES_PASSWORD pg_restore \ + -h registry-pg-rw \ + -U postgres \ + -d app \ + --verbose \ + --no-owner \ + --no-acl \ + /backup/prod-backup.dump + + echo "✓ Database restore completed successfully" + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: registry-pg-superuser + key: password + volumeMounts: + - name: backup-data + mountPath: /backup + volumes: + - name: backup-data + configMap: + name: prod-backup + EOF + + # Wait for restore to complete + kubectl wait --for=condition=complete job -l job-name --timeout=600s -n default || { + echo "Restore job failed" + POD_NAME=$(kubectl get pods -l job-name -n default --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1].metadata.name}') + kubectl logs $POD_NAME -n default --tail=100 + exit 1 + } + + echo "✓ Restore completed successfully" + + - name: Verify staging DB is functional + if: always() + run: | + # Create a verification pod + kubectl run pg-verify-$(date +%s) \ + --image=postgres:15 \ + --rm -i --restart=Never \ + --env="PGPASSWORD=$(kubectl get secret registry-pg-superuser -n default -o jsonpath='{.data.password}' | base64 -d)" \ + -- bash -c ' + TABLE_COUNT=$(psql -h registry-pg-rw -U postgres -d app -tAc "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '\''public'\'';") + + if [ "$TABLE_COUNT" -lt 1 ]; then + echo "ERROR: Staging DB has no tables!" 
+ exit 1 + fi + + echo "✓ Staging DB is healthy with $TABLE_COUNT tables" + + echo "Top 10 tables by row count:" + psql -h registry-pg-rw -U postgres -d app \ + -c "SELECT schemaname, tablename, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;" + ' + + - name: Cleanup + if: always() + run: | + # Remove dump file + rm -f /tmp/prod-backup.dump + + # Remove configmap + kubectl delete configmap prod-backup -n default || true + + # Clean up old jobs (keep last 3) + kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep -E 'pg-dump-|pg-restore-' | head -n -3 | xargs -r kubectl delete -n default || true From ec6aa7375595ffb2f6ae80f81f547d028cc33166 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 8 Oct 2025 14:25:47 +0300 Subject: [PATCH 2/6] Use the existing backups of prod instead Signed-off-by: Radoslav Dimitrov --- .github/workflows/sync-db.yml | 391 ++++++++++++++++++++++++---------- 1 file changed, 273 insertions(+), 118 deletions(-) diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml index b6006bfa..7d7c7386 100644 --- a/.github/workflows/sync-db.yml +++ b/.github/workflows/sync-db.yml @@ -1,4 +1,4 @@ -name: Sync Production DB to Staging +name: Sync Production DB to Staging (from backups) on: schedule: @@ -11,7 +11,7 @@ permissions: jobs: sync-database: - name: Sync Prod DB to Staging + name: Sync Prod DB to Staging from k8up Backups runs-on: ubuntu-latest environment: staging concurrency: @@ -29,94 +29,177 @@ jobs: project_id: mcp-registry-prod install_components: gke-gcloud-auth-plugin - - name: Get prod GKE credentials + - name: Get backup credentials from prod cluster + id: backup-creds run: | + # Connect to prod cluster to get backup credentials gcloud container clusters get-credentials mcp-registry-prod \ --zone=us-central1-b \ --project=mcp-registry-prod - - name: Dump production database + # Extract backup credentials from prod cluster + ACCESS_KEY=$(kubectl get secret k8up-backup-credentials -n default -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d) + SECRET_KEY=$(kubectl get secret k8up-backup-credentials -n default -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d) + + # Store in outputs (GitHub Actions encrypts these automatically) + echo "access_key=$ACCESS_KEY" >> $GITHUB_OUTPUT + echo "secret_key=$SECRET_KEY" >> $GITHUB_OUTPUT + + echo "✓ Backup credentials extracted from prod" + + - name: Switch to staging cluster + uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 + with: + credentials_json: ${{ secrets.GCP_STAGING_SERVICE_ACCOUNT_KEY }} + + - name: Configure staging cluster access + run: | + gcloud config set project mcp-registry-staging + gcloud container clusters get-credentials mcp-registry-staging \ + --zone=us-central1-b \ + --project=mcp-registry-staging + + echo "✓ Connected to staging cluster" + + - name: Create secret for prod backup bucket access + run: | + # Create/update secret in staging with read-only access to prod backups + kubectl create secret generic prod-backup-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${{ steps.backup-creds.outputs.access_key }}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${{ steps.backup-creds.outputs.secret_key }}" \ + --dry-run=client -o yaml | kubectl apply -f - + + echo "✓ Backup credentials configured in staging" + + - name: Create restore PVC run: | - # Create a job to dump the database from within the prod cluster kubectl apply -f - <> $GITHUB_OUTPUT + + # Create a k8up Restore resource to restore from 
prod backups + kubectl apply -f - </dev/null) + if [ -n "$JOB_NAME" ]; then + echo "Found restore job: $JOB_NAME" + break + fi + echo "Waiting for job to be created... ($i/30)" + sleep 2 + done - - name: Switch to staging cluster + if [ -z "$JOB_NAME" ]; then + echo "ERROR: Restore job not found" + kubectl get restore $RESTORE_NAME -n default -o yaml + exit 1 + fi + + # Wait for the restore job to complete (max 15 minutes) + kubectl wait --for=condition=complete \ + job/$JOB_NAME \ + --timeout=900s -n default || { + echo "Restore job failed or timed out" + kubectl describe job/$JOB_NAME -n default + kubectl logs job/$JOB_NAME -n default --tail=100 + exit 1 + } + + echo "✓ k8up restore completed successfully" + + - name: Find staging PostgreSQL PVC + id: pgdata-pvc run: | - gcloud config set project mcp-registry-staging - gcloud container clusters get-credentials mcp-registry-staging \ - --zone=us-central1-b \ - --project=mcp-registry-staging + # Find the PVC used by the PostgreSQL cluster + PVC_NAME=$(kubectl get pvc -n default -l cnpg.io/cluster=registry-pg -o jsonpath='{.items[0].metadata.name}') + + if [ -z "$PVC_NAME" ]; then + echo "ERROR: Could not find PostgreSQL PVC" + kubectl get pvc -n default -l cnpg.io/cluster=registry-pg + exit 1 + fi + + echo "pvc_name=$PVC_NAME" >> $GITHUB_OUTPUT + echo "✓ Found PostgreSQL PVC: $PVC_NAME" + + - name: Scale down staging PostgreSQL + run: | + echo "Scaling down PostgreSQL cluster..." + kubectl patch cluster registry-pg -n default \ + --type merge \ + --patch '{"spec":{"instances":0}}' + + # Wait for pods to terminate + echo "Waiting for pods to terminate..." + kubectl wait --for=delete pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s || true - - name: Restore to staging database + echo "✓ PostgreSQL scaled down" + + - name: Replace staging database with restored backup + id: copy-job run: | - # Create a configmap with the dump file - kubectl create configmap prod-backup --from-file=/tmp/prod-backup.dump -n default --dry-run=client -o yaml | kubectl apply -f - + JOB_NAME="copy-pgdata-$(date +%Y%m%d-%H%M%S)" + echo "job_name=$JOB_NAME" >> $GITHUB_OUTPUT - # Create a job to restore the database + # Create a job to copy the restored backup data to the staging PVC kubectl apply -f - </dev/null | head -20 - echo "Waiting for database to be ready..." - until pg_isready -h registry-pg-rw -U postgres; do - sleep 2 + # Try different possible paths for pgdata + PGDATA_SOURCE="" + for path in \$(find /restore -type d -name "pgdata" 2>/dev/null); do + if [ -f "\$path/PG_VERSION" ]; then + PGDATA_SOURCE="\$path" + break + fi done - echo "Terminating existing connections..." - PGPASSWORD=\$POSTGRES_PASSWORD psql -h registry-pg-rw -U postgres -d postgres \ - -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'app' AND pid <> pg_backend_pid();" || true - - echo "Dropping and recreating database..." - PGPASSWORD=\$POSTGRES_PASSWORD psql -h registry-pg-rw -U postgres -d postgres \ - -c "DROP DATABASE IF EXISTS app;" - PGPASSWORD=\$POSTGRES_PASSWORD psql -h registry-pg-rw -U postgres -d postgres \ - -c "CREATE DATABASE app;" - - echo "Restoring database from backup..." 
- PGPASSWORD=\$POSTGRES_PASSWORD pg_restore \ - -h registry-pg-rw \ - -U postgres \ - -d app \ - --verbose \ - --no-owner \ - --no-acl \ - /backup/prod-backup.dump - - echo "✓ Database restore completed successfully" - env: - - name: POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: registry-pg-superuser - key: password + if [ -z "\$PGDATA_SOURCE" ]; then + echo "ERROR: Could not find valid pgdata directory with PG_VERSION" + echo "Searched paths:" + find /restore -type d -name "pgdata" 2>/dev/null + exit 1 + fi + + echo "Found pgdata at: \$PGDATA_SOURCE" + echo "Contents:" + ls -lah \$PGDATA_SOURCE/ | head -10 + + echo "Backing up existing staging data..." + mkdir -p /pgdata-backup + if [ "\$(ls -A /pgdata)" ]; then + cp -a /pgdata/. /pgdata-backup/ || echo "Warning: Could not backup existing data" + fi + + echo "Clearing existing data..." + rm -rf /pgdata/* + + echo "Copying backup data to staging PVC..." + cp -a \$PGDATA_SOURCE/. /pgdata/ + + echo "Setting correct permissions..." + chmod 700 /pgdata + + echo "✓ Data copy completed" + ls -lah /pgdata/ | head -20 + echo "PostgreSQL version: \$(cat /pgdata/PG_VERSION)" volumeMounts: - - name: backup-data - mountPath: /backup + - name: restore-data + mountPath: /restore + - name: staging-pgdata + mountPath: /pgdata volumes: - - name: backup-data - configMap: - name: prod-backup + - name: restore-data + persistentVolumeClaim: + claimName: restore-data-pvc + - name: staging-pgdata + persistentVolumeClaim: + claimName: ${{ steps.pgdata-pvc.outputs.pvc_name }} EOF - # Wait for restore to complete - kubectl wait --for=condition=complete job -l job-name --timeout=600s -n default || { - echo "Restore job failed" - POD_NAME=$(kubectl get pods -l job-name -n default --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1].metadata.name}') - kubectl logs $POD_NAME -n default --tail=100 + echo "✓ Copy job created: $JOB_NAME" + + - name: Wait for data copy to complete + run: | + JOB_NAME="${{ steps.copy-job.outputs.job_name }}" + + # Wait for copy to complete + kubectl wait --for=condition=complete job/$JOB_NAME --timeout=600s -n default || { + echo "Data copy job failed" + kubectl describe job/$JOB_NAME -n default + kubectl logs job/$JOB_NAME -n default --tail=100 exit 1 } - echo "✓ Restore completed successfully" + echo "✓ Database data replaced successfully" + + - name: Scale up staging PostgreSQL + run: | + echo "Scaling up PostgreSQL cluster..." + kubectl patch cluster registry-pg -n default \ + --type merge \ + --patch '{"spec":{"instances":1}}' + + # Wait for PostgreSQL pod to be created + echo "Waiting for PostgreSQL pod to be created..." + for i in {1..60}; do + POD_COUNT=$(kubectl get pods -l cnpg.io/cluster=registry-pg -n default --no-headers 2>/dev/null | wc -l) + if [ "$POD_COUNT" -gt 0 ]; then + echo "Pod created" + break + fi + echo "Waiting... ($i/60)" + sleep 2 + done + + # Wait for PostgreSQL to be ready + echo "Waiting for PostgreSQL to be ready..." 
+ kubectl wait --for=condition=ready pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s + + echo "✓ PostgreSQL is running" - name: Verify staging DB is functional - if: always() run: | # Create a verification pod kubectl run pg-verify-$(date +%s) \ @@ -192,7 +319,23 @@ jobs: --rm -i --restart=Never \ --env="PGPASSWORD=$(kubectl get secret registry-pg-superuser -n default -o jsonpath='{.data.password}' | base64 -d)" \ -- bash -c ' - TABLE_COUNT=$(psql -h registry-pg-rw -U postgres -d app -tAc "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '\''public'\'';") + echo "Waiting for database to accept connections..." + for i in {1..30}; do + if pg_isready -h registry-pg-rw -U postgres 2>/dev/null; then + break + fi + echo "Waiting... ($i/30)" + sleep 2 + done + + echo "Querying database..." + TABLE_COUNT=$(psql -h registry-pg-rw -U postgres -d app -tAc "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '\''public'\'';" 2>&1) + + if [ $? -ne 0 ]; then + echo "ERROR: Could not query database" + echo "$TABLE_COUNT" + exit 1 + fi if [ "$TABLE_COUNT" -lt 1 ]; then echo "ERROR: Staging DB has no tables!" @@ -203,17 +346,29 @@ jobs: echo "Top 10 tables by row count:" psql -h registry-pg-rw -U postgres -d app \ - -c "SELECT schemaname, tablename, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;" + -c "SELECT schemaname, tablename, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;" || true ' + echo "✓ Database verification completed" + - name: Cleanup if: always() run: | - # Remove dump file - rm -f /tmp/prod-backup.dump + # Clean up jobs first + if [ -n "${{ steps.copy-job.outputs.job_name }}" ]; then + kubectl delete job ${{ steps.copy-job.outputs.job_name }} -n default || true + fi + + # Remove restore PVC (will wait for jobs to finish) + kubectl delete pvc restore-data-pvc -n default || true + + # Remove prod backup credentials (for security) + kubectl delete secret prod-backup-credentials -n default || true + + # Clean up old restore resources (keep last 3) + kubectl get restore -n default --sort-by=.metadata.creationTimestamp -o name | head -n -3 | xargs -r kubectl delete || true - # Remove configmap - kubectl delete configmap prod-backup -n default || true + # Clean up old copy jobs (keep last 3) + kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep 'copy-pgdata-' | head -n -3 | xargs -r kubectl delete -n default || true - # Clean up old jobs (keep last 3) - kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep -E 'pg-dump-|pg-restore-' | head -n -3 | xargs -r kubectl delete -n default || true + echo "✓ Cleanup completed" From d51f3eab6fe7b049d13f1673399cca436f5efc32 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 8 Oct 2025 15:32:08 +0300 Subject: [PATCH 3/6] Add additional checks to ensure we are in staging Signed-off-by: Radoslav Dimitrov --- .github/workflows/sync-db.yml | 111 +++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml index 7d7c7386..d4bbdfae 100644 --- a/.github/workflows/sync-db.yml +++ b/.github/workflows/sync-db.yml @@ -13,11 +13,24 @@ jobs: sync-database: name: Sync Prod DB to Staging from k8up Backups runs-on: ubuntu-latest - environment: staging + environment: staging # CRITICAL: This ensures we're targeting staging and requires staging environment approval concurrency: group: sync-staging-database 
cancel-in-progress: false steps: + - name: Initial safety banner + run: | + echo "╔════════════════════════════════════════════════════════════╗" + echo "║ ⚠️ SAFETY NOTICE ⚠️ ║" + echo "║ ║" + echo "║ This workflow will REPLACE staging database with ║" + echo "║ production backup data. This is IRREVERSIBLE. ║" + echo "║ ║" + echo "║ Target: STAGING ONLY ║" + echo "║ Source: Production backups (read-only) ║" + echo "║ ║" + echo "║ Multiple safety checks will verify we're in staging. ║" + echo "╚════════════════════════════════════════════════════════════╝" - name: Authenticate to Google Cloud (Production) uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 with: @@ -160,6 +173,51 @@ jobs: echo "✓ k8up restore completed successfully" + - name: Verify we are in staging cluster (SAFETY CHECK) + run: | + echo "=== SAFETY CHECK: Verifying cluster context ===" + + # Get current cluster context + CURRENT_CONTEXT=$(kubectl config current-context) + CURRENT_PROJECT=$(gcloud config get-value project) + + echo "Current kubectl context: $CURRENT_CONTEXT" + echo "Current GCP project: $CURRENT_PROJECT" + + # CRITICAL: Abort if we're in prod + if echo "$CURRENT_CONTEXT" | grep -qi "prod"; then + echo "❌ SAFETY CHECK FAILED: Currently connected to PRODUCTION cluster!" + echo "Context: $CURRENT_CONTEXT" + echo "ABORTING to prevent data loss" + exit 1 + fi + + if [ "$CURRENT_PROJECT" = "mcp-registry-prod" ]; then + echo "❌ SAFETY CHECK FAILED: Currently in PRODUCTION project!" + echo "Project: $CURRENT_PROJECT" + echo "ABORTING to prevent data loss" + exit 1 + fi + + # Verify we're in staging + if ! echo "$CURRENT_CONTEXT" | grep -qi "staging"; then + echo "❌ SAFETY CHECK FAILED: Not in staging cluster" + echo "Context: $CURRENT_CONTEXT" + echo "Expected: staging cluster" + exit 1 + fi + + if [ "$CURRENT_PROJECT" != "mcp-registry-staging" ]; then + echo "❌ SAFETY CHECK FAILED: Not in staging project" + echo "Project: $CURRENT_PROJECT" + echo "Expected: mcp-registry-staging" + exit 1 + fi + + echo "✅ SAFETY CHECK PASSED: Confirmed we are in STAGING" + echo " Context: $CURRENT_CONTEXT" + echo " Project: $CURRENT_PROJECT" + - name: Find staging PostgreSQL PVC id: pgdata-pvc run: | @@ -175,6 +233,11 @@ jobs: echo "pvc_name=$PVC_NAME" >> $GITHUB_OUTPUT echo "✓ Found PostgreSQL PVC: $PVC_NAME" + # Additional safety: Verify PVC is in staging by checking labels or annotations + PVC_INFO=$(kubectl get pvc $PVC_NAME -n default -o yaml) + echo "PVC Details:" + echo "$PVC_INFO" | grep -E "name:|namespace:|labels:" | head -10 + - name: Scale down staging PostgreSQL run: | echo "Scaling down PostgreSQL cluster..." @@ -188,6 +251,52 @@ jobs: echo "✓ PostgreSQL scaled down" + - name: Final safety check before data replacement + run: | + echo "=== FINAL SAFETY CHECK BEFORE DATA REPLACEMENT ===" + + # Re-verify we're in staging (paranoid check) + CURRENT_CONTEXT=$(kubectl config current-context) + CURRENT_PROJECT=$(gcloud config get-value project) + + if echo "$CURRENT_CONTEXT" | grep -qi "prod" || [ "$CURRENT_PROJECT" = "mcp-registry-prod" ]; then + echo "❌ FINAL SAFETY CHECK FAILED: Detected production environment!" 
+ echo "Context: $CURRENT_CONTEXT" + echo "Project: $CURRENT_PROJECT" + echo "ABORTING IMMEDIATELY" + exit 1 + fi + + # Verify the PVC we're about to modify + PVC_NAME="${{ steps.pgdata-pvc.outputs.pvc_name }}" + echo "About to modify PVC: $PVC_NAME" + + # Check if PVC has any production indicators + if echo "$PVC_NAME" | grep -qi "prod"; then + echo "❌ SAFETY CHECK FAILED: PVC name contains 'prod'" + echo "PVC: $PVC_NAME" + echo "This might be a production PVC. ABORTING." + exit 1 + fi + + # Verify PostgreSQL is scaled down (safety measure) + POD_COUNT=$(kubectl get pods -l cnpg.io/cluster=registry-pg -n default --no-headers 2>/dev/null | wc -l) + if [ "$POD_COUNT" -gt 0 ]; then + echo "❌ SAFETY CHECK FAILED: PostgreSQL pods are still running!" + echo "Expected 0 pods, found: $POD_COUNT" + echo "Database must be scaled down before data replacement" + exit 1 + fi + + echo "✅ FINAL SAFETY CHECK PASSED" + echo " Environment: STAGING" + echo " Context: $CURRENT_CONTEXT" + echo " Project: $CURRENT_PROJECT" + echo " Target PVC: $PVC_NAME" + echo " PostgreSQL pods: 0 (scaled down)" + echo "" + echo "Proceeding with data replacement..." + - name: Replace staging database with restored backup id: copy-job run: | From d4119e91f47963bd9d21995c4804a16392d6ad86 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 8 Oct 2025 15:35:08 +0300 Subject: [PATCH 4/6] Update wrong comment Signed-off-by: Radoslav Dimitrov --- .github/workflows/sync-db.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml index d4bbdfae..d629d41a 100644 --- a/.github/workflows/sync-db.yml +++ b/.github/workflows/sync-db.yml @@ -76,7 +76,7 @@ jobs: - name: Create secret for prod backup bucket access run: | - # Create/update secret in staging with read-only access to prod backups + # Create/update secret in staging with access to prod backups kubectl create secret generic prod-backup-credentials \ --from-literal=AWS_ACCESS_KEY_ID="${{ steps.backup-creds.outputs.access_key }}" \ --from-literal=AWS_SECRET_ACCESS_KEY="${{ steps.backup-creds.outputs.secret_key }}" \ From b94b056a7f4aeaea827742d4f053517ac4f7e5ef Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 8 Oct 2025 15:39:25 +0300 Subject: [PATCH 5/6] Log out from prod once we got the backup credentials Signed-off-by: Radoslav Dimitrov --- .github/workflows/sync-db.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml index d629d41a..4e0eecae 100644 --- a/.github/workflows/sync-db.yml +++ b/.github/workflows/sync-db.yml @@ -60,6 +60,35 @@ jobs: echo "✓ Backup credentials extracted from prod" + - name: Remove all production access (SAFETY MEASURE) + run: | + echo "=== REMOVING ALL PRODUCTION ACCESS ===" + + # Remove production cluster from kubeconfig + kubectl config delete-context gke_mcp-registry-prod_us-central1-b_mcp-registry-prod 2>/dev/null || true + + # Revoke gcloud credentials + gcloud auth revoke --all 2>/dev/null || true + + # Clear gcloud configuration + gcloud config unset project 2>/dev/null || true + gcloud config unset account 2>/dev/null || true + + # Verify no contexts contain "prod" + if kubectl config get-contexts | grep -i prod; then + echo "❌ ERROR: Production context still exists!" 
+ kubectl config get-contexts + exit 1 + fi + + echo "✅ Production access completely removed" + echo " - Kubeconfig cleared" + echo " - GCloud credentials revoked" + echo " - No production contexts remaining" + echo "" + echo "Remaining contexts:" + kubectl config get-contexts || echo "No contexts (expected)" + - name: Switch to staging cluster uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 with: From f39faa62a55c6bd27e5fbc27ec8a28d39cf57b29 Mon Sep 17 00:00:00 2001 From: Adam Jones Date: Thu, 9 Oct 2025 10:53:00 +0000 Subject: [PATCH 6/6] Cleanup workflow a bit --- .github/workflows/sync-db.yml | 178 ++++++---------------------------- 1 file changed, 27 insertions(+), 151 deletions(-) diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml index 4e0eecae..419bfee5 100644 --- a/.github/workflows/sync-db.yml +++ b/.github/workflows/sync-db.yml @@ -13,24 +13,11 @@ jobs: sync-database: name: Sync Prod DB to Staging from k8up Backups runs-on: ubuntu-latest - environment: staging # CRITICAL: This ensures we're targeting staging and requires staging environment approval + environment: staging concurrency: group: sync-staging-database cancel-in-progress: false steps: - - name: Initial safety banner - run: | - echo "╔════════════════════════════════════════════════════════════╗" - echo "║ ⚠️ SAFETY NOTICE ⚠️ ║" - echo "║ ║" - echo "║ This workflow will REPLACE staging database with ║" - echo "║ production backup data. This is IRREVERSIBLE. ║" - echo "║ ║" - echo "║ Target: STAGING ONLY ║" - echo "║ Source: Production backups (read-only) ║" - echo "║ ║" - echo "║ Multiple safety checks will verify we're in staging. ║" - echo "╚════════════════════════════════════════════════════════════╝" - name: Authenticate to Google Cloud (Production) uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 with: @@ -45,25 +32,18 @@ jobs: - name: Get backup credentials from prod cluster id: backup-creds run: | - # Connect to prod cluster to get backup credentials gcloud container clusters get-credentials mcp-registry-prod \ --zone=us-central1-b \ --project=mcp-registry-prod - # Extract backup credentials from prod cluster - ACCESS_KEY=$(kubectl get secret k8up-backup-credentials -n default -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d) - SECRET_KEY=$(kubectl get secret k8up-backup-credentials -n default -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d) - # Store in outputs (GitHub Actions encrypts these automatically) - echo "access_key=$ACCESS_KEY" >> $GITHUB_OUTPUT - echo "secret_key=$SECRET_KEY" >> $GITHUB_OUTPUT - - echo "✓ Backup credentials extracted from prod" + kubectl get secret k8up-backup-credentials -n default -o json | jq -r ' + "access_key=" + (.data.AWS_ACCESS_KEY_ID | @base64d), + "secret_key=" + (.data.AWS_SECRET_ACCESS_KEY | @base64d) + ' >> $GITHUB_OUTPUT - name: Remove all production access (SAFETY MEASURE) run: | - echo "=== REMOVING ALL PRODUCTION ACCESS ===" - # Remove production cluster from kubeconfig kubectl config delete-context gke_mcp-registry-prod_us-central1-b_mcp-registry-prod 2>/dev/null || true @@ -74,21 +54,14 @@ jobs: gcloud config unset project 2>/dev/null || true gcloud config unset account 2>/dev/null || true - # Verify no contexts contain "prod" - if kubectl config get-contexts | grep -i prod; then - echo "❌ ERROR: Production context still exists!" 
+ # Verify no contexts remain + CONTEXT_COUNT=$(kubectl config get-contexts -o name 2>/dev/null | wc -l) + if [ "$CONTEXT_COUNT" -gt 0 ]; then + echo "❌ ERROR: $CONTEXT_COUNT context(s) still exist after cleanup!" kubectl config get-contexts exit 1 fi - echo "✅ Production access completely removed" - echo " - Kubeconfig cleared" - echo " - GCloud credentials revoked" - echo " - No production contexts remaining" - echo "" - echo "Remaining contexts:" - kubectl config get-contexts || echo "No contexts (expected)" - - name: Switch to staging cluster uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 with: @@ -101,18 +74,14 @@ jobs: --zone=us-central1-b \ --project=mcp-registry-staging - echo "✓ Connected to staging cluster" - - name: Create secret for prod backup bucket access run: | # Create/update secret in staging with access to prod backups - kubectl create secret generic prod-backup-credentials \ + kubectl create secret generic prod-to-staging-sync-credentials \ --from-literal=AWS_ACCESS_KEY_ID="${{ steps.backup-creds.outputs.access_key }}" \ --from-literal=AWS_SECRET_ACCESS_KEY="${{ steps.backup-creds.outputs.secret_key }}" \ --dry-run=client -o yaml | kubectl apply -f - - echo "✓ Backup credentials configured in staging" - - name: Create restore PVC run: | kubectl apply -f - <> $GITHUB_OUTPUT - echo "✓ Found PostgreSQL PVC: $PVC_NAME" - - # Additional safety: Verify PVC is in staging by checking labels or annotations - PVC_INFO=$(kubectl get pvc $PVC_NAME -n default -o yaml) - echo "PVC Details:" - echo "$PVC_INFO" | grep -E "name:|namespace:|labels:" | head -10 - name: Scale down staging PostgreSQL run: | @@ -278,54 +190,30 @@ jobs: echo "Waiting for pods to terminate..." kubectl wait --for=delete pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s || true - echo "✓ PostgreSQL scaled down" - - - name: Final safety check before data replacement + - name: Verify we are in staging cluster (SAFETY CHECK) run: | - echo "=== FINAL SAFETY CHECK BEFORE DATA REPLACEMENT ===" - - # Re-verify we're in staging (paranoid check) + # Get current cluster context CURRENT_CONTEXT=$(kubectl config current-context) CURRENT_PROJECT=$(gcloud config get-value project) - if echo "$CURRENT_CONTEXT" | grep -qi "prod" || [ "$CURRENT_PROJECT" = "mcp-registry-prod" ]; then - echo "❌ FINAL SAFETY CHECK FAILED: Detected production environment!" - echo "Context: $CURRENT_CONTEXT" - echo "Project: $CURRENT_PROJECT" - echo "ABORTING IMMEDIATELY" - exit 1 - fi - - # Verify the PVC we're about to modify - PVC_NAME="${{ steps.pgdata-pvc.outputs.pvc_name }}" - echo "About to modify PVC: $PVC_NAME" + echo "Current kubectl context: $CURRENT_CONTEXT" + echo "Current GCP project: $CURRENT_PROJECT" - # Check if PVC has any production indicators - if echo "$PVC_NAME" | grep -qi "prod"; then - echo "❌ SAFETY CHECK FAILED: PVC name contains 'prod'" - echo "PVC: $PVC_NAME" - echo "This might be a production PVC. ABORTING." + # Verify we're in staging + if ! echo "$CURRENT_CONTEXT" | grep -qi "staging"; then + echo "❌ SAFETY CHECK FAILED: Not in staging cluster" + echo "Context: $CURRENT_CONTEXT" + echo "Expected: staging cluster" exit 1 fi - # Verify PostgreSQL is scaled down (safety measure) - POD_COUNT=$(kubectl get pods -l cnpg.io/cluster=registry-pg -n default --no-headers 2>/dev/null | wc -l) - if [ "$POD_COUNT" -gt 0 ]; then - echo "❌ SAFETY CHECK FAILED: PostgreSQL pods are still running!" 
- echo "Expected 0 pods, found: $POD_COUNT" - echo "Database must be scaled down before data replacement" + if [ "$CURRENT_PROJECT" != "mcp-registry-staging" ]; then + echo "❌ SAFETY CHECK FAILED: Not in staging project" + echo "Project: $CURRENT_PROJECT" + echo "Expected: mcp-registry-staging" exit 1 fi - echo "✅ FINAL SAFETY CHECK PASSED" - echo " Environment: STAGING" - echo " Context: $CURRENT_CONTEXT" - echo " Project: $CURRENT_PROJECT" - echo " Target PVC: $PVC_NAME" - echo " PostgreSQL pods: 0 (scaled down)" - echo "" - echo "Proceeding with data replacement..." - - name: Replace staging database with restored backup id: copy-job run: | @@ -391,7 +279,6 @@ jobs: echo "Setting correct permissions..." chmod 700 /pgdata - echo "✓ Data copy completed" ls -lah /pgdata/ | head -20 echo "PostgreSQL version: \$(cat /pgdata/PG_VERSION)" volumeMounts: @@ -408,8 +295,6 @@ jobs: claimName: ${{ steps.pgdata-pvc.outputs.pvc_name }} EOF - echo "✓ Copy job created: $JOB_NAME" - - name: Wait for data copy to complete run: | JOB_NAME="${{ steps.copy-job.outputs.job_name }}" @@ -422,8 +307,6 @@ jobs: exit 1 } - echo "✓ Database data replaced successfully" - - name: Scale up staging PostgreSQL run: | echo "Scaling up PostgreSQL cluster..." @@ -447,8 +330,6 @@ jobs: echo "Waiting for PostgreSQL to be ready..." kubectl wait --for=condition=ready pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s - echo "✓ PostgreSQL is running" - - name: Verify staging DB is functional run: | # Create a verification pod @@ -480,15 +361,12 @@ jobs: exit 1 fi - echo "✓ Staging DB is healthy with $TABLE_COUNT tables" - + echo "Staging DB has $TABLE_COUNT tables" echo "Top 10 tables by row count:" psql -h registry-pg-rw -U postgres -d app \ -c "SELECT schemaname, tablename, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;" || true ' - echo "✓ Database verification completed" - - name: Cleanup if: always() run: | @@ -501,12 +379,10 @@ jobs: kubectl delete pvc restore-data-pvc -n default || true # Remove prod backup credentials (for security) - kubectl delete secret prod-backup-credentials -n default || true + kubectl delete secret prod-to-staging-sync-credentials -n default || true # Clean up old restore resources (keep last 3) kubectl get restore -n default --sort-by=.metadata.creationTimestamp -o name | head -n -3 | xargs -r kubectl delete || true # Clean up old copy jobs (keep last 3) kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep 'copy-pgdata-' | head -n -3 | xargs -r kubectl delete -n default || true - - echo "✓ Cleanup completed"
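
A minimal sketch of how the finished series might be exercised once applied, assuming the six patches are saved locally as individual .patch files and the GitHub CLI is authenticated; the repository slug below is a placeholder:

    # Apply the series onto a local checkout, one commit per patch
    git am sync-db-*.patch
    git push origin main

    # Besides the nightly 02:00 UTC schedule, the workflow_dispatch hook
    # allows triggering the sync by hand once the workflow file is on the
    # default branch
    gh workflow run sync-db.yml --repo example-org/registry

    # Inspect recent runs of the sync workflow
    gh run list --workflow=sync-db.yml --repo example-org/registry

Because the job is pinned to the staging environment, a manually dispatched run will still wait for any approval rules configured on that environment before the sync starts.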