diff --git a/.github/workflows/sync-db.yml b/.github/workflows/sync-db.yml
new file mode 100644
index 00000000..419bfee5
--- /dev/null
+++ b/.github/workflows/sync-db.yml
@@ -0,0 +1,388 @@
+name: Sync Production DB to Staging (from backups)
+
+on:
+  schedule:
+    # Run daily at 2 AM UTC (during low-traffic hours)
+    - cron: '0 2 * * *'
+  workflow_dispatch: # Allow manual triggering
+
+permissions:
+  contents: read
+
+jobs:
+  sync-database:
+    name: Sync Prod DB to Staging from k8up Backups
+    runs-on: ubuntu-latest
+    environment: staging
+    concurrency:
+      group: sync-staging-database
+      cancel-in-progress: false
+    steps:
+      - name: Authenticate to Google Cloud (Production)
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
+        with:
+          credentials_json: ${{ secrets.GCP_PROD_SERVICE_ACCOUNT_KEY }}
+
+      - name: Setup Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
+        with:
+          project_id: mcp-registry-prod
+          install_components: gke-gcloud-auth-plugin
+
+      - name: Get backup credentials from prod cluster
+        id: backup-creds
+        run: |
+          gcloud container clusters get-credentials mcp-registry-prod \
+            --zone=us-central1-b \
+            --project=mcp-registry-prod
+
+          # Store in outputs (GitHub Actions encrypts these automatically)
+          kubectl get secret k8up-backup-credentials -n default -o json | jq -r '
+            "access_key=" + (.data.AWS_ACCESS_KEY_ID | @base64d),
+            "secret_key=" + (.data.AWS_SECRET_ACCESS_KEY | @base64d)
+          ' >> $GITHUB_OUTPUT
+
+      - name: Remove all production access (SAFETY MEASURE)
+        run: |
+          # Remove production cluster from kubeconfig
+          kubectl config delete-context gke_mcp-registry-prod_us-central1-b_mcp-registry-prod 2>/dev/null || true
+
+          # Revoke gcloud credentials
+          gcloud auth revoke --all 2>/dev/null || true
+
+          # Clear gcloud configuration
+          gcloud config unset project 2>/dev/null || true
+          gcloud config unset account 2>/dev/null || true
+
+          # Verify no contexts remain
+          CONTEXT_COUNT=$(kubectl config get-contexts -o name 2>/dev/null | wc -l)
+          if [ "$CONTEXT_COUNT" -gt 0 ]; then
+            echo "❌ ERROR: $CONTEXT_COUNT context(s) still exist after cleanup!"
+            kubectl config get-contexts
+            exit 1
+          fi
+
+      - name: Switch to staging cluster
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
+        with:
+          credentials_json: ${{ secrets.GCP_STAGING_SERVICE_ACCOUNT_KEY }}
+
+      - name: Configure staging cluster access
+        run: |
+          gcloud config set project mcp-registry-staging
+          gcloud container clusters get-credentials mcp-registry-staging \
+            --zone=us-central1-b \
+            --project=mcp-registry-staging
+
+      - name: Create secret for prod backup bucket access
+        run: |
+          # Create/update secret in staging with access to prod backups
+          kubectl create secret generic prod-to-staging-sync-credentials \
+            --from-literal=AWS_ACCESS_KEY_ID="${{ steps.backup-creds.outputs.access_key }}" \
+            --from-literal=AWS_SECRET_ACCESS_KEY="${{ steps.backup-creds.outputs.secret_key }}" \
+            --dry-run=client -o yaml | kubectl apply -f -
+
+      - name: Create restore PVC
+        run: |
+          kubectl apply -f - <> $GITHUB_OUTPUT
+
+          # Create a k8up Restore resource to restore from prod backups
+          kubectl apply -f - </dev/null)
+            if [ -n "$JOB_NAME" ]; then
+              echo "Found restore job: $JOB_NAME"
+              break
+            fi
+            echo "Waiting for job to be created... ($i/30)"
+            sleep 2
+          done
+
+          if [ -z "$JOB_NAME" ]; then
+            echo "ERROR: Restore job not found"
+            kubectl get restore $RESTORE_NAME -n default -o yaml
+            exit 1
+          fi
+
+          # Wait for the restore job to complete (max 15 minutes)
+          kubectl wait --for=condition=complete \
+            job/$JOB_NAME \
+            --timeout=900s -n default || {
+            echo "Restore job failed or timed out"
+            kubectl describe job/$JOB_NAME -n default
+            kubectl logs job/$JOB_NAME -n default --tail=100
+            exit 1
+          }
+
+      - name: Find staging PostgreSQL PVC
+        id: pgdata-pvc
+        run: |
+          # Find the PVC used by the PostgreSQL cluster
+          PVC_NAME=$(kubectl get pvc -n default -l cnpg.io/cluster=registry-pg -o jsonpath='{.items[0].metadata.name}')
+
+          if [ -z "$PVC_NAME" ]; then
+            echo "ERROR: Could not find PostgreSQL PVC"
+            kubectl get pvc -n default -l cnpg.io/cluster=registry-pg
+            exit 1
+          fi
+
+          echo "pvc_name=$PVC_NAME" >> $GITHUB_OUTPUT
+
+      - name: Scale down staging PostgreSQL
+        run: |
+          echo "Scaling down PostgreSQL cluster..."
+          kubectl patch cluster registry-pg -n default \
+            --type merge \
+            --patch '{"spec":{"instances":0}}'
+
+          # Wait for pods to terminate
+          echo "Waiting for pods to terminate..."
+          kubectl wait --for=delete pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s || true
+
+      - name: Verify we are in staging cluster (SAFETY CHECK)
+        run: |
+          # Get current cluster context
+          CURRENT_CONTEXT=$(kubectl config current-context)
+          CURRENT_PROJECT=$(gcloud config get-value project)
+
+          echo "Current kubectl context: $CURRENT_CONTEXT"
+          echo "Current GCP project: $CURRENT_PROJECT"
+
+          # Verify we're in staging
+          if ! echo "$CURRENT_CONTEXT" | grep -qi "staging"; then
+            echo "❌ SAFETY CHECK FAILED: Not in staging cluster"
+            echo "Context: $CURRENT_CONTEXT"
+            echo "Expected: staging cluster"
+            exit 1
+          fi
+
+          if [ "$CURRENT_PROJECT" != "mcp-registry-staging" ]; then
+            echo "❌ SAFETY CHECK FAILED: Not in staging project"
+            echo "Project: $CURRENT_PROJECT"
+            echo "Expected: mcp-registry-staging"
+            exit 1
+          fi
+
+      - name: Replace staging database with restored backup
+        id: copy-job
+        run: |
+          JOB_NAME="copy-pgdata-$(date +%Y%m%d-%H%M%S)"
+          echo "job_name=$JOB_NAME" >> $GITHUB_OUTPUT
+
+          # Create a job to copy the restored backup data to the staging PVC
+          kubectl apply -f - </dev/null | head -20
+
+              # Try different possible paths for pgdata
+              PGDATA_SOURCE=""
+              for path in \$(find /restore -type d -name "pgdata" 2>/dev/null); do
+                if [ -f "\$path/PG_VERSION" ]; then
+                  PGDATA_SOURCE="\$path"
+                  break
+                fi
+              done
+
+              if [ -z "\$PGDATA_SOURCE" ]; then
+                echo "ERROR: Could not find valid pgdata directory with PG_VERSION"
+                echo "Searched paths:"
+                find /restore -type d -name "pgdata" 2>/dev/null
+                exit 1
+              fi
+
+              echo "Found pgdata at: \$PGDATA_SOURCE"
+              echo "Contents:"
+              ls -lah \$PGDATA_SOURCE/ | head -10
+
+              echo "Backing up existing staging data..."
+              mkdir -p /pgdata-backup
+              if [ "\$(ls -A /pgdata)" ]; then
+                cp -a /pgdata/. /pgdata-backup/ || echo "Warning: Could not backup existing data"
+              fi
+
+              echo "Clearing existing data..."
+              rm -rf /pgdata/*
+
+              echo "Copying backup data to staging PVC..."
+              cp -a \$PGDATA_SOURCE/. /pgdata/
+
+              echo "Setting correct permissions..."
+              chmod 700 /pgdata
+
+              ls -lah /pgdata/ | head -20
+              echo "PostgreSQL version: \$(cat /pgdata/PG_VERSION)"
+            volumeMounts:
+            - name: restore-data
+              mountPath: /restore
+            - name: staging-pgdata
+              mountPath: /pgdata
+          volumes:
+          - name: restore-data
+            persistentVolumeClaim:
+              claimName: restore-data-pvc
+          - name: staging-pgdata
+            persistentVolumeClaim:
+              claimName: ${{ steps.pgdata-pvc.outputs.pvc_name }}
+          EOF
+
+      - name: Wait for data copy to complete
+        run: |
+          JOB_NAME="${{ steps.copy-job.outputs.job_name }}"
+
+          # Wait for copy to complete
+          kubectl wait --for=condition=complete job/$JOB_NAME --timeout=600s -n default || {
+            echo "Data copy job failed"
+            kubectl describe job/$JOB_NAME -n default
+            kubectl logs job/$JOB_NAME -n default --tail=100
+            exit 1
+          }
+
+      - name: Scale up staging PostgreSQL
+        run: |
+          echo "Scaling up PostgreSQL cluster..."
+          kubectl patch cluster registry-pg -n default \
+            --type merge \
+            --patch '{"spec":{"instances":1}}'
+
+          # Wait for PostgreSQL pod to be created
+          echo "Waiting for PostgreSQL pod to be created..."
+          for i in {1..60}; do
+            POD_COUNT=$(kubectl get pods -l cnpg.io/cluster=registry-pg -n default --no-headers 2>/dev/null | wc -l)
+            if [ "$POD_COUNT" -gt 0 ]; then
+              echo "Pod created"
+              break
+            fi
+            echo "Waiting... ($i/60)"
+            sleep 2
+          done
+
+          # Wait for PostgreSQL to be ready
+          echo "Waiting for PostgreSQL to be ready..."
+          kubectl wait --for=condition=ready pod -l cnpg.io/cluster=registry-pg -n default --timeout=300s
+
+      - name: Verify staging DB is functional
+        run: |
+          # Create a verification pod
+          kubectl run pg-verify-$(date +%s) \
+            --image=postgres:15 \
+            --rm -i --restart=Never \
+            --env="PGPASSWORD=$(kubectl get secret registry-pg-superuser -n default -o jsonpath='{.data.password}' | base64 -d)" \
+            -- bash -c '
+              echo "Waiting for database to accept connections..."
+              for i in {1..30}; do
+                if pg_isready -h registry-pg-rw -U postgres 2>/dev/null; then
+                  break
+                fi
+                echo "Waiting... ($i/30)"
+                sleep 2
+              done
+
+              echo "Querying database..."
+              TABLE_COUNT=$(psql -h registry-pg-rw -U postgres -d app -tAc "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '\''public'\'';" 2>&1)
+
+              if [ $? -ne 0 ]; then
+                echo "ERROR: Could not query database"
+                echo "$TABLE_COUNT"
+                exit 1
+              fi
+
+              if [ "$TABLE_COUNT" -lt 1 ]; then
+                echo "ERROR: Staging DB has no tables!"
+                exit 1
+              fi
+
+              echo "Staging DB has $TABLE_COUNT tables"
+              echo "Top 10 tables by row count:"
+              psql -h registry-pg-rw -U postgres -d app \
+                -c "SELECT schemaname, tablename, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;" || true
+            '
+
+      - name: Cleanup
+        if: always()
+        run: |
+          # Clean up jobs first
+          if [ -n "${{ steps.copy-job.outputs.job_name }}" ]; then
+            kubectl delete job ${{ steps.copy-job.outputs.job_name }} -n default || true
+          fi
+
+          # Remove restore PVC (will wait for jobs to finish)
+          kubectl delete pvc restore-data-pvc -n default || true
+
+          # Remove prod backup credentials (for security)
+          kubectl delete secret prod-to-staging-sync-credentials -n default || true
+
+          # Clean up old restore resources (keep last 3)
+          kubectl get restore -n default --sort-by=.metadata.creationTimestamp -o name | head -n -3 | xargs -r kubectl delete || true
+
+          # Clean up old copy jobs (keep last 3)
+          kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep 'copy-pgdata-' | head -n -3 | xargs -r kubectl delete -n default || true
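
Note: the heredoc bodies for the restore PVC and the k8up Restore object are collapsed in the diff above (the `kubectl apply -f - <...>` fragments), so the exact manifests are not shown. For orientation only, a k8up v2 (API group k8up.io/v1) Restore that unpacks a restic snapshot onto the restore-data-pvc created earlier generally looks like the sketch below; the endpoint, bucket, and repo-password secret name are illustrative placeholders and are not taken from this workflow.

  apiVersion: k8up.io/v1
  kind: Restore
  metadata:
    name: restore-from-prod               # hypothetical name, e.g. restore-YYYYMMDD-HHMMSS
    namespace: default
  spec:
    restoreMethod:
      folder:
        claimName: restore-data-pvc       # PVC the snapshot is unpacked into
    backend:
      repoPasswordSecretRef:              # placeholder: restic repository password secret
        name: backup-repo-password
        key: password
      s3:
        endpoint: https://storage.googleapis.com   # placeholder S3-compatible endpoint
        bucket: mcp-registry-prod-backups          # placeholder bucket name
        accessKeyIDSecretRef:
          name: prod-to-staging-sync-credentials
          key: AWS_ACCESS_KEY_ID
        secretAccessKeySecretRef:
          name: prod-to-staging-sync-credentials
          key: AWS_SECRET_ACCESS_KEY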