Trinity Agent Cron Cleanup #244
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Trinity Agent Cron Cleanup
#
# Every two hours (or on manual dispatch) this workflow lists the services of
# a Railway project, picks those whose name starts with "agent-", and reports
# any whose first listed deployment is active (SUCCESS/BUILDING/DEPLOYING) and
# was created more than MAX_AGE seconds ago. When the part of the service name
# after "agent-" is purely numeric it is treated as an issue number and a
# comment is posted on that issue.
#
# Required secrets: RAILWAY_API_TOKEN, RAILWAY_PROJECT_ID.
name: Trinity Agent Cron Cleanup

on:
  schedule:
    - cron: '0 */2 * * *' # Every 2 hours
  workflow_dispatch: {}

jobs:
  cleanup-stuck:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: write # needed for `gh issue comment`
    steps:
      - name: Find and kill stuck agent deployments
        shell: bash
        env:
          RAILWAY_API_TOKEN: ${{ secrets.RAILWAY_API_TOKEN }}
          RAILWAY_PROJECT_ID: ${{ secrets.RAILWAY_PROJECT_ID }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # pipefail: a jq failure in the pipeline below must fail the step
          # instead of being masked by the exit status of the while loop.
          set -euo pipefail

          : "${RAILWAY_API_TOKEN:?RAILWAY_API_TOKEN secret is not set}"
          : "${RAILWAY_PROJECT_ID:?RAILWAY_PROJECT_ID secret is not set}"

          echo "🧹 Checking for stuck agent containers..."

          # Get all services with their deployment status and creation time.
          # The request body is built with jq so the project id is always
          # JSON-escaped correctly.
          QUERY='query($id: String!) { project(id: $id) { services { edges { node { id name createdAt deployments(first: 1) { edges { node { status createdAt } } } } } } } }'
          PAYLOAD=$(jq -n --arg query "$QUERY" --arg id "$RAILWAY_PROJECT_ID" \
            '{query: $query, variables: {id: $id}}')

          # -f turns HTTP errors into a non-zero exit; -S still prints the reason.
          SERVICES=$(curl -fsS -X POST "https://railway.com/graphql/v2" \
            -H "Authorization: Bearer ${RAILWAY_API_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "$PAYLOAD")

          # GraphQL reports errors in the response body, so check that the
          # payload has the expected shape before iterating over it.
          if ! jq -e '.data.project.services.edges | type == "array"' <<<"$SERVICES" >/dev/null 2>&1; then
            echo "::error::Unexpected response from Railway API"
            jq -c '.errors // "no error details"' <<<"$SERVICES" >&2 || true
            exit 1
          fi

          # Find agent services running longer than 2 hours
          NOW=$(date +%s)
          MAX_AGE=7200 # 2 hours in seconds

          # Emit one tab-separated record per agent service so that names
          # containing spaces do not shift the remaining fields. The possibly
          # empty field (createdAt) is kept last because `read` collapses
          # consecutive tabs.
          jq -r '
            .data.project.services.edges[].node
            | select((.name // "") | startswith("agent-"))
            | [ .id,
                .name,
                (.deployments.edges[0].node.status // "UNKNOWN"),
                (.deployments.edges[0].node.createdAt // "")
              ]
            | @tsv
          ' <<<"$SERVICES" |
          while IFS=$'\t' read -r SVC_ID SVC_NAME DEPLOY_STATUS DEPLOY_CREATED; do
            # Only care about active deployments
            case "$DEPLOY_STATUS" in
              SUCCESS|BUILDING|DEPLOYING)
                # Parse creation time (ISO 8601 → epoch). GNU date first, BSD
                # date as fallback. An empty or unparsable timestamp is skipped
                # rather than treated as epoch 0, which would make the
                # deployment look decades old and be reported as stuck.
                DEPLOY_EPOCH=""
                if [[ -n "$DEPLOY_CREATED" ]]; then
                  DEPLOY_EPOCH=$(date -d "${DEPLOY_CREATED}" +%s 2>/dev/null \
                    || date -j -u -f "%Y-%m-%dT%H:%M:%S" "${DEPLOY_CREATED%%[.Z]*}" +%s 2>/dev/null \
                    || true)
                fi
                if [[ ! "$DEPLOY_EPOCH" =~ ^[0-9]+$ ]]; then
                  echo "⚠️ ${SVC_NAME}: cannot parse deployment time '${DEPLOY_CREATED}', skipping" >&2
                  continue
                fi

                AGE=$((NOW - DEPLOY_EPOCH))
                if (( AGE > MAX_AGE )); then
                  ISSUE_NUM=${SVC_NAME#agent-}
                  echo "⚠️ Stuck container: ${SVC_NAME} (${DEPLOY_STATUS}, age: ${AGE}s > ${MAX_AGE}s)"
                  # TODO(review): nothing below actually stops or redeploys the
                  # deployment — this step only logs and comments. The Railway
                  # call that performs the kill (using SVC_ID) is still missing.
                  echo "☠️ Killing stuck deployment for ${SVC_NAME}"
                  # Comment on issue if it exists (best effort: a failure to
                  # comment must not abort the scan of the remaining services).
                  if [[ "$ISSUE_NUM" =~ ^[0-9]+$ ]]; then
                    gh issue comment "${ISSUE_NUM}" \
                      --repo "${GITHUB_REPOSITORY}" \
                      --body "🧹 **Cron Cleanup**: Agent container stuck for $((AGE / 60)) minutes. Force-stopping." \
                      || echo "warning: could not comment on issue #${ISSUE_NUM}" >&2
                  fi
                else
                  echo "✅ ${SVC_NAME}: OK (age: ${AGE}s)"
                fi
                ;;
              *)
                echo "⏭️ ${SVC_NAME}: ${DEPLOY_STATUS} (inactive, skipping)"
                ;;
            esac
          done

          echo "🧹 Cleanup check complete"