Skip to content

Trinity Agent Cron Cleanup #244

Trinity Agent Cron Cleanup

Trinity Agent Cron Cleanup #244

name: Trinity Agent Cron Cleanup

# Periodically looks for Railway services named "agent-<issue number>" whose
# latest deployment has been active for more than two hours, stops that
# deployment, and leaves a comment on the matching GitHub issue.
on:
  schedule:
    - cron: '0 */2 * * *'  # Every 2 hours
  workflow_dispatch: {}

jobs:
  cleanup-stuck:
    runs-on: ubuntu-latest
    # The job only makes a handful of HTTP calls; bound it so a hung request
    # cannot hold a runner until the default job timeout.
    timeout-minutes: 10
    permissions:
      contents: read
      issues: write
    steps:
      - name: Find and kill stuck agent deployments
        shell: bash
        env:
          RAILWAY_API_TOKEN: ${{ secrets.RAILWAY_API_TOKEN }}
          RAILWAY_PROJECT_ID: ${{ secrets.RAILWAY_PROJECT_ID }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Fail on errors, unset variables, and failures anywhere in a pipeline
          # so that a broken API call cannot be reported as a clean run.
          set -euo pipefail

          # NOTE(review): Railway's public API docs appear to use the
          # backboard.railway.com host for GraphQL - confirm this URL is right.
          RAILWAY_API_URL="https://railway.com/graphql/v2"
          MAX_AGE=7200  # 2 hours in seconds

          # railway_gql QUERY VARIABLES_JSON
          # Sends one GraphQL request and prints the JSON response on stdout.
          # Returns non-zero on transport/HTTP failure or when the response
          # body carries a GraphQL "errors" array (those arrive with HTTP 200).
          railway_gql() {
            local payload response
            # Build the body with jq so values are always valid, escaped JSON.
            payload=$(jq -n --arg query "$1" --argjson variables "$2" \
              '{query: $query, variables: $variables}') || return 1
            response=$(curl -fsS --max-time 30 -X POST "$RAILWAY_API_URL" \
              -H "Authorization: Bearer ${RAILWAY_API_TOKEN}" \
              -H "Content-Type: application/json" \
              -d "$payload") || return 1
            if ! jq -e '(.errors // []) | length == 0' <<<"$response" >/dev/null; then
              echo "::error::Railway API error: $(jq -c '.errors' <<<"$response" 2>/dev/null)" >&2
              return 1
            fi
            printf '%s\n' "$response"
          }

          echo "🧹 Checking for stuck agent containers..."

          # Get all services with their latest deployment's id, status and creation time
          LIST_QUERY='query($id: String!) { project(id: $id) { services { edges { node { id name createdAt deployments(first: 1) { edges { node { id status createdAt } } } } } } } }'
          PROJECT_VARS=$(jq -n --arg id "$RAILWAY_PROJECT_ID" '{id: $id}')
          SERVICES=$(railway_gql "$LIST_QUERY" "$PROJECT_VARS")

          # One tab-separated row per agent service. Tabs (not spaces) keep the
          # columns aligned even if a service name contains a space. Services
          # without any deployment get placeholder values and are skipped below.
          AGENTS=$(jq -r '
            .data.project.services.edges[].node
            | select(.name | startswith("agent-"))
            | (.deployments.edges[0].node // {}) as $d
            | [.id, .name, ($d.id // "-"), ($d.status // "UNKNOWN"), ($d.createdAt // "1970-01-01T00:00:00Z")]
            | @tsv' <<<"$SERVICES")

          NOW=$(date +%s)
          FAILED=0

          # The loop reads from a here-string rather than a pipe so that FAILED
          # set inside the loop is still visible after it.
          while IFS=$'\t' read -r SVC_ID SVC_NAME DEPLOY_ID DEPLOY_STATUS DEPLOY_CREATED; do
            # An empty result set still yields one blank line; ignore it.
            [ -n "$SVC_ID" ] || continue

            # Only care about active deployments; pick the mutation that ends each kind.
            case "$DEPLOY_STATUS" in
              SUCCESS)
                STOP_MUTATION="deploymentStop"
                ;;
              BUILDING|DEPLOYING)
                STOP_MUTATION="deploymentCancel"
                ;;
              *)
                echo "⏭️ ${SVC_NAME}: ${DEPLOY_STATUS} (inactive, skipping)"
                continue
                ;;
            esac

            # Parse creation time (ISO 8601 → epoch); GNU date first, BSD date as fallback.
            DEPLOY_EPOCH=$(date -d "${DEPLOY_CREATED}" +%s 2>/dev/null \
              || date -j -f "%Y-%m-%dT%H:%M:%S" "${DEPLOY_CREATED%%.*}" +%s 2>/dev/null \
              || true)
            # Never treat an unparseable timestamp as "very old": skip instead of killing.
            if ! [[ "$DEPLOY_EPOCH" =~ ^[0-9]+$ ]]; then
              echo "::warning::${SVC_NAME}: cannot parse deployment time '${DEPLOY_CREATED}', skipping"
              continue
            fi

            AGE=$((NOW - DEPLOY_EPOCH))
            if [ "$AGE" -le "$MAX_AGE" ]; then
              echo "✅ ${SVC_NAME}: OK (age: ${AGE}s)"
              continue
            fi

            echo "⚠️ Stuck container: ${SVC_NAME} (${DEPLOY_STATUS}, age: ${AGE}s > ${MAX_AGE}s)"
            echo "☠️ Killing stuck deployment for ${SVC_NAME}"
            DEPLOY_VARS=$(jq -n --arg id "$DEPLOY_ID" '{id: $id}')
            if ! railway_gql "mutation(\$id: String!) { ${STOP_MUTATION}(id: \$id) }" "$DEPLOY_VARS" >/dev/null; then
              echo "::error::Failed to stop ${SVC_NAME} (deployment ${DEPLOY_ID})"
              FAILED=1
              continue
            fi

            # Comment on the issue if the service name carries a numeric issue number.
            ISSUE_NUM="${SVC_NAME#agent-}"
            if [[ "$ISSUE_NUM" =~ ^[0-9]+$ ]]; then
              # Best effort: the issue may not exist, so a failure only warns.
              # The repository comes from the runner-provided GITHUB_REPOSITORY
              # variable instead of being templated into the script text.
              gh issue comment "${ISSUE_NUM}" \
                --repo "${GITHUB_REPOSITORY}" \
                --body "🧹 **Cron Cleanup**: Agent container stuck for $((AGE / 60)) minutes. Force-stopping." \
                || echo "::warning::Could not comment on issue #${ISSUE_NUM}"
            fi
          done <<<"$AGENTS"

          echo "🧹 Cleanup check complete"

          # Surface stop failures as a failed run instead of a silent success.
          if [ "$FAILED" -ne 0 ]; then
            exit 1
          fi