Skip to content

Commit cc5f77a

Browse files
authored
healthcheck runner checkup (#376)
The healthcheck function only retrieved a runner's state from GitHub when `runner.id` was defined. The problem is that, unless the runner had started a job, we never retrieved its id, so an idle runner could never get an updated status and was forever considered offline by the runner manager. To fix this issue, we now register runners through a different endpoint (just-in-time runner configuration) instead of creating a registration token, which allows us to specify the full runner configuration and to retrieve the id that GitHub assigns to the runner. Thanks to this change, we will always have a `runner.id` to work with.
2 parents bedca9b + 0b08557 commit cc5f77a

File tree

12 files changed

+107
-113
lines changed

12 files changed

+107
-113
lines changed

images/runner/entrypoint.sh

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,9 @@
22

33
echo "Starting runner..."
44
echo "RUNNER_ORG: ${RUNNER_ORG}"
5-
echo "RUNNER_TOKEN: ${RUNNER_TOKEN}"
5+
echo "RUNNER_JIT_CONFIG: ${RUNNER_JIT_CONFIG}"
66
echo "RUNNER_NAME: ${RUNNER_NAME}"
77
echo "RUNNER_LABELS: ${RUNNER_LABELS}"
88
echo "RUNNER_GROUP: ${RUNNER_GROUP}"
99

10-
11-
./config.sh --url https://github.com/${RUNNER_ORG} \
12-
--token "${RUNNER_TOKEN}" \
13-
--name "${RUNNER_NAME}" \
14-
--labels "${RUNNER_LABELS}" \
15-
--runnergroup "${RUNNER_GROUP}" \
16-
--work _work \
17-
--replace \
18-
--unattended \
19-
--ephemeral
20-
21-
./run.sh
10+
./run.sh --jitconfig "${RUNNER_JIT_CONFIG}"

runner_manager/backend/gcloud.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import re
23
import time
34
from typing import Dict, List, Literal
45

@@ -178,27 +179,31 @@ def list(self) -> List[Runner]:
178179
raise e
179180
return runners
180181

182+
def _sanitize_label_value(self, value: str) -> str:
183+
value = value[:63]
184+
value = value.lower()
185+
value = re.sub(r"[^a-z0-9_-]", "-", value)
186+
return value
187+
181188
def update(self, runner: Runner) -> Runner:
182189
try:
183190
instance: Instance = self.client.get(
184191
project=self.config.project_id,
185192
zone=self.config.zone,
186-
instance=runner.instance_id,
187-
)
188-
if instance.status != "RUNNING":
189-
raise Exception(f"Instance {instance.name} is not running.")
190-
labels = {}
191-
if runner.labels is not None:
192-
labels = {label.name: label.name for label in runner.labels}
193-
instance.labels = labels
194-
ext_operation: ExtendedOperation = self.client.update(
195-
instance_resource=instance
193+
instance=runner.instance_id or runner.name,
196194
)
197-
self.wait_for_operation(
195+
instance.labels["status"] = self._sanitize_label_value(runner.status)
196+
instance.labels["busy"] = self._sanitize_label_value(str(runner.busy))
197+
198+
log.info(f"Updating {runner.name} labels to {instance.labels}")
199+
self.client.update(
200+
instance=runner.instance_id or runner.name,
198201
project=self.config.project_id,
199202
zone=self.config.zone,
200-
operation=ext_operation.name,
203+
instance_resource=instance,
201204
)
205+
log.info(f"Updated {runner.name} labels to {instance.labels}")
202206
except Exception as e:
207+
super().update(runner)
203208
raise e
204209
return super().update(runner)

runner_manager/bin/startup.sh

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ RUNNER_NAME=${RUNNER_NAME:-$(hostname)}
2020
RUNNER_ORG=${RUNNER_ORG:-"org"}
2121
RUNNER_LABELS=${RUNNER_LABELS:-"runner"}
2222
RUNNER_TOKEN=${RUNNER_TOKEN:-"token"}
23+
RUNNER_JIT_CONFIG=${RUNNER_JIT_CONFIG:-""}
2324
RUNNER_GROUP=${RUNNER_GROUP:-"default"}
2425
RUNNER_WORKDIR=${RUNNER_WORKDIR:-"_work"}
2526
RUNNER_DOWNLOAD_URL=${RUNNER_DOWNLOAD_URL:-"https://github.com/actions/runner/releases/download/v2.308.0/actions-runner-linux-x64-2.308.0.tar.gz"}
@@ -131,7 +132,7 @@ Description={{Description}}
131132
After=network.target
132133
133134
[Service]
134-
ExecStart=/bin/bash {{RunnerRoot}}/runsvc.sh
135+
ExecStart=/bin/bash {{RunnerRoot}}/runsvc.sh --jitconfig \"${RUNNER_JIT_CONFIG}\"
135136
User={{User}}
136137
WorkingDirectory={{RunnerRoot}}
137138
KillMode=process
@@ -142,20 +143,11 @@ TimeoutStopSec=5min
142143
WantedBy=multi-user.target" >/home/actions/actions-runner/bin/actions.runner.service.template
143144

144145
sudo chown -Rh actions:actions /home/actions/actions-runner
145-
sudo -u actions /home/actions/actions-runner/config.sh \
146-
--url "https://github.com/${RUNNER_ORG}" \
147-
--token "${RUNNER_TOKEN}" \
148-
--name "${RUNNER_NAME}" \
149-
--work "${RUNNER_WORKDIR}" \
150-
--labels "${RUNNER_LABELS}" \
151-
--runnergroup "${RUNNER_GROUP}" \
152-
--replace \
153-
--unattended \
154-
--ephemeral
155146

156147
if command -v systemctl; then
157-
sudo ./svc.sh install
158-
sudo ./svc.sh start
148+
sudo -H -u actions bash -c 'cd /home/actions/actions-runner &&
149+
sudo ./svc.sh install &&
150+
sudo ./svc.sh start'
159151
else
160-
nohup /home/actions/actions-runner/run.sh 2>/home/actions/actions-runner/logs &
152+
nohup /home/actions/actions-runner/run.sh --jitconfig "${RUNNER_JIT_CONFIG}" 2>/home/actions/actions-runner/logs &
161153
fi

runner_manager/jobs/workflow_job.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from __future__ import annotations
22

3-
from githubkit import Response
4-
from githubkit.rest.models import AuthenticationToken
53
from githubkit.webhooks.models import (
64
WorkflowJobCompleted,
75
WorkflowJobInProgress,
@@ -51,11 +49,6 @@ def queued(webhook: WorkflowJobQueued) -> str | None:
5149
log.info(f"Found runner group {runner_group.name}")
5250
log.info(f"Creating registration token for runner {runner_group.name}")
5351
github: GitHub = get_github()
54-
org = runner_group.organization
55-
token_response: Response[
56-
AuthenticationToken
57-
] = github.rest.actions.create_registration_token_for_org(org=org)
58-
token: AuthenticationToken = token_response.parsed_data
5952
log.info("Registration token created.")
60-
runner: Runner = runner_group.create_runner(token)
61-
return runner.pk
53+
runner: Runner | None = runner_group.create_runner(github)
54+
return runner.pk if runner else None

runner_manager/models/backend.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class RunnerEnv(BaseModel):
3333

3434
RUNNER_NAME: Optional[str] = None
3535
RUNNER_LABELS: Optional[str] = None
36-
RUNNER_TOKEN: Optional[str] = None
36+
RUNNER_JIT_CONFIG: Optional[str] = None
3737
RUNNER_ORG: Optional[str] = None
3838
RUNNER_GROUP: Optional[str] = None
3939

@@ -46,7 +46,7 @@ def runner_env(self, runner: Runner) -> RunnerEnv:
4646
return RunnerEnv(
4747
RUNNER_NAME=runner.name,
4848
RUNNER_LABELS=",".join([label.name for label in runner.labels]),
49-
RUNNER_TOKEN=runner.token,
49+
RUNNER_JIT_CONFIG=runner.encoded_jit_config,
5050
RUNNER_ORG=runner.organization,
5151
RUNNER_GROUP=runner.runner_group_name,
5252
)

runner_manager/models/runner.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55
import redis
66
from githubkit.rest.models import Runner as GitHubRunner
7+
from githubkit.rest.types import OrgsOrgActionsRunnersGenerateJitconfigPostBodyType
78
from githubkit.webhooks.types import WorkflowJobEvent
89
from pydantic import BaseModel as PydanticBaseModel
910
from redis_om import Field, NotFoundError
1011

12+
from runner_manager.clients.github import GitHub
1113
from runner_manager.logging import log
1214
from runner_manager.models.base import BaseModel
1315

@@ -57,6 +59,7 @@ class Runner(BaseModel):
5759
default=None,
5860
)
5961
token: Optional[str] = None
62+
encoded_jit_config: Optional[str] = None
6063
backend: Optional[str] = Field(index=True, description="Backend type")
6164
status: RunnerStatus = Field(
6265
default=RunnerStatus.offline, index=True, full_text_search=True
@@ -143,12 +146,34 @@ def time_to_start_expired(self, timeout: timedelta) -> bool:
143146
def time_to_live_expired(self, time_to_live: timedelta) -> bool:
144147
return self.is_online and self.time_since_started > time_to_live
145148

146-
def update_status(self, github_runner: GitHubRunner):
147-
self.status = RunnerStatus(github_runner.status)
148-
self.busy = github_runner.busy
149+
def update_from_github(self, github: GitHub) -> "Runner":
150+
if self.id is not None:
151+
github_runner: GitHubRunner = (
152+
github.rest.actions.get_self_hosted_runner_for_org(
153+
org=self.organization, runner_id=self.id
154+
).parsed_data
155+
)
156+
self.status = RunnerStatus(self.status)
157+
self.busy = github_runner.busy
149158
log.info(f"Runner {self.name} status updated to {self.status}")
150159
return self.save()
151160

161+
def generate_jit_config(self, github: GitHub) -> "Runner":
162+
"""Generate JIT config for the runner"""
163+
assert self.organization is not None, "Organization name is required"
164+
assert self.runner_group_id is not None, "Runner group id is required"
165+
jitconfig = github.rest.actions.generate_runner_jitconfig_for_org(
166+
org=self.organization,
167+
data=OrgsOrgActionsRunnersGenerateJitconfigPostBodyType(
168+
name=self.name,
169+
runner_group_id=self.runner_group_id,
170+
labels=[label.name for label in self.labels],
171+
),
172+
).parsed_data
173+
self.id = jitconfig.runner.id
174+
self.encoded_jit_config = jitconfig.encoded_jit_config
175+
return self.save()
176+
152177
def save(
153178
self,
154179
pipeline: Optional[redis.client.Pipeline] = None,

runner_manager/models/runner_group.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import redis
88
from githubkit import Response
99
from githubkit.exception import RequestFailed
10-
from githubkit.rest.models import AuthenticationToken
11-
from githubkit.rest.models import Runner as GitHubRunner
1210
from githubkit.webhooks.models import WorkflowJobInProgress
1311
from githubkit.webhooks.types import WorkflowJobEvent
1412
from pydantic import BaseModel as PydanticBaseModel
@@ -110,27 +108,28 @@ def get_runners(self) -> List[Runner]:
110108
pass
111109
return runners
112110

113-
def create_runner(self, token: AuthenticationToken) -> Runner | None:
111+
def create_runner(self, github: GitHub) -> Runner | None:
114112
"""Create a runner instance.
115113
116114
Returns:
117115
Runner: Runner instance.
118116
"""
119117
count = len(self.get_runners())
120-
if count < self.max:
118+
if count < self.max and self.id:
121119
runner: Runner = Runner(
122120
name=self.generate_runner_name(),
123121
organization=self.organization,
124122
status=RunnerStatus.offline,
125-
token=token.token,
126123
busy=False,
127124
runner_group_id=self.id,
128125
created_at=datetime.now(),
129126
runner_group_name=self.name,
130127
labels=self.runner_labels,
131128
manager=self.manager,
132129
)
133-
runner = runner.save()
130+
runner.save()
131+
runner.generate_jit_config(github)
132+
134133
return self.backend.create(runner)
135134
return None
136135

@@ -160,7 +159,10 @@ def delete_runner(self, runner: Runner) -> int:
160159

161160
@property
162161
def need_new_runner(self) -> bool:
163-
return len(self.get_runners()) < (self.min or 0)
162+
runners = self.get_runners()
163+
idle = len([runner for runner in runners if runner.busy is False])
164+
count = len(runners)
165+
return idle < self.min and count < self.max
164166

165167
def create_github_group(self, github: GitHub) -> GitHubRunnerGroup:
166168
"""Create a GitHub runner group."""
@@ -243,26 +245,13 @@ def healthcheck(
243245
"""Healthcheck runner group."""
244246
runners = self.get_runners()
245247
for runner in runners:
246-
247-
if runner.id is not None:
248-
github_runner: GitHubRunner = (
249-
github.rest.actions.get_self_hosted_runner_for_org(
250-
self.organization, runner.id
251-
).parsed_data
252-
)
253-
runner = runner.update_status(github_runner)
248+
runner.update_from_github(github)
254249
if runner.time_to_live_expired(time_to_live):
255250
self.delete_runner(runner)
256251
if runner.time_to_start_expired(timeout_runner):
257252
self.delete_runner(runner)
258253
while self.need_new_runner:
259-
token_response: Response[
260-
AuthenticationToken
261-
] = github.rest.actions.create_registration_token_for_org(
262-
org=self.organization
263-
)
264-
token: AuthenticationToken = token_response.parsed_data
265-
runner: Runner = self.create_runner(token)
254+
runner: Runner = self.create_runner(github)
266255
if runner:
267256
log.info(f"Runner {runner.name} created")
268257

tests/unit/conftest.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
from datetime import timedelta
22

33
import httpx
4-
from githubkit import Response
54
from githubkit.config import Config
6-
from githubkit.rest.models import AuthenticationToken
75
from hypothesis import HealthCheck
86
from hypothesis import settings as hypothesis_settings
97
from pytest import fixture
108

11-
from runner_manager import Runner, RunnerGroup
9+
from runner_manager import Runner
1210
from runner_manager.clients.github import GitHub
1311
from runner_manager.models.runner import RunnerLabel
1412

@@ -56,13 +54,3 @@ def runner(settings) -> Runner:
5654
assert runner.Meta.global_key_prefix == settings.name
5755
Runner.delete_many(Runner.find().all())
5856
return runner
59-
60-
61-
@fixture()
62-
def runner_token(runner_group: RunnerGroup, github: GitHub) -> AuthenticationToken:
63-
token: Response[
64-
AuthenticationToken
65-
] = github.rest.actions.create_registration_token_for_org(
66-
org=runner_group.organization
67-
)
68-
return token.parsed_data

tests/unit/jobs/test_healthchecks.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from datetime import datetime, timedelta
22

3-
from githubkit.rest.models import AuthenticationToken
43
from hypothesis import given, settings
54
from hypothesis import strategies as st
65
from redis_om import Migrator
@@ -44,16 +43,16 @@ def test_healthchecks_hypothesis(
4443

4544

4645
def test_group_healthcheck(
47-
runner_group: RunnerGroup, settings: Settings, github: GitHub, runner_token
46+
runner_group: RunnerGroup, settings: Settings, github: GitHub
4847
):
4948
runner_group.save(github=github)
50-
runner_tts: Runner = runner_group.create_runner(runner_token)
49+
runner_tts: Runner = runner_group.create_runner(github)
5150
assert runner_tts is not None
5251
runner_tts.created_at = datetime.now() - (
5352
settings.timeout_runner + timedelta(minutes=1)
5453
)
5554
runner_tts.save()
56-
runner_ttl: Runner = runner_group.create_runner(runner_token)
55+
runner_ttl: Runner = runner_group.create_runner(github)
5756
assert runner_ttl is not None
5857
runner_ttl.status = RunnerStatus.online
5958
runner_ttl.started_at = datetime.now() - (
@@ -100,17 +99,17 @@ def test_time_to_live(runner: Runner, settings: Settings):
10099
assert runner.time_to_live_expired(settings.time_to_live) is False
101100

102101

103-
def test_need_new_runner(runner_group: RunnerGroup, runner_token: AuthenticationToken):
102+
def test_need_new_runner(runner_group: RunnerGroup, github: GitHub):
104103
runner_group.max = 2
105104
runner_group.min = 1
106105
runner_group.save()
107106
assert runner_group.need_new_runner is True
108-
runner_group.create_runner(runner_token)
107+
runner_group.create_runner(github)
109108
assert runner_group.need_new_runner is False
110109

111110

112111
def test_healthcheck_job(
113-
runner_group: RunnerGroup, settings: Settings, queue: Queue, runner_token
112+
runner_group: RunnerGroup, settings: Settings, queue: Queue, github: GitHub
114113
):
115114
runner_group.save()
116115
queue.enqueue(
@@ -120,7 +119,7 @@ def test_healthcheck_job(
120119
settings.timeout_runner,
121120
)
122121
assert len(runner_group.get_runners()) == 0
123-
runner_group.create_runner(runner_token)
122+
runner_group.create_runner(github)
124123
queue.enqueue(
125124
healthcheck.group,
126125
runner_group.pk,

0 commit comments

Comments
 (0)