Skip to content

Commit 195e920

Browse files
authored
PTFE-1196 reduce impact of missed webhooks (#491)
When a webhook is missed by the runner-manager whatever the reason may be, instead of continuously being X runner late for a given group, we continuously check the time it took for a given job to start and create an extra runner if the value is too high. While this may not completely solve the issue, it can reduce its impact on users.
2 parents 5a4c6cb + 7c5c4f7 commit 195e920

File tree

7 files changed

+99
-5
lines changed

7 files changed

+99
-5
lines changed

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
"typeCheckingMode": "basic"
5757
},
5858
"editor": {
59-
"defaultFormatter": "ms-python.python"
59+
"defaultFormatter": "ms-python.black-formatter"
6060
}
6161
}
6262
}

runner_manager/jobs/workflow_job.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
from datetime import timedelta
45

56
from githubkit.webhooks.models import (
67
WorkflowJobCompleted,
@@ -9,8 +10,9 @@
910
)
1011
from githubkit.webhooks.types import WorkflowJobEvent
1112

13+
from runner_manager import Settings
1214
from runner_manager.clients.github import GitHub
13-
from runner_manager.dependencies import get_github
15+
from runner_manager.dependencies import get_github, get_settings
1416
from runner_manager.models.runner import Runner
1517
from runner_manager.models.runner_group import RunnerGroup
1618

@@ -25,6 +27,11 @@ def log_workflow_job(webhook: WorkflowJobEvent) -> None:
2527
)
2628

2729

30+
def time_to_start(webhook: WorkflowJobInProgress | WorkflowJobCompleted) -> timedelta:
31+
"""From a given webhook, calculate the time it took to start the job"""
32+
return webhook.workflow_job.started_at - webhook.workflow_job.created_at
33+
34+
2835
def completed(webhook: WorkflowJobCompleted) -> int:
2936
log_workflow_job(webhook)
3037
runner: Runner | None = Runner.find_from_webhook(webhook)
@@ -47,6 +54,7 @@ def completed(webhook: WorkflowJobCompleted) -> int:
4754

4855
def in_progress(webhook: WorkflowJobInProgress) -> str | None:
4956
log_workflow_job(webhook)
57+
settings: Settings = get_settings()
5058
name: str | None = webhook.workflow_job.runner_name
5159
runner_group: RunnerGroup | None = RunnerGroup.find_from_webhook(webhook)
5260
if not runner_group:
@@ -55,6 +63,17 @@ def in_progress(webhook: WorkflowJobInProgress) -> str | None:
5563
log.info(f"Updating runner {name} in group {runner_group.name}")
5664
runner: Runner = runner_group.update_runner(webhook=webhook)
5765
log.info(f"Runner {name} in group {runner_group.name} has been updated")
66+
tts = time_to_start(webhook)
67+
log.info(f"{runner} took {tts} to start")
68+
# If the time to start is greater than settings.timeout_runner,
69+
# create an extra runner.
70+
# The main reason we perform this action is to ensure that
71+
# in the case we have missed a webhook, we still have a runner
72+
# available for the jobs that are requesting it.
73+
if tts > settings.timeout_runner and runner_group.is_full is False:
74+
log.info(f"Runner group {runner_group.name} needs a new runner")
75+
github: GitHub = get_github()
76+
runner_group.create_runner(github)
5877
return runner.pk
5978

6079

runner_manager/models/runner_group.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,11 @@ def need_new_runner(self) -> bool:
219219
count = len(runners)
220220
return (not_active < self.min or self.queued > 0) and count < self.max
221221

222+
@property
223+
def is_full(self) -> bool:
224+
"""Return True if the max number of runners has been reached."""
225+
return len(self.get_runners()) >= self.max
226+
222227
def create_github_group(self, github: GitHub) -> GitHubRunnerGroup:
223228
"""Create a GitHub runner group."""
224229

runner_manager/models/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class Settings(BaseSettings):
3131
api_key: Optional[SecretStr] = None
3232
log_level: Literal["INFO", "WARNING", "DEBUG", "ERROR"] = "INFO"
3333
runner_groups: List[BaseRunnerGroup] = []
34-
timeout_runner: Optional[timedelta] = timedelta(minutes=15)
34+
timeout_runner: timedelta = timedelta(minutes=15)
3535
time_to_live: Optional[timedelta] = timedelta(hours=12)
3636
healthcheck_interval: timedelta = timedelta(minutes=15)
3737
github_base_url: Optional[AnyHttpUrl] = Field(default="https://api.github.com")

tests/strategies.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class Repo(Repository):
8080
runner_group_id=Int,
8181
labels=st.lists(Text, min_size=1, max_size=5),
8282
started_at=st.datetimes(),
83+
created_at=st.datetimes(),
8384
)
8485

8586
JobPropQueuedStrategy = st.builds(
@@ -127,7 +128,7 @@ class Repo(Repository):
127128
github_base_url=st.just("http://localhost:4010"),
128129
github_token=st.just("test"),
129130
time_to_live=st.integers(1, 60),
130-
timeout_runner=st.integers(1, 10),
131+
timeout_runner=st.integers(120, 600),
131132
)
132133

133134
RedisStrategy = st.builds(

tests/unit/jobs/test_workflow_job.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import timedelta
12
from time import sleep
23
from uuid import uuid4
34

@@ -103,7 +104,6 @@ def test_workflow_job_completed(
103104
def test_workflow_job_in_progress(
104105
webhook: WorkflowJobInProgress, queue: Queue, settings: Settings, redis: Redis
105106
):
106-
107107
# flush all keys that start with settings.name in redis
108108

109109
init_model(Runner, redis, settings)
@@ -197,3 +197,61 @@ def test_workflow_job_queued(
197197
).first()
198198
assert runner.busy is False
199199
assert runner.status == "offline"
200+
201+
202+
@settings(max_examples=10)
203+
@given(
204+
webhook=WorkflowJobInProgressStrategy,
205+
queue=QueueStrategy,
206+
settings=SettingsStrategy,
207+
redis=RedisStrategy,
208+
)
209+
def test_workflow_job_time_to_start(
210+
webhook: WorkflowJobInProgress, queue: Queue, settings: Settings, redis: Redis
211+
):
212+
"""
213+
This test will ensure that an extra runner is created when the time to start
214+
the given workflow was higher than settings.timeout_runners.
215+
"""
216+
init_model(Runner, redis, settings)
217+
init_model(RunnerGroup, redis, settings)
218+
runner_group: RunnerGroup = RunnerGroup(
219+
organization=webhook.organization.login,
220+
name=webhook.workflow_job.runner_group_name,
221+
id=webhook.workflow_job.runner_group_id,
222+
labels=webhook.workflow_job.labels,
223+
manager=settings.name,
224+
backend={"name": "base"},
225+
max=2,
226+
)
227+
runner_group.save()
228+
Migrator().run()
229+
230+
runner: Runner = Runner(
231+
id=webhook.workflow_job.runner_id,
232+
name=webhook.workflow_job.runner_name,
233+
busy=False,
234+
status="online",
235+
manager=settings.name,
236+
runner_group_id=webhook.workflow_job.runner_group_id,
237+
runner_group_name=webhook.workflow_job.runner_group_name,
238+
)
239+
runner.save()
240+
Migrator().run()
241+
242+
assert len(runner_group.get_runners()) == 1
243+
# ensure we have only one runner if the time to start is less than timeout_runners
244+
webhook.workflow_job.started_at = webhook.workflow_job.created_at + (
245+
settings.timeout_runner - timedelta(minutes=15)
246+
)
247+
queue.enqueue(workflow_job.in_progress, webhook)
248+
assert len(runner_group.get_runners()) == 1
249+
# ensure we have two runners if the time to start is greater than timeout_runners
250+
webhook.workflow_job.started_at = webhook.workflow_job.created_at + (
251+
settings.timeout_runner + timedelta(minutes=15)
252+
)
253+
queue.enqueue(workflow_job.in_progress, webhook)
254+
assert len(runner_group.get_runners()) == 2
255+
# ensure we remain with two runners given that the max for the runner group is 2
256+
queue.enqueue(workflow_job.in_progress, webhook)
257+
assert len(runner_group.get_runners()) == 2

tests/unit/models/test_runner_group.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,14 @@ def test_find_github_group(runner_group: RunnerGroup, github: GitHub):
214214
assert exists is not None
215215
group = runner_group.save(github=github)
216216
assert exists.id == group.id
217+
218+
219+
def test_is_full(runner_group: RunnerGroup, github: GitHub):
220+
runner_group.max = 2
221+
runner_group.min = 1
222+
runner_group.save()
223+
assert runner_group.is_full is False
224+
runner_group.create_runner(github)
225+
assert runner_group.is_full is False
226+
runner_group.create_runner(github)
227+
assert runner_group.is_full is True

0 commit comments

Comments
 (0)