Commit b4fd5e6

ikreymer and tw4l authored
Crawl Timeout via elapsed time (#1338)
Fixes #1337. The crawl timeout is now tracked via an `elapsedCrawlTime` field on the crawl status. It is similar to the regular crawl execution time, but counts only a single pod when scale > 1; when scale == 1 the two values are equivalent. The crawl is gracefully stopped when the elapsed execution time exceeds the timeout. For better responsiveness, the time elapsed since the last update interval is also added to the check.

Details:
- handle the crawl timeout via elapsed crawl time (the longest running time of any single pod) instead of an expire time
- include the running time since the last update for better precision
- more accurately count the elapsed time the crawl is actually running
- store elapsedCrawlTime in addition to crawlExecTime, adding the longest per-pod duration from each update interval

Co-authored-by: Tessa Walsh <[email protected]>
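The key distinction is between summed and maximum per-pod run time. Below is a minimal, self-contained sketch of how the two counters diverge and how the timeout check uses them; CrawlCounters, apply_update, and should_stop are hypothetical names for illustration, not the operator's actual API:

from datetime import datetime, timezone

# Illustrative sketch only; not part of the btrixcloud operator code.

def dt_now() -> datetime:
    return datetime.now(timezone.utc)

class CrawlCounters:
    crawl_exec_time: int = 0      # sum of run time across all crawler pods
    elapsed_crawl_time: int = 0   # longest single-pod run time per interval

def apply_update(counters: CrawlCounters, pod_durations: list[int]) -> None:
    # each update interval: exec time counts every pod, elapsed time counts
    # only the longest-running pod, so with scale == 1 the two stay equal
    counters.crawl_exec_time += sum(pod_durations)
    counters.elapsed_crawl_time += max(pod_durations, default=0)

def should_stop(counters: CrawlCounters, last_updated: datetime, timeout: int) -> bool:
    # include the time since the last update so the timeout check does not
    # have to wait for the next update interval
    since_update = (dt_now() - last_updated).total_seconds()
    return timeout > 0 and counters.elapsed_crawl_time + since_update > timeout

counters = CrawlCounters()
apply_update(counters, [60, 60, 60])   # three pods, each ran 60s this interval
print(counters.crawl_exec_time)        # 180 -- billed execution time
print(counters.elapsed_crawl_time)     # 60  -- time counted against the timeout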
1 parent 5530ca9 commit b4fd5e6

4 files changed: +31, -24 lines changed


backend/btrixcloud/k8sapi.py

Lines changed: 2 additions & 9 deletions
@@ -2,8 +2,6 @@
 import os
 import traceback
 
-from datetime import timedelta
-
 import yaml
 
 from kubernetes_asyncio import client, config
@@ -18,7 +16,7 @@
 from fastapi import HTTPException
 from fastapi.templating import Jinja2Templates
 
-from .utils import get_templates_dir, dt_now, to_k8s_date
+from .utils import get_templates_dir, dt_now
 
 
 # ============================================================================
@@ -85,11 +83,6 @@ def new_crawl_job_yaml(
         crawl_id=None,
     ):
         """load job template from yaml"""
-        if crawl_timeout:
-            crawl_expire_time = to_k8s_date(dt_now() + timedelta(seconds=crawl_timeout))
-        else:
-            crawl_expire_time = ""
-
        if not crawl_id:
            ts_now = dt_now().strftime("%Y%m%d%H%M%S")
            prefix = "manual" if manual else "sched"
@@ -101,7 +94,7 @@ def new_crawl_job_yaml(
            "oid": oid,
            "userid": userid,
            "scale": scale,
-            "expire_time": crawl_expire_time or 0,
+            "timeout": crawl_timeout,
            "max_crawl_size": max_crawl_size or 0,
            "storage_name": str(storage),
            "manual": "1" if manual else "0",

backend/btrixcloud/operator.py

Lines changed: 27 additions & 12 deletions
@@ -8,7 +8,6 @@
 
 from collections import defaultdict
 
-from datetime import datetime
 import json
 from uuid import UUID
 from fastapi import HTTPException
@@ -114,8 +113,8 @@ class CrawlSpec(BaseModel):
     started: str
     stopping: bool = False
     scheduled: bool = False
-    expire_time: Optional[datetime] = None
-    max_crawl_size: Optional[int] = None
+    timeout: int = 0
+    max_crawl_size: int = 0
 
 
 # ============================================================================
@@ -232,9 +231,15 @@ class CrawlStatus(BaseModel):
     restartTime: Optional[str]
     canceled: bool = False
 
-    # Execution Time -- updated on pod exits and at regular interval
+    # updated on pod exits and at regular interval
+    # Crawl Execution Time -- time all crawler pods have been running
+    # used to track resource usage and enforce execution minutes limit
     crawlExecTime: int = 0
 
+    # Elapsed Exec Time -- time crawl has been running in at least one pod
+    # used for crawl timeouts
+    elapsedCrawlTime: int = 0
+
     # last exec time update
     lastUpdatedTime: str = ""
 
@@ -440,7 +445,7 @@ async def sync_crawls(self, data: MCSyncData):
            scale=spec.get("scale", 1),
            started=data.parent["metadata"]["creationTimestamp"],
            stopping=spec.get("stopping", False),
-            expire_time=from_k8s_date(spec.get("expireTime")),
+            timeout=spec.get("timeout") or 0,
            max_crawl_size=int(spec.get("maxCrawlSize") or 0),
            scheduled=spec.get("manual") != "1",
        )
@@ -1081,6 +1086,7 @@ async def increment_pod_exec_time(
            return
 
        exec_time = 0
+        max_duration = 0
        print(
            f"Exec Time Update: {reason}: {now} - {update_start_time} = {update_duration}"
        )
@@ -1131,11 +1137,13 @@ async def increment_pod_exec_time(
                    f" - {name}: {pod_state}: {end_time} - {start_time} = {duration}"
                )
                exec_time += duration
+                max_duration = max(duration, max_duration)
 
        if exec_time:
            await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
            await self.org_ops.inc_org_time_stats(oid, exec_time, True)
            status.crawlExecTime += exec_time
+            status.elapsedCrawlTime += max_duration
 
        print(
            f" Exec Time Total: {status.crawlExecTime}, Incremented By: {exec_time}",
@@ -1254,20 +1262,27 @@ async def add_file_to_crawl(self, cc_data, crawl, redis):
 
        return True
 
-    def is_crawl_stopping(self, crawl, size):
+    def is_crawl_stopping(self, crawl: CrawlSpec, status: CrawlStatus) -> bool:
        """return true if crawl should begin graceful stopping phase"""
 
        # if user requested stop, then enter stopping phase
        if crawl.stopping:
            print("Graceful Stop: User requested stop")
            return True
 
-        # check crawl expiry
-        if crawl.expire_time and dt_now() > crawl.expire_time:
-            print(f"Graceful Stop: Job duration expired at {crawl.expire_time}")
-            return True
+        # gracefully stop if elapsed crawl time exceeds the timeout
+        if crawl.timeout:
+            elapsed = (
+                status.elapsedCrawlTime
+                + (dt_now() - from_k8s_date(status.lastUpdatedTime)).total_seconds()
+            )
+            if elapsed > crawl.timeout:
+                print(
+                    f"Graceful Stop: Crawl running time exceeded {crawl.timeout} second timeout"
+                )
+                return True
 
-        if crawl.max_crawl_size and size > crawl.max_crawl_size:
+        if crawl.max_crawl_size and status.size > crawl.max_crawl_size:
            print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
            return True
 
@@ -1311,7 +1326,7 @@ async def update_crawl_state(self, redis, crawl, status, pods, done) -> CrawlSta
            pod_info = status.podStatus[key]
            pod_info.used.storage = value
 
-        status.stopping = self.is_crawl_stopping(crawl, status.size)
+        status.stopping = self.is_crawl_stopping(crawl, status)
 
        # check exec time quotas and stop if reached limit
        if not status.stopping:
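To see why is_crawl_stopping adds the time since lastUpdatedTime on top of the stored elapsedCrawlTime, here is a small numeric illustration (the numbers are made up, not taken from the commit):

# illustrative numbers only: a 1-hour timeout with periodic status updates
timeout = 3600                 # seconds
elapsed_at_last_update = 3590  # elapsedCrawlTime stored at the last update
since_last_update = 15         # seconds since lastUpdatedTime

# the stored counter alone has not crossed the limit yet...
print(elapsed_at_last_update > timeout)                      # False
# ...but adding the time since the last update trips the timeout now,
# instead of waiting for the next update interval
print(elapsed_at_last_update + since_last_update > timeout)  # True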

backend/test_nightly/test_upload_replicas.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
 
 curr_dir = os.path.dirname(os.path.realpath(__file__))
 
+
 def test_upload_stream(admin_auth_headers, default_org_id):
     with open(os.path.join(curr_dir, "..", "test", "data", "example.wacz"), "rb") as fh:
         r = requests.put(

chart/app-templates/crawl_job.yaml

Lines changed: 1 addition & 3 deletions
@@ -20,11 +20,9 @@ spec:
   oid: "{{ oid }}"
   scale: {{ scale }}
   maxCrawlSize: {{ max_crawl_size }}
+  timeout: {{ timeout }}
   manual: {{ manual }}
   ttlSecondsAfterFinished: 30
 
   storageName: "{{ storage_name }}"
 
-  {% if expire_time %}
-  expireTime: "{{ expire_time }}"
-  {% endif %}
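The template now passes the raw timeout in seconds straight through to the custom resource spec, instead of rendering a precomputed expireTime. A standalone sketch of the substitution using plain Jinja2 (a hypothetical two-line fragment, not the actual crawl_job.yaml template loader):

from jinja2 import Template

# hypothetical fragment standing in for chart/app-templates/crawl_job.yaml
fragment = Template("  maxCrawlSize: {{ max_crawl_size }}\n  timeout: {{ timeout }}")

# new_crawl_job_yaml() now supplies the timeout in seconds as-is; the operator
# compares it against elapsed crawl time, so no expire timestamp is rendered here
print(fragment.render(max_crawl_size=0, timeout=3600))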
