Skip to content

Commit d58747d

Browse files
tw4l and ikreymer authored
Provide full resources in archived items finished webhooks (#1308)
Fixes #1306 - Include full `resources` with expireAt (as string) in crawlFinished and uploadFinished webhook notifications rather than using the `downloadUrls` field (this is retained for collections). - Set default presigned duration to one minute short of 1 week and enforce maximum supported by S3 - Add 'storage_presign_duration_minutes' commented out to helm values.yaml - Update tests --------- Co-authored-by: Ilya Kreymer <[email protected]>
1 parent 2e5952a commit d58747d

File tree

6 files changed

+48
-33
lines changed

6 files changed

+48
-33
lines changed

backend/btrixcloud/basecrawls.py

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
""" base crawl type """
22

3-
import asyncio
43
import uuid
54
import os
65
from datetime import timedelta
@@ -44,6 +43,12 @@
4443
ALL_CRAWL_STATES = (*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES)
4544

4645

46+
# Presign duration must be less than 604800 seconds (one week),
47+
# so set this one minute short of a week.
48+
PRESIGN_MINUTES_MAX = 10079
49+
PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX
50+
51+
4752
# ============================================================================
4853
# pylint: disable=too-many-instance-attributes
4954
class BaseCrawlOps:
@@ -62,8 +67,12 @@ def __init__(
6267
self.colls = colls
6368
self.storage_ops = storage_ops
6469

70+
presign_duration_minutes = int(
71+
os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
72+
)
73+
6574
self.presign_duration_seconds = (
66-
int(os.environ.get("PRESIGN_DURATION_MINUTES", 60)) * 60
75+
min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60
6776
)
6877

6978
async def get_crawl_raw(
@@ -362,7 +371,6 @@ async def _resolve_signed_urls(
362371

363372
delta = timedelta(seconds=self.presign_duration_seconds)
364373

365-
updates = []
366374
out_files = []
367375

368376
for file_ in files:
@@ -374,17 +382,20 @@ async def _resolve_signed_urls(
374382
presigned_url = await self.storage_ops.get_presigned_url(
375383
org, file_, self.presign_duration_seconds
376384
)
377-
updates.append(
378-
(
379-
{"files.filename": file_.filename},
380-
{
381-
"$set": {
382-
"files.$.presignedUrl": presigned_url,
383-
"files.$.expireAt": exp,
384-
}
385-
},
386-
)
385+
await self.crawls.find_one_and_update(
386+
{"files.filename": file_.filename},
387+
{
388+
"$set": {
389+
"files.$.presignedUrl": presigned_url,
390+
"files.$.expireAt": exp,
391+
}
392+
},
387393
)
394+
file_.expireAt = exp
395+
396+
expire_at_str = ""
397+
if file_.expireAt:
398+
expire_at_str = file_.expireAt.isoformat()
388399

389400
out_files.append(
390401
CrawlFileOut(
@@ -393,20 +404,12 @@ async def _resolve_signed_urls(
393404
hash=file_.hash,
394405
size=file_.size,
395406
crawlId=crawl_id,
407+
expireAt=expire_at_str,
396408
)
397409
)
398410

399-
if updates:
400-
asyncio.create_task(self._update_presigned(updates))
401-
402-
# print("presigned", out_files)
403-
404411
return out_files
405412

406-
async def _update_presigned(self, updates):
407-
for update in updates:
408-
await self.crawls.find_one_and_update(*update)
409-
410413
@contextlib.asynccontextmanager
411414
async def get_redis(self, crawl_id):
412415
"""get redis url for crawl id"""

backend/btrixcloud/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,9 @@ class CrawlFileOut(BaseModel):
406406
path: str
407407
hash: str
408408
size: int
409+
409410
crawlId: Optional[str]
411+
expireAt: Optional[str]
410412

411413

412414
# ============================================================================
@@ -1053,6 +1055,7 @@ class BaseArchivedItemBody(WebhookNotificationBody):
10531055
"""Webhook notification POST body for when archived item is started or finished"""
10541056

10551057
itemId: str
1058+
resources: Optional[List[CrawlFileOut]] = None
10561059

10571060

10581061
# ============================================================================

backend/btrixcloud/webhooks.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,7 @@ async def _create_item_finished_notification(
189189
print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True)
190190
return
191191

192-
download_urls = []
193-
for resource in crawl.resources:
194-
download_url = f"{org.origin}{resource.path}"
195-
download_urls.append(download_url)
196-
197-
body.downloadUrls = download_urls
192+
body.resources = crawl.resources
198193

199194
notification = WebhookNotification(
200195
id=uuid.uuid4(),

backend/test/test_webhooks.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,17 @@ def test_get_webhook_event(admin_auth_headers, default_org_id):
8181
assert event
8282

8383
if event in ("crawlFinished", "uploadFinished"):
84-
assert len(body["downloadUrls"]) >= 1
84+
assert len(body["resources"]) >= 1
85+
assert len(body.get("downloadUrls", [])) == 0
8586
assert body["itemId"]
8687

8788
elif event in ("crawlStarted"):
88-
assert len(body["downloadUrls"]) == 0
89+
assert len(body.get("resources", [])) == 0
90+
assert len(body.get("downloadUrls", [])) == 0
8991
assert body["itemId"]
9092

9193
elif event in ("addedToCollection", "removedFromCollection"):
94+
assert len(body.get("resources", [])) == 0
9295
assert len(body["downloadUrls"]) == 1
9396
assert body["collectionId"]
9497
assert len(body["itemIds"]) >= 1
@@ -246,28 +249,33 @@ def test_webhooks_sent(
246249
assert post["itemId"]
247250
assert post["scheduled"] in (True, False)
248251
assert post.get("downloadUrls") is None
252+
assert post.get("resources") is None
249253

250254
elif event == "crawlFinished":
251255
crawl_finished_count += 1
252256
assert post["itemId"]
253257
assert post["state"]
254-
assert post["downloadUrls"]
258+
assert post["resources"]
259+
assert post.get("downloadUrls") is None
255260

256261
elif event == "uploadFinished":
257262
upload_finished_count += 1
258263
assert post["itemId"]
259264
assert post["state"]
260-
assert post["downloadUrls"]
265+
assert post["resources"]
266+
assert post.get("downloadUrls") is None
261267

262268
elif event == "addedToCollection":
263269
added_to_collection_count += 1
264270
assert post["downloadUrls"] and len(post["downloadUrls"]) == 1
271+
assert post.get("resources") is None
265272
assert post["itemIds"]
266273
assert post["collectionId"]
267274

268275
elif event == "removedFromCollection":
269276
removed_from_collection_count += 1
270277
assert post["downloadUrls"] and len(post["downloadUrls"]) == 1
278+
assert post.get("resources") is None
271279
assert post["itemIds"]
272280
assert post["collectionId"]
273281

chart/templates/configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ data:
3636

3737
RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
3838

39-
PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes | default 60 }}"
39+
PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}"
4040

4141
FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"
4242

chart/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,12 @@ storages:
276276
# shared_storage_profile:
277277

278278

279+
# optional: duration in minutes for WACZ download links to be valid
280+
# used by webhooks and replay
281+
# max value = 10079 (one week minus one minute)
282+
# storage_presign_duration_minutes: 10079
283+
284+
279285
# Email Options
280286
# =========================================
281287
email:

0 commit comments

Comments (0)