Skip to content

Commit 993f82a

Browse files
authored
Add last crawl's stats object to CrawlConfigOut (#2714)
Fixes #2709. This will allow us to display information about page counts (found, done) in the workflow list.
1 parent 89027ef commit 993f82a

File tree

3 files changed

+30
-9
lines changed

3 files changed

+30
-9
lines changed

backend/btrixcloud/crawlconfigs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,6 +852,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
852852
update_query["lastCrawlSize"] = sum(
853853
file_.get("size", 0) for file_ in last_crawl.get("files", [])
854854
)
855+
update_query["lastCrawlStats"] = last_crawl.get("stats")
855856
update_query["lastCrawlStopping"] = False
856857
update_query["isCrawlRunning"] = False
857858

@@ -866,6 +867,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
866867
update_query["lastCrawlTime"] = None
867868
update_query["lastCrawlState"] = None
868869
update_query["lastCrawlSize"] = 0
870+
update_query["lastCrawlStats"] = None
869871
update_query["lastRun"] = None
870872
update_query["isCrawlRunning"] = False
871873

@@ -895,6 +897,7 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
895897
crawlconfig.lastCrawlShouldPause = crawl.shouldPause
896898
crawlconfig.lastCrawlPausedAt = crawl.pausedAt
897899
crawlconfig.lastCrawlPausedExpiry = None
900+
crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
898901
if crawl.pausedAt:
899902
crawlconfig.lastCrawlPausedExpiry = (
900903
crawl.pausedAt + self.paused_expiry_delta
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
14201423
update_query["lastStartedByName"] = last_crawl.get("userName")
14211424
update_query["lastCrawlState"] = last_crawl.get("state")
14221425
update_query["lastCrawlSize"] = last_crawl_size
1426+
update_query["lastCrawlStats"] = last_crawl.get("stats")
14231427
update_query["lastCrawlStopping"] = False
14241428
update_query["isCrawlRunning"] = False
14251429

backend/btrixcloud/models.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,15 @@ class UserOrgInfoOut(BaseModel):
273273
ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]
274274

275275

276+
# ============================================================================
277+
class CrawlStats(BaseModel):
278+
"""Crawl Stats for pages and size"""
279+
280+
found: int = 0
281+
done: int = 0
282+
size: int = 0
283+
284+
276285
# ============================================================================
277286

278287
### CRAWL CONFIGS ###
@@ -510,6 +519,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
510519
lastCrawlShouldPause: Optional[bool] = False
511520
lastCrawlPausedAt: Optional[datetime] = None
512521
lastCrawlPausedExpiry: Optional[datetime] = None
522+
lastCrawlStats: Optional[CrawlStats] = None
513523
profileName: Optional[str] = None
514524

515525
createdByName: Optional[str] = None
@@ -772,15 +782,6 @@ class CrawlFileOut(BaseModel):
772782
expireAt: Optional[str] = None
773783

774784

775-
# ============================================================================
776-
class CrawlStats(BaseModel):
777-
"""Crawl Stats for pages and size"""
778-
779-
found: int = 0
780-
done: int = 0
781-
size: int = 0
782-
783-
784785
# ============================================================================
785786
class CoreCrawlable(BaseModel):
786787
# pylint: disable=too-few-public-methods

backend/test/test_crawlconfigs.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
520520
assert workflow["lastRun"]
521521
assert workflow["lastCrawlSize"] > 0
522522

523+
stats = workflow["lastCrawlStats"]
524+
assert stats["found"] > 0
525+
assert stats["done"] > 0
526+
assert stats["size"] > 0
527+
523528
if last_crawl_id == admin_crawl_id:
524529
global _admin_crawl_cid
525530
_admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
545550
assert data["lastRun"]
546551
assert data["lastCrawlSize"] > 0
547552

553+
stats = data["lastCrawlStats"]
554+
assert stats["found"] > 0
555+
assert stats["done"] > 0
556+
assert stats["size"] > 0
557+
548558

549559
def test_incremental_workflow_total_size_and_last_crawl_stats(
550560
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
564574
last_crawl_started = data["lastCrawlStartTime"]
565575
last_crawl_finished = data["lastCrawlTime"]
566576
last_run = data["lastRun"]
577+
last_stats = data["lastCrawlStats"]
567578

568579
# Run new crawl in this workflow
569580
r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
602613
assert data["lastCrawlStartTime"] > last_crawl_started
603614
assert data["lastCrawlTime"] > last_crawl_finished
604615
assert data["lastRun"] > last_run
616+
stats = data["lastCrawlStats"]
617+
assert stats["found"] > 0
618+
assert stats["done"] > 0
619+
assert stats["size"] > 0
605620

606621
# Delete new crawl
607622
r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
628643
assert data["lastCrawlStartTime"] == last_crawl_started
629644
assert data["lastCrawlTime"] == last_crawl_finished
630645
assert data["lastRun"] == last_run
646+
assert data["lastCrawlStats"] == last_stats
631647

632648

633649
def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):

0 commit comments

Comments (0)