Commit 0b83c0f

Squashed commit of the following:

commit 43c4dc2
Author: sua yoo <[email protected]>
Date: Mon Nov 3 09:12:50 2025 -0800

    task: Add dedupe form control to workflow (#2932)

    - Adds a new "Deduplication" section to workflows
    - Allows users to use a collection for deduplication
    - Various refactors for consistency

commit 2fcf6d7
Author: Ilya Kreymer <[email protected]>
Date: Tue Oct 28 09:47:25 2025 -0700

    Dedup Backend Initial Implementation (#2868)

    Fixes #2867

    The backend implementation involves:

    Operator:
    - A new CollIndex CRD type; btrix-crds updated to 0.2.0
    - An operator that manages the new CRD type, creating a new Redis instance when the
      index should exist (uses the redis_dedupe_memory and redis_dedupe_storage chart values)
    - dedupe_importer_channel can configure the crawler channel used for index imports
    - The operator starts the crawler in 'indexer' mode

    Workflows & Crawls:
    - Workflows have a new 'dedupeCollId' field for dedupe while crawling. The `dedupeCollId`
      must also be a collection that the crawl is auto-added to.
    - There is a new waiting state, `waiting_for_dedupe_index`, entered if a crawl is starting
      but the index is not yet ready.
    - Each crawl has bi-directional links: the crawls it requires for dedupe via `requiresCrawls`,
      and the crawls for which this crawl is required via `requiredByCrawls`.
    - autoAddCollections is automatically updated to always include the `dedupeCollId` collection.

    Collection:
    - Collections have a new `hasDedupeIndex` field.
    - Items added to or removed from a collection mark the CollIndex object for update by
      updating the collItemsUpdatedAt timestamp, triggering a reindex.
    - The CollIndex object is deleted when the collection is deleted.

    Indexing depends on a version of the crawler from webrecorder/browsertrix-crawler#884
    that supports indexing mode.

    ---------

    Co-authored-by: Tessa Walsh <[email protected]>
1 parent: ca3f226 · commit: 0b83c0f
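For orientation, the sketch below summarizes the data-model additions described in the commit message. It is a minimal, illustrative outline using Pydantic-style models, not the actual definitions in btrixcloud/models.py; the class names are invented for illustration, and only the field names and the waiting state come from the commit message.

# Minimal sketch of the dedupe-related model additions (illustrative only;
# class names are hypothetical, the real models live in btrixcloud/models.py).
from typing import List, Optional
from uuid import UUID

from pydantic import BaseModel


class WorkflowSketch(BaseModel):
    """Workflow: optional collection used for deduplication while crawling."""

    autoAddCollections: List[UUID] = []
    # must also appear in autoAddCollections so finished crawls feed the index
    dedupeCollId: Optional[UUID] = None


class CollectionSketch(BaseModel):
    """Collection: records whether a dedupe index (CollIndex + Redis) exists."""

    hasDedupeIndex: bool = False


class CrawlSketch(BaseModel):
    """Crawl: waiting state and bi-directional dedupe dependencies."""

    # set while the crawl waits for the collection index to become ready
    state: str = "waiting_for_dedupe_index"
    requiresCrawls: List[str] = []    # crawls this crawl dedupes against
    requiredByCrawls: List[str] = []  # crawls that dedupe against this one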

61 files changed (+1658, -266 lines)

backend/btrixcloud/colls.py

Lines changed: 99 additions & 13 deletions
@@ -49,6 +49,7 @@
     UserFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    ResourcesOnly,
 )
 from .utils import (
     dt_now,
@@ -57,6 +58,8 @@
     get_origin,
 )
 
+from .crawlmanager import CrawlManager
+
 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
@@ -81,8 +84,16 @@ class CollectionOps:
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
     page_ops: PageOps
+    crawl_manager: CrawlManager
 
-    def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
+    def __init__(
+        self,
+        mdb,
+        orgs: OrgOps,
+        storage_ops: StorageOps,
+        crawl_manager: CrawlManager,
+        event_webhook_ops: EventWebhookOps,
+    ):
         self.collections = mdb["collections"]
         self.crawls = mdb["crawls"]
         self.crawl_configs = mdb["crawl_configs"]
@@ -91,6 +102,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
 
         self.orgs = orgs
         self.storage_ops = storage_ops
+        self.crawl_manager = crawl_manager
         self.event_webhook_ops = event_webhook_ops
 
     def set_crawl_ops(self, ops):
@@ -141,11 +153,15 @@ async def add_collection(self, oid: UUID, coll_in: CollIn):
             access=coll_in.access,
             defaultThumbnailName=coll_in.defaultThumbnailName,
             allowPublicDownload=coll_in.allowPublicDownload,
+            hasDedupeIndex=coll_in.hasDedupeIndex,
         )
         try:
             await self.collections.insert_one(coll.to_dict())
             org = await self.orgs.get_org_by_id(oid)
             await self.clear_org_previous_slugs_matching_slug(slug, org)
+            # create collection index
+            if coll.hasDedupeIndex:
+                await self.crawl_manager.create_coll_index(coll)
 
             if crawl_ids:
                 await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
@@ -194,22 +210,33 @@ async def update_collection(
             db_update["$push"] = {"previousSlugs": previous_slug}
 
         try:
-            result = await self.collections.find_one_and_update(
+            prev_result = await self.collections.find_one_and_update(
                 {"_id": coll_id, "oid": org.id},
                 db_update,
-                return_document=pymongo.ReturnDocument.AFTER,
+                return_document=pymongo.ReturnDocument.BEFORE,
             )
         except pymongo.errors.DuplicateKeyError as err:
             # pylint: disable=raise-missing-from
             field = get_duplicate_key_error_field(err)
             raise HTTPException(status_code=400, detail=f"collection_{field}_taken")
 
-        if not result:
+        if not prev_result:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
         if slug_update:
             await self.clear_org_previous_slugs_matching_slug(slug_update, org)
 
+        # if dedupe index is true, but was false
+        if update.hasDedupeIndex and not prev_result.get("hasDedupeIndex"):
+            # get latest coll, create index
+            coll = await self.get_collection(coll_id, org.id)
+            await self.crawl_manager.create_coll_index(coll)
+
+        # if dedupe is false, but was true
+        if update.hasDedupeIndex is False and prev_result.get("hasDedupeIndex"):
+            # delete index -- may need extra restrictions
+            await self.crawl_manager.delete_coll_index(coll_id)
+
         return {"updated": True}
 
     async def clear_org_previous_slugs_matching_slug(
@@ -221,6 +248,16 @@ async def clear_org_previous_slugs_matching_slug(
             {"$pull": {"previousSlugs": slug}},
         )
 
+    async def get_coll_dedupe_index(self, coll_id: UUID) -> bool:
+        """return true/false if collection has dedupe index, or raise"""
+        result = await self.collections.find_one(
+            {"_id": coll_id}, projection=["hasDedupeIndex"]
+        )
+        if not result:
+            raise HTTPException(status_code=404, detail="collection_not_found")
+
+        return result["hasDedupeIndex"] is True
+
     async def add_crawls_to_collection(
         self,
         coll_id: UUID,
@@ -229,8 +266,6 @@ async def add_crawls_to_collection(
         headers: Optional[dict] = None,
     ) -> CollOut:
         """Add crawls to collection"""
-        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
-
         modified = dt_now()
         result = await self.collections.find_one_and_update(
             {"_id": coll_id},
@@ -240,8 +275,11 @@
         if not result:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
+        # do this after checking if collection exists
+        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
+
         await self.update_collection_counts_and_tags(coll_id)
-        await self.update_collection_dates(coll_id, org.id)
+        await self.update_collection_dates(coll_id, org.id, update_index=True)
 
         asyncio.create_task(
             self.event_webhook_ops.create_added_to_collection_notification(
@@ -270,7 +308,7 @@ async def remove_crawls_from_collection(
             raise HTTPException(status_code=404, detail="collection_not_found")
 
         await self.update_collection_counts_and_tags(coll_id)
-        await self.update_collection_dates(coll_id, org.id)
+        await self.update_collection_dates(coll_id, org.id, update_index=True)
 
         asyncio.create_task(
             self.event_webhook_ops.create_removed_from_collection_notification(
@@ -294,6 +332,24 @@ async def get_collection_raw(
 
         return result
 
+    async def enable_dedupe_index(self, coll_id: UUID):
+        """enable dedupe index if it doesn't exist yet"""
+        result = await self.collections.find_one_and_update(
+            {"_id": coll_id, "hasDedupeIndex": {"$ne": True}},
+            {"$set": {"hasDedupeIndex": True}},
+            return_document=pymongo.ReturnDocument.AFTER,
+        )
+
+        # not changed, nothing to do
+        if not result:
+            return False
+
+        coll = Collection.from_dict(result)
+
+        await self.crawl_manager.create_coll_index(coll)
+
+        return True
+
     async def get_collection_raw_by_slug(
         self,
         coll_slug: str,
@@ -396,6 +452,16 @@ async def get_collection_out(
 
         return CollOut.from_dict(result)
 
+    async def get_internal_replay_list(self, coll_id: UUID, oid: UUID) -> ResourcesOnly:
+        """get list of internally resolved signed WACZ files"""
+        org = await self.orgs.get_org_by_id(oid)
+        resources, _, _ = await self.get_collection_crawl_resources(coll_id, org)
+
+        for file_ in resources:
+            file_.path = self.storage_ops.resolve_internal_access_path(file_.path)
+
+        return ResourcesOnly(resources=resources)
+
     async def get_public_collection_out(
         self,
         coll_id: UUID,
@@ -639,6 +705,9 @@ async def delete_collection(self, coll_id: UUID, org: Organization):
         if coll.thumbnail:
             await self.delete_thumbnail(coll_id, org)
 
+        if coll.hasDedupeIndex:
+            await self.crawl_manager.delete_coll_index(coll.id)
+
         result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
         if result.deleted_count < 1:
             raise HTTPException(status_code=404, detail="collection_not_found")
@@ -740,7 +809,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
             },
         )
 
-    async def update_collection_dates(self, coll_id: UUID, oid: UUID):
+    async def update_collection_dates(
+        self, coll_id: UUID, oid: UUID, update_index=False
+    ):
         """Update collection earliest and latest dates from page timestamps"""
         # pylint: disable=too-many-locals
         coll = await self.get_collection(coll_id, oid)
@@ -749,6 +820,10 @@ async def update_collection_dates(self, coll_id: UUID, oid: UUID):
         earliest_ts = None
         latest_ts = None
 
+        # update_index is set, update dedupe index if it exists
+        if update_index and coll.hasDedupeIndex:
+            await self.crawl_manager.update_coll_index(coll_id)
+
         match_query = {
             "oid": coll.oid,
             "crawl_id": {"$in": crawl_ids},
@@ -783,13 +858,16 @@
 
     async def update_crawl_collections(self, crawl_id: str, oid: UUID):
         """Update counts, dates, and modified for all collections in crawl"""
+        # accessing directly to handle both crawls and uploads
         crawl = await self.crawls.find_one({"_id": crawl_id})
-        crawl_coll_ids = crawl.get("collectionIds")
+        crawl_coll_ids = crawl.get("collectionIds") or []
         modified = dt_now()
 
         for coll_id in crawl_coll_ids:
             await self.update_collection_counts_and_tags(coll_id)
-            await self.update_collection_dates(coll_id, oid)
+            await self.update_collection_dates(
+                coll_id, oid, crawl.get("dedupeCollId") != coll_id
+            )
             await self.collections.find_one_and_update(
                 {"_id": coll_id},
                 {"$set": {"modified": modified}},
@@ -1000,12 +1078,20 @@ async def calculate_thumbnail_storage(self, oid: UUID) -> int:
 # ============================================================================
 # pylint: disable=too-many-locals
 def init_collections_api(
-    app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
+    app,
+    mdb,
+    orgs: OrgOps,
+    storage_ops: StorageOps,
+    crawl_manager: CrawlManager,
+    event_webhook_ops: EventWebhookOps,
+    user_dep,
 ) -> CollectionOps:
     """init collections api"""
     # pylint: disable=invalid-name, unused-argument, too-many-arguments
 
-    colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)
+    colls: CollectionOps = CollectionOps(
+        mdb, orgs, storage_ops, crawl_manager, event_webhook_ops
+    )
 
     org_crawl_dep = orgs.org_crawl_dep
     org_viewer_dep = orgs.org_viewer_dep
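A rough usage sketch of the new CollectionOps dedupe hooks added above follows. It assumes an already constructed CollectionOps instance (here named coll_ops) and existing collection and org ids; the helper function itself is hypothetical and only strings together calls whose signatures appear in the diff.

# Hypothetical helper illustrating the intended call order; `coll_ops` is an
# existing CollectionOps instance, and the ids come from the caller.
from uuid import UUID


async def attach_dedupe(coll_ops, coll_id: UUID, org_id: UUID) -> None:
    # sets hasDedupeIndex and creates the CollIndex object only if not already set
    created = await coll_ops.enable_dedupe_index(coll_id)
    if created:
        print(f"dedupe index created for collection {coll_id}")

    # refresh collection dates; update_index=True also marks the dedupe index
    # for update when the collection has one
    await coll_ops.update_collection_dates(coll_id, org_id, update_index=True)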

backend/btrixcloud/crawlconfigs.py

Lines changed: 62 additions & 3 deletions
@@ -4,7 +4,17 @@
 
 # pylint: disable=too-many-lines
 
-from typing import List, Optional, TYPE_CHECKING, cast, Dict, Tuple, Annotated, Union
+from typing import (
+    List,
+    Optional,
+    TYPE_CHECKING,
+    cast,
+    Dict,
+    Tuple,
+    Annotated,
+    Union,
+    Any,
+)
 
 import asyncio
 import json
@@ -319,6 +329,14 @@ async def add_crawl_config(
 
         first_seed = seeds[0].url
 
+        # the dedupe collection id must also be in auto add collections
+        if config_in.dedupeCollId:
+            if config_in.autoAddCollections is None:
+                config_in.autoAddCollections = []
+
+            if config_in.dedupeCollId not in config_in.autoAddCollections:
+                config_in.autoAddCollections.append(config_in.dedupeCollId)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -346,6 +364,7 @@
             firstSeed=first_seed,
             seedCount=seed_count,
             shareable=config_in.shareable,
+            dedupeCollId=config_in.dedupeCollId,
         )
 
         if config_in.runNow:
@@ -362,6 +381,9 @@
         storage_quota_reached = False
         exec_mins_quota_reached = False
 
+        if config_in.dedupeCollId:
+            await self.coll_ops.enable_dedupe_index(config_in.dedupeCollId)
+
         if config_in.runNow:
             try:
                 crawl_id = await self.run_now_internal(crawlconfig, org, user)
@@ -605,6 +627,26 @@ async def update_crawl_config(
             update.tags is not None
             and ",".join(orig_crawl_config.tags) != ",".join(update.tags)
         )
+
+        metadata_changed = metadata_changed or (
+            update.dedupeCollId is not None
+            and update.dedupeCollId != orig_crawl_config.dedupeCollId
+        )
+
+        if isinstance(update.dedupeCollId, UUID):
+            dedupe_coll_id = update.dedupeCollId
+        elif update.dedupeCollId == "":
+            dedupe_coll_id = None
+        else:
+            dedupe_coll_id = orig_crawl_config.dedupeCollId
+
+        if (
+            dedupe_coll_id
+            and update.autoAddCollections is not None
+            and dedupe_coll_id not in update.autoAddCollections
+        ):
+            update.autoAddCollections.append(dedupe_coll_id)
+
         metadata_changed = metadata_changed or (
             update.autoAddCollections is not None
             and sorted(orig_crawl_config.autoAddCollections)
@@ -632,14 +674,22 @@
         query["modifiedByName"] = user.name
         query["modified"] = dt_now()
 
-        # if empty str, just clear the profile
+        # profile - if empty str, just clear the profile
         if update.profileid == "":
             query["profileid"] = None
         # else, ensure its a valid profile
         elif update.profileid:
             await self.profiles.get_profile(cast(UUID, update.profileid), org)
             query["profileid"] = update.profileid
 
+        # dedupe - if empty dedupeCollId, clear the coll id
+        if update.dedupeCollId == "":
+            query["dedupeCollId"] = None
+        # else, enable dedupe on collection
+        if isinstance(update.dedupeCollId, UUID):
+            query["dedupeCollId"] = update.dedupeCollId
+            await self.coll_ops.enable_dedupe_index(update.dedupeCollId)
+
         if update.config is not None:
             query["config"] = update.config.dict()
 
@@ -654,10 +704,15 @@
             query["seedCount"] = len(update.config.seeds)
             query["seedFileId"] = None
 
+        update_query: dict[str, Any] = {"$set": query, "$inc": {"rev": 1}}
+        # only add here if not setting autoAddCollections
+        if dedupe_coll_id and "autoAddCollections" not in query:
+            update_query["$addToSet"] = {"autoAddCollections": dedupe_coll_id}
+
         # update in db
         result = await self.crawl_configs.find_one_and_update(
             {"_id": cid, "inactive": {"$ne": True}},
-            {"$set": query, "$inc": {"rev": 1}},
+            update_query,
             return_document=pymongo.ReturnDocument.AFTER,
         )
 
@@ -1123,6 +1178,10 @@ async def remove_collection_from_all_configs(
             {"$pull": {"autoAddCollections": coll_id}},
         )
 
+        await self.crawl_configs.update_many(
+            {"oid": org.id, "dedupeCollId": coll_id}, {"$set": {"dedupeCollId": None}}
+        )
+
     async def get_crawl_config_tags(self, org):
         """get distinct tags from all crawl configs for this org"""
         return await self.crawl_configs.distinct("tags", {"oid": org.id})
