Skip to content

Commit

Permalink
Feature/pdct 1628 handle slugs for documents with the same title (#237)
Browse files Browse the repository at this point in the history
* Handle duplicate titles when creating slugs during bulk import

* Refactor to surface commit errors better

* Increase suffix length to 6

* Bump patch version

* Update poetry.lock file
  • Loading branch information
annaCPR authored Oct 31, 2024
1 parent 0c67391 commit 969d166
Show file tree
Hide file tree
Showing 9 changed files with 180 additions and 41 deletions.
7 changes: 5 additions & 2 deletions app/repository/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,13 +386,16 @@ def _is_language_equal(
return False


def create(db: Session, document: DocumentCreateDTO) -> str:
def create(
db: Session, document: DocumentCreateDTO, slug_name: Optional[str] = None
) -> str:
"""
Creates a new document.
:param db Session: the database connection
:param DocumentDTO document: the values for the new document
:param int org_id: a validated organisation id
:param Optional[str] slug_name: a unique document slug or None
:return str: The import id
"""
try:
Expand Down Expand Up @@ -433,7 +436,7 @@ def create(db: Session, document: DocumentCreateDTO) -> str:
db.add(
Slug(
family_document_import_id=family_doc.import_id,
name=generate_slug(db, document.title),
name=slug_name if slug_name else generate_slug(db, document.title),
)
)
except Exception as e:
Expand Down
54 changes: 38 additions & 16 deletions app/repository/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Helper functions for repos"""

from typing import Union, cast
from typing import Optional, Union, cast
from uuid import uuid4

from db_client.models.dfce.family import Slug
Expand All @@ -10,40 +10,62 @@
from sqlalchemy.orm import Session


def generate_slug(
db: Session,
title: str,
attempts: int = 100,
suffix_length: int = 4,
def generate_unique_slug(
existing_slugs: set[str], title: str, attempts: int = 100, suffix_length: int = 6
) -> str:
"""
Generates a slug that doesn't already exists.
Generates a slug that doesn't already exist.
:param Session db: The connection to the db
:param str title: The title or name of the object you wish to slugify
:param set[str] existing_slugs: A set of already existing slugs to validate against.
:param str title: The title or name of the object you wish to slugify.
:param int attempts: The number of attempts to generate a unique slug before
failing, defaults to 100
:param int suffix_length: The suffix to produce uniqueness, defaults to 4
:raises RuntimeError: If we cannot produce a unique slug
:return str: the slug generated
failing, defaults to 100.
:param int suffix_length: The suffix to produce uniqueness, defaults to 6.
:raises RuntimeError: If we cannot produce a unique slug.
:return str: The slug generated.
"""
lookup = set([cast(str, n) for n in db.query(Slug.name).all()])
base = slugify(str(title))

# TODO: try to extend suffix length if attempts are exhausted
suffix = str(uuid4())[:suffix_length]
count = 0
while (slug := f"{base}_{suffix}") in lookup:
while (slug := f"{base}_{suffix}") in existing_slugs:
count += 1
suffix = str(uuid4())[:suffix_length]
if count > attempts:
raise RuntimeError(
f"Failed to generate a slug for {base} after {attempts} attempts."
)
lookup.add(slug)
existing_slugs.add(slug)
return slug


def generate_slug(
    db: Session,
    title: str,
    attempts: int = 100,
    suffix_length: int = 6,
    created_slugs: Optional[set[str]] = None,
) -> str:
    """
    Generates a unique slug for a given title.

    Uniqueness is checked against all slugs already saved in the DB plus any
    slugs created (but not yet committed) during the current operation.

    :param Session db: The connection to the db.
    :param str title: The title or name of the object you wish to slugify.
    :param int attempts: The number of attempts to generate a unique slug before
        failing, defaults to 100.
    :param int suffix_length: The suffix length used to produce uniqueness,
        defaults to 6 to match generate_unique_slug (previously 4 here, which
        silently overrode the increased default on the bulk-import path).
    :param Optional[set[str]] created_slugs: A set of slugs created in the context
        within which this function runs that have not yet been committed to the DB.
        Defaults to None rather than set() to avoid the shared
        mutable-default-argument pitfall.
    :raises RuntimeError: If a unique slug cannot be produced.
    :return str: the slug generated
    """
    saved_slugs = {cast(str, n) for n in db.query(Slug.name).all()}

    # Union (non-mutating) so the caller's created_slugs set is not modified here;
    # generate_unique_slug records the new slug in the set it is given.
    existing_slugs = created_slugs | saved_slugs if created_slugs else saved_slugs

    return generate_unique_slug(existing_slugs, title, attempts, suffix_length)


def generate_import_id(
db: Session, entity_type: CountedEntity, org: Union[str, int]
) -> str:
Expand Down
16 changes: 10 additions & 6 deletions app/service/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
IngestEventDTO,
IngestFamilyDTO,
)
from app.repository.helpers import generate_slug

DOCUMENT_INGEST_LIMIT = 1000
_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -162,6 +163,7 @@ def save_documents(
validation.validate_documents(document_data, corpus_import_id)

document_import_ids = []
document_slugs = set()
total_documents_saved = 0

for doc in document_data:
Expand All @@ -171,7 +173,9 @@ def save_documents(
):
_LOGGER.info(f"Importing document {doc['import_id']}")
dto = IngestDocumentDTO(**doc).to_document_create_dto()
import_id = document_repository.create(db, dto)
slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs)
import_id = document_repository.create(db, dto, slug)
document_slugs.add(slug)
document_import_ids.append(import_id)
total_documents_saved += 1

Expand Down Expand Up @@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
notification_service.send_notification(
f"🚀 Bulk import for corpus: {corpus_import_id} has started."
)
end_message = ""

# ingest_uuid = uuid4()
# upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
Expand Down Expand Up @@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:

# upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)

notification_service.send_notification(
end_message = (
f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
)
db.commit()
except Exception as e:
_LOGGER.error(
f"Rolling back transaction due to the following error: {e}", exc_info=True
)
db.rollback()
notification_service.send_notification(
f"💥 Bulk import for corpus: {corpus_import_id} has failed."
)
end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
finally:
db.commit()
notification_service.send_notification(end_message)
30 changes: 15 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "admin_backend"
version = "2.17.3"
version = "2.17.4"
description = ""
authors = ["CPR-dev-team <[email protected]>"]
packages = [{ include = "app" }, { include = "tests" }]
Expand Down
61 changes: 61 additions & 0 deletions tests/integration_tests/ingest/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,67 @@ def test_ingest_idempotency(
)


@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
def test_generates_unique_slugs_for_documents_with_identical_titles(
    caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
):
    """
    Verify that multiple documents sharing one title each receive a unique slug,
    so the whole batch can be saved to the DB at the end of bulk import. With the
    current slug suffix length (6) the chance of a suffix collision is extremely
    low, so 1000 identically-titled documents should all import successfully.
    """
    family_import_id = "test.new.family.0"

    # One family plus 1000 documents that all share the same title.
    family = {
        "import_id": family_import_id,
        "title": "Test",
        "summary": "Test",
        "geographies": ["South Asia"],
        "category": "UNFCCC",
        "metadata": {"author_type": ["Non-Party"], "author": ["Test"]},
        "collections": [],
    }
    documents = [
        {
            "import_id": f"test.new.document.{i}",
            "family_import_id": family_import_id,
            "metadata": {"role": ["MAIN"], "type": ["Law"]},
            "variant_name": "Original Language",
            "title": "Project Document",
            "user_language_name": "",
        }
        for i in range(1000)
    ]
    payload = {
        "collections": [],
        "families": [family],
        "documents": documents,
        "events": [],
    }
    upload = io.BytesIO(json.dumps(payload).encode("utf-8"))

    with caplog.at_level(logging.ERROR):
        first_response = client.post(
            "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
            files={"new_data": upload},
            headers=user_header_token,
        )

    assert first_response.status_code == status.HTTP_202_ACCEPTED
    assert first_response.json() == {
        "message": "Bulk import request accepted. Check Cloudwatch logs for result."
    }

    # If every slug was unique, the very last document made it into the DB.
    last_document = (
        data_db.query(FamilyDocument)
        .filter(FamilyDocument.import_id == "test.new.document.999")
        .one_or_none()
    )
    assert last_document.document_status == "Created"


@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
def test_ingest_when_corpus_import_id_invalid(
caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
Expand Down
2 changes: 1 addition & 1 deletion tests/mocks/repos/document_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def mock_update(_, import_id: str, data: DocumentReadDTO) -> DocumentReadDTO:
raise exc.NoResultFound()
return create_document_read_dto("a.b.c.d")

def mock_create(_, data: DocumentCreateDTO) -> str:
def mock_create(_, data: DocumentCreateDTO, slug: Optional[str] = "") -> str:
maybe_throw()
if document_repo.return_empty:
raise exc.NoResultFound()
Expand Down
48 changes: 48 additions & 0 deletions tests/unit_tests/repository/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import uuid
from unittest.mock import patch

import pytest

from app.repository.helpers import generate_unique_slug


def test_successfully_generates_a_slug_with_a_six_character_suffix():
    """
    The default suffix length is 6 (see generate_unique_slug), so the slug is
    the slugified title, an underscore, and a 6-character uuid prefix.
    (Renamed from ..._four_digit_suffix, which contradicted the asserted
    length of 6; pytest discovers tests by prefix, so nothing calls this name.)
    """
    title = "Test title"
    generated_slug = generate_unique_slug(set(), title)

    expected_slugified_title = "test-title_"
    expected_suffix_length = 6

    assert expected_slugified_title in generated_slug
    assert len(expected_slugified_title) + expected_suffix_length == len(generated_slug)


def test_successfully_generates_a_unique_slug_if_one_already_exists():
    """
    If the first generated suffix collides with an existing slug, a new suffix
    is drawn and a different slug is returned.
    """
    title = "Test title"

    existing_slug = generate_unique_slug(set(), title)
    existing_suffix = existing_slug.split("_")[1]

    # First mocked uuid4() call returns the colliding suffix, forcing a retry;
    # the second returns a real UUID value. (The original passed the uuid4
    # *function object* here, not a UUID — the test only passed because
    # str(<function ...>)[:6] happened not to collide.)
    with patch(
        "app.repository.helpers.uuid4",
        side_effect=[existing_suffix, uuid.uuid4()],
    ):
        generated_slug = generate_unique_slug({existing_slug}, title)

    assert existing_slug != generated_slug


def test_raises_error_if_a_unique_slug_cannot_be_created_in_a_specified_number_of_attempts():
    """Every retry collides, so after the attempt limit a RuntimeError is raised."""
    title = "Test title"

    taken_slug = generate_unique_slug(set(), title)
    colliding_suffix = taken_slug.split("_")[1]

    # Pin uuid4 to the colliding suffix so no retry can ever succeed.
    pinned_uuid4 = patch(
        "app.repository.helpers.uuid4",
        return_value=colliding_suffix,
    )

    with pinned_uuid4, pytest.raises(RuntimeError):
        generate_unique_slug({taken_slug}, title, 2)
Loading

0 comments on commit 969d166

Please sign in to comment.