From 969d166863a65a4e487e49900afe34988a1d91bc Mon Sep 17 00:00:00 2001
From: annaCPR
Date: Thu, 31 Oct 2024 10:59:42 +0000
Subject: [PATCH] Feature/pdct 1628 handle slugs for documents with the same
 title (#237)

* Handle duplicate titles when creating slugs during bulk import

* Refactor to surface commit errors better

* Increase suffix length to 6

* Bump patch version

* Update poetry.lock file
---
 app/repository/document.py                    |  7 ++-
 app/repository/helpers.py                     | 54 +++++++++++-----
 app/service/ingest.py                         | 16 +++--
 poetry.lock                                   | 30 ++++-----
 pyproject.toml                                |  2 +-
 tests/integration_tests/ingest/test_ingest.py | 61 +++++++++++++++++++
 tests/mocks/repos/document_repo.py            |  2 +-
 tests/unit_tests/repository/test_helpers.py   | 48 +++++++++++++++
 .../service/ingest/test_ingest_service.py     |  1 +
 9 files changed, 180 insertions(+), 41 deletions(-)
 create mode 100644 tests/unit_tests/repository/test_helpers.py

diff --git a/app/repository/document.py b/app/repository/document.py
index 1cf384cb..8f60c5ac 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -386,13 +386,16 @@ def _is_language_equal(
     return False
 
 
-def create(db: Session, document: DocumentCreateDTO) -> str:
+def create(
+    db: Session, document: DocumentCreateDTO, slug_name: Optional[str] = None
+) -> str:
     """
     Creates a new document.
 
     :param db Session: the database connection
     :param DocumentDTO document: the values for the new document
     :param int org_id: a validated organisation id
+    :param Optional[str] slug_name: a unique document slug or None
     :return str: The import id
     """
     try:
@@ -433,7 +436,7 @@ def create(db: Session, document: DocumentCreateDTO) -> str:
         db.add(
             Slug(
                 family_document_import_id=family_doc.import_id,
-                name=generate_slug(db, document.title),
+                name=slug_name if slug_name else generate_slug(db, document.title),
             )
         )
     except Exception as e:
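Note on the document.py change: create() now accepts an optional precomputed slug, so bulk import can generate slugs against an in-memory set instead of querying the Slug table once per document. A minimal sketch of the intended call pattern (mirroring the ingest service change below; db and dto are assumed to be an open Session and a DocumentCreateDTO):

    # Sketch: precompute a unique slug, then hand it to the repository layer.
    from app.repository import document as document_repository
    from app.repository.helpers import generate_slug

    slug = generate_slug(db, dto.title)  # unique against committed Slug rows
    import_id = document_repository.create(db, dto, slug)  # skips slug generation inside create()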
""" - lookup = set([cast(str, n) for n in db.query(Slug.name).all()]) base = slugify(str(title)) # TODO: try to extend suffix length if attempts are exhausted suffix = str(uuid4())[:suffix_length] count = 0 - while (slug := f"{base}_{suffix}") in lookup: + while (slug := f"{base}_{suffix}") in existing_slugs: count += 1 suffix = str(uuid4())[:suffix_length] if count > attempts: raise RuntimeError( f"Failed to generate a slug for {base} after {attempts} attempts." ) - lookup.add(slug) + existing_slugs.add(slug) return slug +def generate_slug( + db: Session, + title: str, + attempts: int = 100, + suffix_length: int = 4, + created_slugs: Optional[set[str]] = set(), +) -> str: + """ + Generates a slug for a given title. + + :param Session db: The connection to the db + :param str title: The title or name of the object you wish to slugify + :param int attempts: The number of attempt to generate a unique slug before + failing, defaults to 100 + :param int suffix_length: The suffix to produce uniqueness, defaults to 4 + :param Optional[set[str]] created_slugs: A set of slugs created in the context within which + this function runs that have not yet been committed to the DB + :return str: the slug generated + """ + saved_slugs = set([cast(str, n) for n in db.query(Slug.name).all()]) + + existing_slugs = created_slugs.union(saved_slugs) if created_slugs else saved_slugs + + return generate_unique_slug(existing_slugs, title, attempts, suffix_length) + + def generate_import_id( db: Session, entity_type: CountedEntity, org: Union[str, int] ) -> str: diff --git a/app/service/ingest.py b/app/service/ingest.py index bfe5f51f..1006fbda 100644 --- a/app/service/ingest.py +++ b/app/service/ingest.py @@ -30,6 +30,7 @@ IngestEventDTO, IngestFamilyDTO, ) +from app.repository.helpers import generate_slug DOCUMENT_INGEST_LIMIT = 1000 _LOGGER = logging.getLogger(__name__) @@ -162,6 +163,7 @@ def save_documents( validation.validate_documents(document_data, corpus_import_id) document_import_ids = [] + document_slugs = set() total_documents_saved = 0 for doc in document_data: @@ -171,7 +173,9 @@ def save_documents( ): _LOGGER.info(f"Importing document {doc['import_id']}") dto = IngestDocumentDTO(**doc).to_document_create_dto() - import_id = document_repository.create(db, dto) + slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs) + import_id = document_repository.create(db, dto, slug) + document_slugs.add(slug) document_import_ids.append(import_id) total_documents_saved += 1 @@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: notification_service.send_notification( f"🚀 Bulk import for corpus: {corpus_import_id} has started." ) + end_message = "" # ingest_uuid = uuid4() # upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data) @@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: # upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result) - notification_service.send_notification( + end_message = ( f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed." ) + db.commit() except Exception as e: _LOGGER.error( f"Rolling back transaction due to the following error: {e}", exc_info=True ) db.rollback() - notification_service.send_notification( - f"💥 Bulk import for corpus: {corpus_import_id} has failed." - ) + end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed." 
diff --git a/app/service/ingest.py b/app/service/ingest.py
index bfe5f51f..1006fbda 100644
--- a/app/service/ingest.py
+++ b/app/service/ingest.py
@@ -30,6 +30,7 @@
     IngestEventDTO,
     IngestFamilyDTO,
 )
+from app.repository.helpers import generate_slug
 
 DOCUMENT_INGEST_LIMIT = 1000
 _LOGGER = logging.getLogger(__name__)
@@ -162,6 +163,7 @@ def save_documents(
     validation.validate_documents(document_data, corpus_import_id)
 
     document_import_ids = []
+    document_slugs = set()
     total_documents_saved = 0
 
     for doc in document_data:
@@ -171,7 +173,9 @@ def save_documents(
         ):
             _LOGGER.info(f"Importing document {doc['import_id']}")
             dto = IngestDocumentDTO(**doc).to_document_create_dto()
-            import_id = document_repository.create(db, dto)
+            slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs)
+            import_id = document_repository.create(db, dto, slug)
+            document_slugs.add(slug)
             document_import_ids.append(import_id)
             total_documents_saved += 1
 
@@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
     notification_service.send_notification(
         f"🚀 Bulk import for corpus: {corpus_import_id} has started."
     )
+    end_message = ""
 
     # ingest_uuid = uuid4()
     # upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
@@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
 
         # upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)
 
-        notification_service.send_notification(
+        end_message = (
             f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
         )
+        db.commit()
     except Exception as e:
         _LOGGER.error(
             f"Rolling back transaction due to the following error: {e}", exc_info=True
        )
         db.rollback()
-        notification_service.send_notification(
-            f"💥 Bulk import for corpus: {corpus_import_id} has failed."
-        )
+        end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
     finally:
-        db.commit()
+        notification_service.send_notification(end_message)
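Note on the ingest.py change: read together, the import_data hunks reduce to the control flow below — commit only when every entity saved cleanly, roll back on any error, and send exactly one closing notification from finally (unrelated steps elided):

    end_message = ""
    try:
        ...  # save collections, families, documents and events
        end_message = (
            f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
        )
        db.commit()  # nothing persists unless the whole import succeeded
    except Exception as e:
        _LOGGER.error(
            f"Rolling back transaction due to the following error: {e}", exc_info=True
        )
        db.rollback()  # a failed import leaves no partial state behind
        end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
    finally:
        notification_service.send_notification(end_message)  # exactly one message either way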
diff --git a/poetry.lock b/poetry.lock
index ad528f7d..51a56227 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -170,17 +170,17 @@ uvloop = ["uvloop (>=0.15.2)"]
 
 [[package]]
 name = "boto3"
-version = "1.35.49"
+version = "1.35.51"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "boto3-1.35.49-py3-none-any.whl", hash = "sha256:b660c649a27a6b47a34f6f858f5bd7c3b0a798a16dec8dda7cbebeee80fd1f60"},
-    {file = "boto3-1.35.49.tar.gz", hash = "sha256:ddecb27f5699ca9f97711c52b6c0652c2e63bf6c2bfbc13b819b4f523b4d30ff"},
+    {file = "boto3-1.35.51-py3-none-any.whl", hash = "sha256:c922f6a18958af9d8af0489d6d8503b517029d8159b26aa4859a8294561c72e9"},
+    {file = "boto3-1.35.51.tar.gz", hash = "sha256:a57c6c7012ecb40c43e565a6f7a891f39efa990ff933eab63cd456f7501c2731"},
 ]
 
 [package.dependencies]
-botocore = ">=1.35.49,<1.36.0"
+botocore = ">=1.35.51,<1.36.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.10.0,<0.11.0"
 
@@ -189,13 +189,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
 
 [[package]]
 name = "botocore"
-version = "1.35.49"
+version = "1.35.51"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "botocore-1.35.49-py3-none-any.whl", hash = "sha256:aed4d3643afd702920792b68fbe712a8c3847993820d1048cd238a6469354da1"},
-    {file = "botocore-1.35.49.tar.gz", hash = "sha256:07d0c1325fdbfa49a4a054413dbdeab0a6030449b2aa66099241af2dac48afd8"},
+    {file = "botocore-1.35.51-py3-none-any.whl", hash = "sha256:4d65b00111bd12b98e9f920ecab602cf619cc6a6d0be6e5dd53f517e4b92901c"},
+    {file = "botocore-1.35.51.tar.gz", hash = "sha256:a9b3d1da76b3e896ad74605c01d88f596324a3337393d4bfbfa0d6c35822ca9c"},
 ]
 
 [package.dependencies]
@@ -1790,8 +1790,8 @@ files = [
 annotated-types = ">=0.6.0"
 pydantic-core = "2.23.4"
 typing-extensions = [
-    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
     {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
+    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
 ]
 
 [package.extras]
@@ -1931,8 +1931,8 @@ files = [
 astroid = ">=3.3.4,<=3.4.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.2", markers = "python_version < \"3.11\""},
     {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
+    {version = ">=0.2", markers = "python_version < \"3.11\""},
     {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
 ]
 isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
@@ -2415,13 +2415,13 @@ files = [
 
 [[package]]
 name = "slack-sdk"
-version = "3.33.2"
+version = "3.33.3"
 description = "The Slack API Platform SDK for Python"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "slack_sdk-3.33.2-py2.py3-none-any.whl", hash = "sha256:8107912488028f5a3f04ee9c58524418d3a2763a843db25530375b7733e6ec0a"},
-    {file = "slack_sdk-3.33.2.tar.gz", hash = "sha256:34c51cfb9c254553219a0bba7a741c913f5d4d372d6278c3e7e10fe7bb667724"},
+    {file = "slack_sdk-3.33.3-py2.py3-none-any.whl", hash = "sha256:0515fb93cd03b18de61f876a8304c4c3cef4dd3c2a3bad62d7394d2eb5a3c8e6"},
+    {file = "slack_sdk-3.33.3.tar.gz", hash = "sha256:4cc44c9ffe4bb28a01fbe3264c2f466c783b893a4eca62026ab845ec7c176ff1"},
 ]
 
 [package.extras]
@@ -2721,13 +2721,13 @@ test = ["aiohttp (>=3.10.5)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil",
 
 [[package]]
 name = "virtualenv"
-version = "20.27.0"
+version = "20.27.1"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "virtualenv-20.27.0-py3-none-any.whl", hash = "sha256:44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655"},
-    {file = "virtualenv-20.27.0.tar.gz", hash = "sha256:2ca56a68ed615b8fe4326d11a0dca5dfbe8fd68510fb6c6349163bed3c15f2b2"},
+    {file = "virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4"},
+    {file = "virtualenv-20.27.1.tar.gz", hash = "sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index e2f6df54..cafdc609 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "admin_backend"
-version = "2.17.3"
+version = "2.17.4"
 description = ""
 authors = ["CPR-dev-team "]
 packages = [{ include = "app" }, { include = "tests" }]
diff --git a/tests/integration_tests/ingest/test_ingest.py b/tests/integration_tests/ingest/test_ingest.py
index 43e90653..553e5009 100644
--- a/tests/integration_tests/ingest/test_ingest.py
+++ b/tests/integration_tests/ingest/test_ingest.py
@@ -238,6 +238,67 @@ def test_ingest_idempotency(
     )
 
 
+@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
+def test_generates_unique_slugs_for_documents_with_identical_titles(
+    caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
+):
+    """
+    This test ensures that, given multiple documents with the same title, a unique
+    slug is generated for each, so that all of the documents can be saved to the DB
+    at the end of bulk import. With the current length of the suffix added to each
+    slug to ensure uniqueness (6), the likelihood of a collision is extremely low.
+    """
+    family_import_id = "test.new.family.0"
+    test_data = {
+        "collections": [],
+        "families": [
+            {
+                "import_id": family_import_id,
+                "title": "Test",
+                "summary": "Test",
+                "geographies": ["South Asia"],
+                "category": "UNFCCC",
+                "metadata": {"author_type": ["Non-Party"], "author": ["Test"]},
+                "collections": [],
+            }
+        ],
+        "documents": [
+            {
+                "import_id": f"test.new.document.{i}",
+                "family_import_id": family_import_id,
+                "metadata": {"role": ["MAIN"], "type": ["Law"]},
+                "variant_name": "Original Language",
+                "title": "Project Document",
+                "user_language_name": "",
+            }
+            for i in range(1000)
+        ],
+        "events": [],
+    }
+    test_json = json.dumps(test_data).encode("utf-8")
+    test_data_file = io.BytesIO(test_json)
+
+    with caplog.at_level(logging.ERROR):
+        first_response = client.post(
+            "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
+            files={"new_data": test_data_file},
+            headers=user_header_token,
+        )
+
+    assert first_response.status_code == status.HTTP_202_ACCEPTED
+    assert first_response.json() == {
+        "message": "Bulk import request accepted. Check Cloudwatch logs for result."
+    }
+
+    assert (
+        "Created"
+        == data_db.query(FamilyDocument)
+        .filter(FamilyDocument.import_id == "test.new.document.999")
+        .one_or_none()
+        .document_status
+    )
+
+
 @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
 def test_ingest_when_corpus_import_id_invalid(
     caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
diff --git a/tests/mocks/repos/document_repo.py b/tests/mocks/repos/document_repo.py
index e80ab9eb..272b1490 100644
--- a/tests/mocks/repos/document_repo.py
+++ b/tests/mocks/repos/document_repo.py
@@ -55,7 +55,7 @@ def mock_update(_, import_id: str, data: DocumentReadDTO) -> DocumentReadDTO:
             raise exc.NoResultFound()
         return create_document_read_dto("a.b.c.d")
 
-    def mock_create(_, data: DocumentCreateDTO) -> str:
+    def mock_create(_, data: DocumentCreateDTO, slug: Optional[str] = "") -> str:
         maybe_throw()
         if document_repo.return_empty:
             raise exc.NoResultFound()
diff --git a/tests/unit_tests/repository/test_helpers.py b/tests/unit_tests/repository/test_helpers.py
new file mode 100644
index 00000000..a775664d
--- /dev/null
+++ b/tests/unit_tests/repository/test_helpers.py
@@ -0,0 +1,48 @@
+import uuid
+from unittest.mock import patch
+
+import pytest
+
+from app.repository.helpers import generate_unique_slug
+
+
+def test_successfully_generates_a_slug_with_a_six_character_suffix():
+    title = "Test title"
+    generated_slug = generate_unique_slug(set(), title)
+
+    expected_slugified_title = "test-title_"
+    expected_suffix_length = 6
+
+    assert expected_slugified_title in generated_slug
+    assert len(expected_slugified_title) + expected_suffix_length == len(generated_slug)
+
+
+def test_successfully_generates_a_unique_slug_if_one_already_exists():
+    title = "Test title"
+
+    existing_slug = generate_unique_slug(set(), title)
+    existing_suffix = existing_slug.split("_")[1]
+
+    with patch(
+        "app.repository.helpers.uuid4",
+        side_effect=[existing_suffix, uuid.uuid4()],
+    ):
+        generated_slug = generate_unique_slug({existing_slug}, title)
+
+    assert existing_slug != generated_slug
+
+
+def test_raises_error_if_a_unique_slug_cannot_be_created_in_a_specified_number_of_attempts():
+    title = "Test title"
+
+    existing_slug = generate_unique_slug(set(), title)
+    existing_suffix = existing_slug.split("_")[1]
+
+    with (
+        patch(
+            "app.repository.helpers.uuid4",
+            return_value=existing_suffix,
+        ),
+        pytest.raises(RuntimeError),
+    ):
+        generate_unique_slug({existing_slug}, title, 2)
diff --git a/tests/unit_tests/service/ingest/test_ingest_service.py b/tests/unit_tests/service/ingest/test_ingest_service.py
index 7460c262..81245286 100644
--- a/tests/unit_tests/service/ingest/test_ingest_service.py
+++ b/tests/unit_tests/service/ingest/test_ingest_service.py
@@ -203,6 +203,7 @@ def test_save_documents_when_data_invalid(validation_service_mock):
     assert "Error" == e.value.message
 
 
+@patch("app.service.ingest.generate_slug", Mock(return_value="test-slug_1234"))
 @patch("app.service.ingest._exists_in_db", Mock(return_value=False))
 def test_do_not_save_documents_over_ingest_limit(
     validation_service_mock, document_repo_mock, monkeypatch
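Note on how the pieces fit together: save_documents() accumulates each new slug in document_slugs, and generate_slug() folds that set into its uniqueness check, so documents with duplicate titles inside one uncommitted import still receive distinct slugs. A minimal self-contained illustration using generate_unique_slug directly (no DB session required; the printed suffixes are illustrative):

    from app.repository.helpers import generate_unique_slug

    created_slugs: set[str] = set()  # plays the role of document_slugs in save_documents()
    for _ in range(3):
        # generate_unique_slug() both checks the set and adds the slug it returns,
        # so a repeated title keeps producing fresh suffixes.
        print(generate_unique_slug(created_slugs, "Project Document"))

    # e.g. project-document_3f9a1c / project-document_0b72de / project-document_c41e88
    assert len(created_slugs) == 3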