From 969d166863a65a4e487e49900afe34988a1d91bc Mon Sep 17 00:00:00 2001
From: annaCPR
Date: Thu, 31 Oct 2024 10:59:42 +0000
Subject: [PATCH] Feature/pdct 1628 handle slugs for documents with the same
 title (#237)

* Handle duplicate titles when creating slugs during bulk import

* Refactor to surface commit errors better

* Increase suffix length to 6

* Bump patch version

* Update poetry.lock file
---
 app/repository/document.py                    |  7 ++-
 app/repository/helpers.py                     | 54 +++++++++++-----
 app/service/ingest.py                         | 16 +++--
 poetry.lock                                   | 30 ++++-----
 pyproject.toml                                |  2 +-
 tests/integration_tests/ingest/test_ingest.py | 61 +++++++++++++++++++
 tests/mocks/repos/document_repo.py            |  2 +-
 tests/unit_tests/repository/test_helpers.py   | 48 +++++++++++++++
 .../service/ingest/test_ingest_service.py     |  1 +
 9 files changed, 180 insertions(+), 41 deletions(-)
 create mode 100644 tests/unit_tests/repository/test_helpers.py

diff --git a/app/repository/document.py b/app/repository/document.py
index 1cf384cb..8f60c5ac 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -386,13 +386,16 @@ def _is_language_equal(
     return False
 
 
-def create(db: Session, document: DocumentCreateDTO) -> str:
+def create(
+    db: Session, document: DocumentCreateDTO, slug_name: Optional[str] = None
+) -> str:
     """
     Creates a new document.
 
     :param db Session: the database connection
     :param DocumentDTO document: the values for the new document
     :param int org_id: a validated organisation id
+    :param Optional[str] slug_name: a unique document slug or None
     :return str: The import id
     """
     try:
@@ -433,7 +436,7 @@ def create(db: Session, document: DocumentCreateDTO) -> str:
         db.add(
             Slug(
                 family_document_import_id=family_doc.import_id,
-                name=generate_slug(db, document.title),
+                name=slug_name if slug_name else generate_slug(db, document.title),
             )
         )
     except Exception as e:
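Note on the document.py change: create() now accepts an optional precomputed slug, so bulk import can generate slugs against an in-memory set instead of querying the Slug table once per document. A minimal sketch of the intended call pattern (mirroring the ingest service change below; db and dto are assumed to be an open Session and a DocumentCreateDTO):

    # Sketch: precompute a unique slug, then hand it to the repository layer.
    from app.repository import document as document_repository
    from app.repository.helpers import generate_slug

    slug = generate_slug(db, dto.title)  # unique against committed Slug rows
    import_id = document_repository.create(db, dto, slug)  # skips slug generation inside create()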
""" - lookup = set([cast(str, n) for n in db.query(Slug.name).all()]) base = slugify(str(title)) # TODO: try to extend suffix length if attempts are exhausted suffix = str(uuid4())[:suffix_length] count = 0 - while (slug := f"{base}_{suffix}") in lookup: + while (slug := f"{base}_{suffix}") in existing_slugs: count += 1 suffix = str(uuid4())[:suffix_length] if count > attempts: raise RuntimeError( f"Failed to generate a slug for {base} after {attempts} attempts." ) - lookup.add(slug) + existing_slugs.add(slug) return slug +def generate_slug( + db: Session, + title: str, + attempts: int = 100, + suffix_length: int = 4, + created_slugs: Optional[set[str]] = set(), +) -> str: + """ + Generates a slug for a given title. + + :param Session db: The connection to the db + :param str title: The title or name of the object you wish to slugify + :param int attempts: The number of attempt to generate a unique slug before + failing, defaults to 100 + :param int suffix_length: The suffix to produce uniqueness, defaults to 4 + :param Optional[set[str]] created_slugs: A set of slugs created in the context within which + this function runs that have not yet been committed to the DB + :return str: the slug generated + """ + saved_slugs = set([cast(str, n) for n in db.query(Slug.name).all()]) + + existing_slugs = created_slugs.union(saved_slugs) if created_slugs else saved_slugs + + return generate_unique_slug(existing_slugs, title, attempts, suffix_length) + + def generate_import_id( db: Session, entity_type: CountedEntity, org: Union[str, int] ) -> str: diff --git a/app/service/ingest.py b/app/service/ingest.py index bfe5f51f..1006fbda 100644 --- a/app/service/ingest.py +++ b/app/service/ingest.py @@ -30,6 +30,7 @@ IngestEventDTO, IngestFamilyDTO, ) +from app.repository.helpers import generate_slug DOCUMENT_INGEST_LIMIT = 1000 _LOGGER = logging.getLogger(__name__) @@ -162,6 +163,7 @@ def save_documents( validation.validate_documents(document_data, corpus_import_id) document_import_ids = [] + document_slugs = set() total_documents_saved = 0 for doc in document_data: @@ -171,7 +173,9 @@ def save_documents( ): _LOGGER.info(f"Importing document {doc['import_id']}") dto = IngestDocumentDTO(**doc).to_document_create_dto() - import_id = document_repository.create(db, dto) + slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs) + import_id = document_repository.create(db, dto, slug) + document_slugs.add(slug) document_import_ids.append(import_id) total_documents_saved += 1 @@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: notification_service.send_notification( f"🚀 Bulk import for corpus: {corpus_import_id} has started." ) + end_message = "" # ingest_uuid = uuid4() # upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data) @@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: # upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result) - notification_service.send_notification( + end_message = ( f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed." ) + db.commit() except Exception as e: _LOGGER.error( f"Rolling back transaction due to the following error: {e}", exc_info=True ) db.rollback() - notification_service.send_notification( - f"💥 Bulk import for corpus: {corpus_import_id} has failed." - ) + end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed." 
diff --git a/app/service/ingest.py b/app/service/ingest.py
index bfe5f51f..1006fbda 100644
--- a/app/service/ingest.py
+++ b/app/service/ingest.py
@@ -30,6 +30,7 @@
     IngestEventDTO,
     IngestFamilyDTO,
 )
+from app.repository.helpers import generate_slug
 
 DOCUMENT_INGEST_LIMIT = 1000
 _LOGGER = logging.getLogger(__name__)
@@ -162,6 +163,7 @@ def save_documents(
     validation.validate_documents(document_data, corpus_import_id)
 
     document_import_ids = []
+    document_slugs = set()
     total_documents_saved = 0
 
     for doc in document_data:
@@ -171,7 +173,9 @@ def save_documents(
         ):
             _LOGGER.info(f"Importing document {doc['import_id']}")
             dto = IngestDocumentDTO(**doc).to_document_create_dto()
-            import_id = document_repository.create(db, dto)
+            slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs)
+            import_id = document_repository.create(db, dto, slug)
+            document_slugs.add(slug)
             document_import_ids.append(import_id)
             total_documents_saved += 1
 
@@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
     notification_service.send_notification(
         f"🚀 Bulk import for corpus: {corpus_import_id} has started."
     )
+    end_message = ""
 
     # ingest_uuid = uuid4()
     # upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
@@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
 
         # upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)
 
-        notification_service.send_notification(
+        end_message = (
             f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
         )
+        db.commit()
     except Exception as e:
         _LOGGER.error(
             f"Rolling back transaction due to the following error: {e}", exc_info=True
        )
         db.rollback()
-        notification_service.send_notification(
-            f"💥 Bulk import for corpus: {corpus_import_id} has failed."
-        )
+        end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
     finally:
-        db.commit()
+        notification_service.send_notification(end_message)
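Note on the ingest.py change: read together, the import_data hunks reduce to the control flow below — commit only when every entity saved cleanly, roll back on any error, and send exactly one closing notification from finally (unrelated steps elided):

    end_message = ""
    try:
        ...  # save collections, families, documents and events
        end_message = (
            f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
        )
        db.commit()  # nothing persists unless the whole import succeeded
    except Exception as e:
        _LOGGER.error(
            f"Rolling back transaction due to the following error: {e}", exc_info=True
        )
        db.rollback()  # a failed import leaves no partial state behind
        end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
    finally:
        notification_service.send_notification(end_message)  # exactly one message either way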
diff --git a/poetry.lock b/poetry.lock
index ad528f7d..51a56227 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -170,17 +170,17 @@ uvloop = ["uvloop (>=0.15.2)"]
 
 [[package]]
 name = "boto3"
-version = "1.35.49"
+version = "1.35.51"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "boto3-1.35.49-py3-none-any.whl", hash = "sha256:b660c649a27a6b47a34f6f858f5bd7c3b0a798a16dec8dda7cbebeee80fd1f60"},
-    {file = "boto3-1.35.49.tar.gz", hash = "sha256:ddecb27f5699ca9f97711c52b6c0652c2e63bf6c2bfbc13b819b4f523b4d30ff"},
+    {file = "boto3-1.35.51-py3-none-any.whl", hash = "sha256:c922f6a18958af9d8af0489d6d8503b517029d8159b26aa4859a8294561c72e9"},
+    {file = "boto3-1.35.51.tar.gz", hash = "sha256:a57c6c7012ecb40c43e565a6f7a891f39efa990ff933eab63cd456f7501c2731"},
 ]
 
 [package.dependencies]
-botocore = ">=1.35.49,<1.36.0"
+botocore = ">=1.35.51,<1.36.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.10.0,<0.11.0"
 
@@ -189,13 +189,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
 
 [[package]]
 name = "botocore"
-version = "1.35.49"
+version = "1.35.51"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "botocore-1.35.49-py3-none-any.whl", hash = "sha256:aed4d3643afd702920792b68fbe712a8c3847993820d1048cd238a6469354da1"},
-    {file = "botocore-1.35.49.tar.gz", hash = "sha256:07d0c1325fdbfa49a4a054413dbdeab0a6030449b2aa66099241af2dac48afd8"},
+    {file = "botocore-1.35.51-py3-none-any.whl", hash = "sha256:4d65b00111bd12b98e9f920ecab602cf619cc6a6d0be6e5dd53f517e4b92901c"},
+    {file = "botocore-1.35.51.tar.gz", hash = "sha256:a9b3d1da76b3e896ad74605c01d88f596324a3337393d4bfbfa0d6c35822ca9c"},
 ]
 
 [package.dependencies]
@@ -1790,8 +1790,8 @@ files = [
 annotated-types = ">=0.6.0"
 pydantic-core = "2.23.4"
 typing-extensions = [
-    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
     {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
+    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
 ]
 
 [package.extras]
@@ -1931,8 +1931,8 @@ files = [
 astroid = ">=3.3.4,<=3.4.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.2", markers = "python_version < \"3.11\""},
     {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
+    {version = ">=0.2", markers = "python_version < \"3.11\""},
     {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
 ]
 isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
@@ -2415,13 +2415,13 @@ files = [
 
 [[package]]
 name = "slack-sdk"
-version = "3.33.2"
+version = "3.33.3"
 description = "The Slack API Platform SDK for Python"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "slack_sdk-3.33.2-py2.py3-none-any.whl", hash = "sha256:8107912488028f5a3f04ee9c58524418d3a2763a843db25530375b7733e6ec0a"},
-    {file = "slack_sdk-3.33.2.tar.gz", hash = "sha256:34c51cfb9c254553219a0bba7a741c913f5d4d372d6278c3e7e10fe7bb667724"},
+    {file = "slack_sdk-3.33.3-py2.py3-none-any.whl", hash = "sha256:0515fb93cd03b18de61f876a8304c4c3cef4dd3c2a3bad62d7394d2eb5a3c8e6"},
+    {file = "slack_sdk-3.33.3.tar.gz", hash = "sha256:4cc44c9ffe4bb28a01fbe3264c2f466c783b893a4eca62026ab845ec7c176ff1"},
 ]
 
 [package.extras]
@@ -2721,13 +2721,13 @@ test = ["aiohttp (>=3.10.5)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil",
 
 [[package]]
 name = "virtualenv"
-version = "20.27.0"
+version = "20.27.1"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "virtualenv-20.27.0-py3-none-any.whl", hash = "sha256:44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655"},
-    {file = "virtualenv-20.27.0.tar.gz", hash = "sha256:2ca56a68ed615b8fe4326d11a0dca5dfbe8fd68510fb6c6349163bed3c15f2b2"},
+    {file = "virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4"},
+    {file = "virtualenv-20.27.1.tar.gz", hash = "sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index e2f6df54..cafdc609 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "admin_backend"
-version = "2.17.3"
+version = "2.17.4"
 description = ""
 authors = ["CPR-dev-team "]
 packages = [{ include = "app" }, { include = "tests" }]
diff --git a/tests/integration_tests/ingest/test_ingest.py b/tests/integration_tests/ingest/test_ingest.py
index 43e90653..553e5009 100644
--- a/tests/integration_tests/ingest/test_ingest.py
+++ b/tests/integration_tests/ingest/test_ingest.py
@@ -238,6 +238,67 @@ def test_ingest_idempotency(
     )
 
 
+@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
+def test_generates_unique_slugs_for_documents_with_identical_titles(
+    caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
+):
+    """
+    This test ensures that, given multiple documents with the same title, a unique
+    slug is generated for each, so that all of the documents can be saved to the DB
+    at the end of bulk import. With the current length of the suffix added to each
+    slug to ensure uniqueness (6), the likelihood of a collision is extremely low.
+    """
+    family_import_id = "test.new.family.0"
+    test_data = {
+        "collections": [],
+        "families": [
+            {
+                "import_id": family_import_id,
+                "title": "Test",
+                "summary": "Test",
+                "geographies": ["South Asia"],
+                "category": "UNFCCC",
+                "metadata": {"author_type": ["Non-Party"], "author": ["Test"]},
+                "collections": [],
+            }
+        ],
+        "documents": [
+            {
+                "import_id": f"test.new.document.{i}",
+                "family_import_id": family_import_id,
+                "metadata": {"role": ["MAIN"], "type": ["Law"]},
+                "variant_name": "Original Language",
+                "title": "Project Document",
+                "user_language_name": "",
+            }
+            for i in range(1000)
+        ],
+        "events": [],
+    }
+    test_json = json.dumps(test_data).encode("utf-8")
+    test_data_file = io.BytesIO(test_json)
+
+    with caplog.at_level(logging.ERROR):
+        first_response = client.post(
+            "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
+            files={"new_data": test_data_file},
+            headers=user_header_token,
+        )
+
+    assert first_response.status_code == status.HTTP_202_ACCEPTED
+    assert first_response.json() == {
+        "message": "Bulk import request accepted. Check Cloudwatch logs for result."
+    }
+
+    assert (
+        "Created"
+        == data_db.query(FamilyDocument)
+        .filter(FamilyDocument.import_id == "test.new.document.999")
+        .one_or_none()
+        .document_status
+    )
+
+
 @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
 def test_ingest_when_corpus_import_id_invalid(
     caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
diff --git a/tests/mocks/repos/document_repo.py b/tests/mocks/repos/document_repo.py
index e80ab9eb..272b1490 100644
--- a/tests/mocks/repos/document_repo.py
+++ b/tests/mocks/repos/document_repo.py
@@ -55,7 +55,7 @@ def mock_update(_, import_id: str, data: DocumentReadDTO) -> DocumentReadDTO:
             raise exc.NoResultFound()
         return create_document_read_dto("a.b.c.d")
 
-    def mock_create(_, data: DocumentCreateDTO) -> str:
+    def mock_create(_, data: DocumentCreateDTO, slug: Optional[str] = "") -> str:
         maybe_throw()
         if document_repo.return_empty:
             raise exc.NoResultFound()
diff --git a/tests/unit_tests/repository/test_helpers.py b/tests/unit_tests/repository/test_helpers.py
new file mode 100644
index 00000000..a775664d
--- /dev/null
+++ b/tests/unit_tests/repository/test_helpers.py
@@ -0,0 +1,48 @@
+import uuid
+from unittest.mock import patch
+
+import pytest
+
+from app.repository.helpers import generate_unique_slug
+
+
+def test_successfully_generates_a_slug_with_a_six_character_suffix():
+    title = "Test title"
+    generated_slug = generate_unique_slug(set(), title)
+
+    expected_slugified_title = "test-title_"
+    expected_suffix_length = 6
+
+    assert expected_slugified_title in generated_slug
+    assert len(expected_slugified_title) + expected_suffix_length == len(generated_slug)
+
+
+def test_successfully_generates_a_unique_slug_if_one_already_exists():
+    title = "Test title"
+
+    existing_slug = generate_unique_slug(set(), title)
+    existing_suffix = existing_slug.split("_")[1]
+
+    with patch(
+        "app.repository.helpers.uuid4",
+        side_effect=[existing_suffix, uuid.uuid4()],
+    ):
+        generated_slug = generate_unique_slug({existing_slug}, title)
+
+    assert existing_slug != generated_slug
+
+
+def test_raises_error_if_a_unique_slug_cannot_be_created_in_a_specified_number_of_attempts():
+    title = "Test title"
+
+    existing_slug = generate_unique_slug(set(), title)
+    existing_suffix = existing_slug.split("_")[1]
+
+    with (
+        patch(
+            "app.repository.helpers.uuid4",
+            return_value=existing_suffix,
+        ),
+        pytest.raises(RuntimeError),
+    ):
+        generate_unique_slug({existing_slug}, title, 2)
diff --git a/tests/unit_tests/service/ingest/test_ingest_service.py b/tests/unit_tests/service/ingest/test_ingest_service.py
index 7460c262..81245286 100644
--- a/tests/unit_tests/service/ingest/test_ingest_service.py
+++ b/tests/unit_tests/service/ingest/test_ingest_service.py
@@ -203,6 +203,7 @@ def test_save_documents_when_data_invalid(validation_service_mock):
     assert "Error" == e.value.message
 
 
+@patch("app.service.ingest.generate_slug", Mock(return_value="test-slug_1234"))
 @patch("app.service.ingest._exists_in_db", Mock(return_value=False))
 def test_do_not_save_documents_over_ingest_limit(
     validation_service_mock, document_repo_mock, monkeypatch
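Note on how the pieces fit together: save_documents() accumulates each new slug in document_slugs, and generate_slug() folds that set into its uniqueness check, so documents with duplicate titles inside one uncommitted import still receive distinct slugs. A minimal self-contained illustration using generate_unique_slug directly (no DB session required; the printed suffixes are illustrative):

    from app.repository.helpers import generate_unique_slug

    created_slugs: set[str] = set()  # plays the role of document_slugs in save_documents()
    for _ in range(3):
        # generate_unique_slug() both checks the set and adds the slug it returns,
        # so a repeated title keeps producing fresh suffixes.
        print(generate_unique_slug(created_slugs, "Project Document"))

    # e.g. project-document_3f9a1c / project-document_0b72de / project-document_c41e88
    assert len(created_slugs) == 3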