Skip to content

Commit

Permalink
Feature/pdct 1628 handle slugs for documents with the same title (#237)
Browse files Browse the repository at this point in the history
* Handle duplicate titles when creating slugs during bulk import

* Refactor to surface commit errors better

* Increase suffix length to 6

* Bump patch version

* Update poetry.lock file
  • Loading branch information
annaCPR authored Oct 31, 2024
1 parent 0c67391 commit 969d166
Show file tree
Hide file tree
Showing 9 changed files with 180 additions and 41 deletions.
7 changes: 5 additions & 2 deletions app/repository/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,13 +386,16 @@ def _is_language_equal(
return False


def create(db: Session, document: DocumentCreateDTO) -> str:
def create(
db: Session, document: DocumentCreateDTO, slug_name: Optional[str] = None
) -> str:
"""
Creates a new document.
:param db Session: the database connection
:param DocumentDTO document: the values for the new document
:param int org_id: a validated organisation id
:param Optional[str] slug_name: a unique document slug or None
:return str: The import id
"""
try:
Expand Down Expand Up @@ -433,7 +436,7 @@ def create(db: Session, document: DocumentCreateDTO) -> str:
db.add(
Slug(
family_document_import_id=family_doc.import_id,
name=generate_slug(db, document.title),
name=slug_name if slug_name else generate_slug(db, document.title),
)
)
except Exception as e:
Expand Down
54 changes: 38 additions & 16 deletions app/repository/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Helper functions for repos"""

from typing import Union, cast
from typing import Optional, Union, cast
from uuid import uuid4

from db_client.models.dfce.family import Slug
Expand All @@ -10,40 +10,62 @@
from sqlalchemy.orm import Session


def generate_slug(
db: Session,
title: str,
attempts: int = 100,
suffix_length: int = 4,
def generate_unique_slug(
existing_slugs: set[str], title: str, attempts: int = 100, suffix_length: int = 6
) -> str:
"""
Generates a slug that doesn't already exists.
Generates a slug that doesn't already exist.
:param Session db: The connection to the db
:param str title: The title or name of the object you wish to slugify
:param set[str] existing_slugs: A set of already existing slugs to validate against.
:param str title: The title or name of the object you wish to slugify.
:param int attempts: The number of attempts to generate a unique slug before
failing, defaults to 100
:param int suffix_length: The suffix to produce uniqueness, defaults to 4
:raises RuntimeError: If we cannot produce a unique slug
:return str: the slug generated
failing, defaults to 100.
:param int suffix_length: The suffix to produce uniqueness, defaults to 6.
:raises RuntimeError: If we cannot produce a unique slug.
:return str: The slug generated.
"""
lookup = set([cast(str, n) for n in db.query(Slug.name).all()])
base = slugify(str(title))

# TODO: try to extend suffix length if attempts are exhausted
suffix = str(uuid4())[:suffix_length]
count = 0
while (slug := f"{base}_{suffix}") in lookup:
while (slug := f"{base}_{suffix}") in existing_slugs:
count += 1
suffix = str(uuid4())[:suffix_length]
if count > attempts:
raise RuntimeError(
f"Failed to generate a slug for {base} after {attempts} attempts."
)
lookup.add(slug)
existing_slugs.add(slug)
return slug


def generate_slug(
    db: Session,
    title: str,
    attempts: int = 100,
    suffix_length: int = 6,
    created_slugs: Optional[set[str]] = None,
) -> str:
    """
    Generates a unique slug for a given title.

    Uniqueness is checked against all slugs already saved in the DB plus any
    slugs created (but not yet committed) during the current operation.

    :param Session db: The connection to the db.
    :param str title: The title or name of the object you wish to slugify.
    :param int attempts: The number of attempts to generate a unique slug before
        failing, defaults to 100.
    :param int suffix_length: The suffix length used to produce uniqueness,
        defaults to 6 to match generate_unique_slug (previously 4 here, which
        silently overrode the increased default on the bulk-import path).
    :param Optional[set[str]] created_slugs: A set of slugs created in the context
        within which this function runs that have not yet been committed to the DB.
        Defaults to None rather than set() to avoid the shared
        mutable-default-argument pitfall.
    :raises RuntimeError: If a unique slug cannot be produced.
    :return str: the slug generated
    """
    saved_slugs = {cast(str, n) for n in db.query(Slug.name).all()}

    # Union (non-mutating) so the caller's created_slugs set is not modified here;
    # generate_unique_slug records the new slug in the set it is given.
    existing_slugs = created_slugs | saved_slugs if created_slugs else saved_slugs

    return generate_unique_slug(existing_slugs, title, attempts, suffix_length)


def generate_import_id(
db: Session, entity_type: CountedEntity, org: Union[str, int]
) -> str:
Expand Down
16 changes: 10 additions & 6 deletions app/service/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
IngestEventDTO,
IngestFamilyDTO,
)
from app.repository.helpers import generate_slug

DOCUMENT_INGEST_LIMIT = 1000
_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -162,6 +163,7 @@ def save_documents(
validation.validate_documents(document_data, corpus_import_id)

document_import_ids = []
document_slugs = set()
total_documents_saved = 0

for doc in document_data:
Expand All @@ -171,7 +173,9 @@ def save_documents(
):
_LOGGER.info(f"Importing document {doc['import_id']}")
dto = IngestDocumentDTO(**doc).to_document_create_dto()
import_id = document_repository.create(db, dto)
slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs)
import_id = document_repository.create(db, dto, slug)
document_slugs.add(slug)
document_import_ids.append(import_id)
total_documents_saved += 1

Expand Down Expand Up @@ -226,6 +230,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
notification_service.send_notification(
f"🚀 Bulk import for corpus: {corpus_import_id} has started."
)
end_message = ""

# ingest_uuid = uuid4()
# upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
Expand Down Expand Up @@ -259,16 +264,15 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:

# upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)

notification_service.send_notification(
end_message = (
f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
)
db.commit()
except Exception as e:
_LOGGER.error(
f"Rolling back transaction due to the following error: {e}", exc_info=True
)
db.rollback()
notification_service.send_notification(
f"💥 Bulk import for corpus: {corpus_import_id} has failed."
)
end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
finally:
db.commit()
notification_service.send_notification(end_message)
30 changes: 15 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "admin_backend"
version = "2.17.3"
version = "2.17.4"
description = ""
authors = ["CPR-dev-team <[email protected]>"]
packages = [{ include = "app" }, { include = "tests" }]
Expand Down
61 changes: 61 additions & 0 deletions tests/integration_tests/ingest/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,67 @@ def test_ingest_idempotency(
)


@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
def test_generates_unique_slugs_for_documents_with_identical_titles(
    caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
):
    """
    Verify that multiple documents sharing one title each receive a unique slug,
    so the whole batch can be saved to the DB at the end of bulk import. With the
    current slug suffix length (6) the chance of a suffix collision is extremely
    low, so 1000 identically-titled documents should all import successfully.
    """
    family_import_id = "test.new.family.0"

    # One family plus 1000 documents that all share the same title.
    family = {
        "import_id": family_import_id,
        "title": "Test",
        "summary": "Test",
        "geographies": ["South Asia"],
        "category": "UNFCCC",
        "metadata": {"author_type": ["Non-Party"], "author": ["Test"]},
        "collections": [],
    }
    documents = [
        {
            "import_id": f"test.new.document.{i}",
            "family_import_id": family_import_id,
            "metadata": {"role": ["MAIN"], "type": ["Law"]},
            "variant_name": "Original Language",
            "title": "Project Document",
            "user_language_name": "",
        }
        for i in range(1000)
    ]
    payload = {
        "collections": [],
        "families": [family],
        "documents": documents,
        "events": [],
    }
    upload = io.BytesIO(json.dumps(payload).encode("utf-8"))

    with caplog.at_level(logging.ERROR):
        first_response = client.post(
            "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
            files={"new_data": upload},
            headers=user_header_token,
        )

    assert first_response.status_code == status.HTTP_202_ACCEPTED
    assert first_response.json() == {
        "message": "Bulk import request accepted. Check Cloudwatch logs for result."
    }

    # If every slug was unique, the very last document made it into the DB.
    last_document = (
        data_db.query(FamilyDocument)
        .filter(FamilyDocument.import_id == "test.new.document.999")
        .one_or_none()
    )
    assert last_document.document_status == "Created"


@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
def test_ingest_when_corpus_import_id_invalid(
caplog, data_db: Session, client: TestClient, user_header_token, basic_s3_client
Expand Down
2 changes: 1 addition & 1 deletion tests/mocks/repos/document_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def mock_update(_, import_id: str, data: DocumentReadDTO) -> DocumentReadDTO:
raise exc.NoResultFound()
return create_document_read_dto("a.b.c.d")

def mock_create(_, data: DocumentCreateDTO) -> str:
def mock_create(_, data: DocumentCreateDTO, slug: Optional[str] = "") -> str:
maybe_throw()
if document_repo.return_empty:
raise exc.NoResultFound()
Expand Down
48 changes: 48 additions & 0 deletions tests/unit_tests/repository/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import uuid
from unittest.mock import patch

import pytest

from app.repository.helpers import generate_unique_slug


def test_successfully_generates_a_slug_with_a_six_character_suffix():
    """
    The default suffix length is 6 (see generate_unique_slug), so the slug is
    the slugified title, an underscore, and a 6-character uuid prefix.
    (Renamed from ..._four_digit_suffix, which contradicted the asserted
    length of 6; pytest discovers tests by prefix, so nothing calls this name.)
    """
    title = "Test title"
    generated_slug = generate_unique_slug(set(), title)

    expected_slugified_title = "test-title_"
    expected_suffix_length = 6

    assert expected_slugified_title in generated_slug
    assert len(expected_slugified_title) + expected_suffix_length == len(generated_slug)


def test_successfully_generates_a_unique_slug_if_one_already_exists():
    """
    If the first generated suffix collides with an existing slug, a new suffix
    is drawn and a different slug is returned.
    """
    title = "Test title"

    existing_slug = generate_unique_slug(set(), title)
    existing_suffix = existing_slug.split("_")[1]

    # First mocked uuid4() call returns the colliding suffix, forcing a retry;
    # the second returns a real UUID value. (The original passed the uuid4
    # *function object* here, not a UUID — the test only passed because
    # str(<function ...>)[:6] happened not to collide.)
    with patch(
        "app.repository.helpers.uuid4",
        side_effect=[existing_suffix, uuid.uuid4()],
    ):
        generated_slug = generate_unique_slug({existing_slug}, title)

    assert existing_slug != generated_slug


def test_raises_error_if_a_unique_slug_cannot_be_created_in_a_specified_number_of_attempts():
    """Every retry collides, so after the attempt limit a RuntimeError is raised."""
    title = "Test title"

    taken_slug = generate_unique_slug(set(), title)
    colliding_suffix = taken_slug.split("_")[1]

    # Pin uuid4 to the colliding suffix so no retry can ever succeed.
    pinned_uuid4 = patch(
        "app.repository.helpers.uuid4",
        return_value=colliding_suffix,
    )

    with pinned_uuid4, pytest.raises(RuntimeError):
        generate_unique_slug({taken_slug}, title, 2)
Loading

0 comments on commit 969d166

Please sign in to comment.