From 91cca7557fd57cc8fe4e507bb83d532b23f71fe3 Mon Sep 17 00:00:00 2001
From: annaCPR
Date: Thu, 14 Nov 2024 16:21:05 +0000
Subject: [PATCH] Comment s3 backup logic back in + test refactor (#246)

* Comment s3 backup logic back in + test refactor
* Bump patch version
* Bump patch version again
* Move validation functions to the validation service
---
 app/api/api_v1/routers/ingest.py | 2 +-
 app/service/ingest.py | 119 +-------------
 app/service/validation.py | 113 ++++++++++++-
 pyproject.toml | 2 +-
 tests/mocks/repos/corpus_repo.py | 11 +-
 .../routers/ingest/test_bulk_ingest.py | 2 +-
 .../service/ingest/test_ingest_service.py | 154 ++++++------------
 7 files changed, 179 insertions(+), 224 deletions(-)

diff --git a/app/api/api_v1/routers/ingest.py b/app/api/api_v1/routers/ingest.py
index f6ae7106..b39829ae 100644
--- a/app/api/api_v1/routers/ingest.py
+++ b/app/api/api_v1/routers/ingest.py
@@ -18,8 +18,8 @@
     get_event_template,
     get_family_template,
     import_data,
-    validate_ingest_data,
 )
+from app.service.validation import validate_ingest_data
 
 ingest_router = r = APIRouter()
 
diff --git a/app/service/ingest.py b/app/service/ingest.py
index ff3ce4bc..84288ca2 100644
--- a/app/service/ingest.py
+++ b/app/service/ingest.py
@@ -6,14 +6,13 @@
 """
 
 import logging
-from enum import Enum
 from typing import Any, Optional, Type, TypeVar
+from uuid import uuid4
 
 from db_client.models.dfce.collection import Collection
 from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent
 from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
 from db_client.models.organisation.counters import CountedEntity
-from fastapi import HTTPException, status
 from pydantic import ConfigDict, validate_call
 from sqlalchemy.ext.declarative import DeclarativeMeta
 from sqlalchemy.orm import Session
@@ -28,6 +27,7 @@
 import app.service.notification as notification_service
 import app.service.taxonomy as taxonomy
 import app.service.validation as validation
+from app.clients.aws.s3bucket import upload_ingest_json_to_s3
 from app.errors import ValidationError
 from app.model.ingest import (
     IngestCollectionDTO,
@@ -46,15 +46,6 @@
 _LOGGER.setLevel(logging.DEBUG)
 
 
-class IngestEntityList(str, Enum):
-    """Name of the list of entities that can be ingested."""
-
-    Collections = "collections"
-    Families = "families"
-    Documents = "documents"
-    Events = "events"
-
-
 class BaseModel(DeclarativeMeta):
     import_id: str
 
@@ -332,8 +323,8 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
     )
     end_message = ""
 
-    # ingest_uuid = uuid4()
-    # upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
+    ingest_uuid = uuid4()
+    upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
 
     _LOGGER.info("Getting DB session")
 
@@ -362,7 +353,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
             _LOGGER.info("Saving events")
             result["events"] = save_events(event_data, corpus_import_id, db)
 
-            # upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)
+            upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)
 
             end_message = (
                 f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
@@ -376,103 +367,3 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
         end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
     finally:
         notification_service.send_notification(end_message)
-
-
-def _collect_import_ids(
-    entity_list_name: IngestEntityList,
-    data: dict[str, Any],
-    import_id_type_name: Optional[str] = None,
-) -> list[str]:
-    """
-    Extracts a list of import_ids (or family_import_ids if specified) for the specified entity list in data.
-
-    :param IngestEntityList entity_list_name: The name of the entity list from which the import_ids are to be extracted.
-    :param dict[str, Any] data: The data structure containing the entity lists used for extraction.
-    :param Optional[str] import_id_type_name: the name of the type of import_id to be extracted or None.
-    :return list[str]: A list of extracted import_ids for the specified entity list.
-    """
-    import_id_key = import_id_type_name or "import_id"
-    import_ids = []
-    if entity_list_name.value in data:
-        for entity in data[entity_list_name.value]:
-            import_ids.append(entity[import_id_key])
-    return import_ids
-
-
-def _match_import_ids(
-    parent_references: list[str], parent_import_ids: set[str]
-) -> None:
-    """
-    Validates that all the references to parent entities exist in the set of parent import_ids passed in
-
-    :param list[str] parent_references: List of import_ids referencing parent entities to be validated.
-    :param set[str] parent_import_ids: Set of parent import_ids to validate against.
-    :raises ValidationError: raised if a parent reference is not found in the parent_import_ids.
-    """
-    for id in parent_references:
-        if id not in parent_import_ids:
-            raise ValidationError(f"No entity with id {id} found")
-
-
-def _validate_collections_exist_for_families(data: dict[str, Any]) -> None:
-    """
-    Validates that collections the families are linked to exist based on import_id links in data.
-
-    :param dict[str, Any] data: The data object containing entities to be validated.
-    """
-    collections = _collect_import_ids(IngestEntityList.Collections, data)
-    collections_set = set(collections)
-
-    family_collection_import_ids = []
-    if "families" in data:
-        for fam in data["families"]:
-            family_collection_import_ids.extend(fam["collections"])
-
-    _match_import_ids(family_collection_import_ids, collections_set)
-
-
-def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> None:
-    """
-    Validates that families the documents and events are linked to exist
-    based on import_id links in data.
-
-    :param dict[str, Any] data: The data object containing entities to be validated.
-    """
-    families = _collect_import_ids(IngestEntityList.Families, data)
-    families_set = set(families)
-
-    document_family_import_ids = _collect_import_ids(
-        IngestEntityList.Documents, data, "family_import_id"
-    )
-    event_family_import_ids = _collect_import_ids(
-        IngestEntityList.Events, data, "family_import_id"
-    )
-
-    _match_import_ids(document_family_import_ids, families_set)
-    _match_import_ids(event_family_import_ids, families_set)
-
-
-def validate_entity_relationships(data: dict[str, Any]) -> None:
-    """
-    Validates relationships between entities contained in data.
-    For documents, it validates that the family the document is linked to exists.
-
-    :param dict[str, Any] data: The data object containing entities to be validated.
-    """
-
-    _validate_collections_exist_for_families(data)
-    _validate_families_exist_for_events_and_documents(data)
-
-
-def validate_ingest_data(data: dict[str, Any]) -> None:
-    """
-    Validates data to be ingested.
-
-    :param dict[str, Any] data: The data object to be validated.
-    :raises HTTPException: raised if data is empty or None.
- """ - - if not data: - raise HTTPException(status_code=status.HTTP_204_NO_CONTENT) - - validate_entity_relationships(data) diff --git a/app/service/validation.py b/app/service/validation.py index c0ed4324..88e67668 100644 --- a/app/service/validation.py +++ b/app/service/validation.py @@ -1,6 +1,8 @@ -from typing import Any +from enum import Enum +from typing import Any, Optional from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys +from fastapi import HTTPException, status import app.clients.db.session as db_session import app.service.category as category @@ -12,6 +14,15 @@ from app.service.event import create_event_metadata_object +class IngestEntityList(str, Enum): + """Name of the list of entities that can be ingested.""" + + Collections = "collections" + Families = "families" + Documents = "documents" + Events = "events" + + def validate_collection(collection: dict[str, Any]) -> None: """ Validates a collection. @@ -129,3 +140,103 @@ def validate_events(events: list[dict[str, Any]], corpus_import_id: str) -> None """ for ev in events: validate_event(ev, corpus_import_id) + + +def _collect_import_ids( + entity_list_name: IngestEntityList, + data: dict[str, Any], + import_id_type_name: Optional[str] = None, +) -> list[str]: + """ + Extracts a list of import_ids (or family_import_ids if specified) for the specified entity list in data. + + :param IngestEntityList entity_list_name: The name of the entity list from which the import_ids are to be extracted. + :param dict[str, Any] data: The data structure containing the entity lists used for extraction. + :param Optional[str] import_id_type_name: the name of the type of import_id to be extracted or None. + :return list[str]: A list of extracted import_ids for the specified entity list. + """ + import_id_key = import_id_type_name or "import_id" + import_ids = [] + if entity_list_name.value in data: + for entity in data[entity_list_name.value]: + import_ids.append(entity[import_id_key]) + return import_ids + + +def _match_import_ids( + parent_references: list[str], parent_import_ids: set[str] +) -> None: + """ + Validates that all the references to parent entities exist in the set of parent import_ids passed in + + :param list[str] parent_references: List of import_ids referencing parent entities to be validated. + :param set[str] parent_import_ids: Set of parent import_ids to validate against. + :raises ValidationError: raised if a parent reference is not found in the parent_import_ids. + """ + for id in parent_references: + if id not in parent_import_ids: + raise ValidationError(f"No entity with id {id} found") + + +def _validate_collections_exist_for_families(data: dict[str, Any]) -> None: + """ + Validates that collections the families are linked to exist based on import_id links in data. + + :param dict[str, Any] data: The data object containing entities to be validated. + """ + collections = _collect_import_ids(IngestEntityList.Collections, data) + collections_set = set(collections) + + family_collection_import_ids = [] + if "families" in data: + for fam in data["families"]: + family_collection_import_ids.extend(fam["collections"]) + + _match_import_ids(family_collection_import_ids, collections_set) + + +def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> None: + """ + Validates that families the documents and events are linked to exist + based on import_id links in data. + + :param dict[str, Any] data: The data object containing entities to be validated. 
+ """ + families = _collect_import_ids(IngestEntityList.Families, data) + families_set = set(families) + + document_family_import_ids = _collect_import_ids( + IngestEntityList.Documents, data, "family_import_id" + ) + event_family_import_ids = _collect_import_ids( + IngestEntityList.Events, data, "family_import_id" + ) + + _match_import_ids(document_family_import_ids, families_set) + _match_import_ids(event_family_import_ids, families_set) + + +def validate_entity_relationships(data: dict[str, Any]) -> None: + """ + Validates relationships between entities contained in data. + For documents, it validates that the family the document is linked to exists. + + :param dict[str, Any] data: The data object containing entities to be validated. + """ + + _validate_collections_exist_for_families(data) + _validate_families_exist_for_events_and_documents(data) + + +def validate_ingest_data(data: dict[str, Any]) -> None: + """ + Validates data to be ingested. + + :param dict[str, Any] data: The data object to be validated. + :raises HTTPException: raised if data is empty or None. + """ + + if not data: + raise HTTPException(status_code=status.HTTP_204_NO_CONTENT) + + validate_entity_relationships(data) diff --git a/pyproject.toml b/pyproject.toml index 618e1324..0ea054ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "admin_backend" -version = "2.17.9" +version = "2.17.10" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] diff --git a/tests/mocks/repos/corpus_repo.py b/tests/mocks/repos/corpus_repo.py index 4eea624d..b64f242d 100644 --- a/tests/mocks/repos/corpus_repo.py +++ b/tests/mocks/repos/corpus_repo.py @@ -2,14 +2,19 @@ from pytest import MonkeyPatch +from app.errors import ValidationError + def mock_corpus_repo(corpus_repo, monkeypatch: MonkeyPatch, mocker): corpus_repo.error = False corpus_repo.valid = True - def mock_get_corpus_org_id(_, __) -> Optional[int]: - if not corpus_repo.error: - return 1 + def mock_get_corpus_org_id(_, corpus_org_id) -> Optional[int]: + if corpus_repo.error: + raise ValidationError( + f"No organisation associated with corpus {corpus_org_id}" + ) + return 1 def mock_verify_corpus_exists(_, __) -> bool: return corpus_repo.valid diff --git a/tests/unit_tests/routers/ingest/test_bulk_ingest.py b/tests/unit_tests/routers/ingest/test_bulk_ingest.py index e36d04f9..b2b700c8 100644 --- a/tests/unit_tests/routers/ingest/test_bulk_ingest.py +++ b/tests/unit_tests/routers/ingest/test_bulk_ingest.py @@ -14,7 +14,7 @@ from fastapi.testclient import TestClient from app.errors import ValidationError -from app.service.ingest import validate_entity_relationships +from app.service.validation import validate_entity_relationships def test_ingest_when_not_authenticated(client: TestClient): diff --git a/tests/unit_tests/service/ingest/test_ingest_service.py b/tests/unit_tests/service/ingest/test_ingest_service.py index 81245286..2eb1407c 100644 --- a/tests/unit_tests/service/ingest/test_ingest_service.py +++ b/tests/unit_tests/service/ingest/test_ingest_service.py @@ -9,120 +9,72 @@ from app.errors import ValidationError -@pytest.mark.skip("This code tests S3 and needs reinstating when that work is done.") -@patch("app.service.ingest._exists_in_db", Mock(return_value=False)) +@patch("app.service.ingest.uuid4", Mock(return_value="1111-1111")) @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) -def test_ingest_when_ok( - basic_s3_client, - corpus_repo_mock, - geography_repo_mock, - 
-    family_repo_mock,
-    document_repo_mock,
-    event_repo_mock,
-    validation_service_mock,
+@patch("app.service.ingest._exists_in_db", Mock(return_value=False))
+def test_input_json_and_result_saved_to_s3_on_bulk_import(
+    basic_s3_client, validation_service_mock, corpus_repo_mock, collection_repo_mock
 ):
     bucket_name = "test_bucket"
-    test_data = {
+    json_data = {
         "collections": [
             {
                 "import_id": "test.new.collection.0",
                 "title": "Test title",
                 "description": "Test description",
-            },
-        ],
-        "families": [
-            {
-                "import_id": "test.new.family.0",
-                "title": "Test",
-                "summary": "Test",
-                "geographies": ["Test"],
-                "category": "UNFCCC",
-                "metadata": {"color": ["blue"], "size": [""]},
-                "collections": ["test.new.collection.0"],
-            },
-        ],
-        "documents": [
-            {
-                "import_id": "test.new.document.0",
-                "family_import_id": "test.new.family.0",
-                "variant_name": "Original Language",
-                "metadata": {"color": ["blue"]},
-                "title": "",
-                "source_url": None,
-                "user_language_name": "",
-            }
-        ],
-        "events": [
-            {
-                "import_id": "test.new.event.0",
-                "family_import_id": "test.new.family.0",
-                "event_title": "Test",
-                "date": "2000-01-01T00:00:00.000Z",
-                "event_type_value": "Amended",
             }
-        ],
-    }
-
-    expected_ingest_result = {
-        "collections": ["test.new.collection.0"],
-        "families": ["test.new.family.0"],
-        "documents": ["test.new.document.0"],
-        "events": ["test.new.event.0"],
+        ]
     }
 
-    try:
-        with (
-            patch(
-                "app.service.ingest.notification_service.send_notification"
-            ) as mock_notification_service,
-        ):
-            ingest_service.import_data(test_data, "test_corpus_id")
-
-            response = basic_s3_client.list_objects_v2(
-                Bucket=bucket_name, Prefix="1111-1111-result-test_corpus_id"
-            )
-
-            assert 2 == mock_notification_service.call_count
-            mock_notification_service.assert_called_with(
-                "🎉 Bulk import for corpus: test_corpus_id successfully completed."
-            )
+    ingest_service.import_data(json_data, "test_corpus_id")
 
-            objects = response["Contents"]
-            assert len(objects) == 1
+    bulk_import_input_json = basic_s3_client.list_objects_v2(
+        Bucket=bucket_name, Prefix="1111-1111-result-test_corpus_id"
+    )
+    objects = bulk_import_input_json["Contents"]
+    assert len(objects) == 1
 
-            key = objects[0]["Key"]
-            response = basic_s3_client.get_object(Bucket=bucket_name, Key=key)
-            body = response["Body"].read().decode("utf-8")
-            assert expected_ingest_result == json.loads(body)
-    except Exception as e:
-        assert False, f"import_data in ingest service raised an exception: {e}"
+    key = objects[0]["Key"]
+    bulk_import_result = basic_s3_client.get_object(Bucket=bucket_name, Key=key)
+    body = bulk_import_result["Body"].read().decode("utf-8")
+    assert {"collections": ["test.new.collection.0"]} == json.loads(body)
 
 
+@patch("app.service.ingest._exists_in_db", Mock(return_value=False))
 @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
-def test_import_data_when_data_invalid(caplog, basic_s3_client):
+def test_slack_notification_sent_on_success(
+    basic_s3_client,
+    corpus_repo_mock,
+    collection_repo_mock,
+    validation_service_mock,
+):
     test_data = {
         "collections": [
             {
-                "import_id": "invalid",
+                "import_id": "test.new.collection.0",
                 "title": "Test title",
                 "description": "Test description",
             }
-        ]
+        ],
     }
 
-    with caplog.at_level(logging.ERROR):
-        ingest_service.import_data(test_data, "test")
+    with (
+        patch(
+            "app.service.ingest.notification_service.send_notification"
+        ) as mock_notification_service,
+    ):
+        ingest_service.import_data(test_data, "test_corpus_id")
 
-    assert "The import id invalid is invalid!" in caplog.text
+    assert 2 == mock_notification_service.call_count
+    mock_notification_service.assert_called_with(
+        "🎉 Bulk import for corpus: test_corpus_id successfully completed."
+    )
 
 
 @patch("app.service.ingest._exists_in_db", Mock(return_value=False))
 @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
-def test_ingest_when_db_error(
-    caplog, basic_s3_client, corpus_repo_mock, collection_repo_mock
-):
-    collection_repo_mock.throw_repository_error = True
+def test_slack_notification_sent_on_error(caplog, basic_s3_client, corpus_repo_mock):
+    corpus_repo_mock.error = True
 
     test_data = {
         "collections": [
@@ -147,31 +99,27 @@
         "💥 Bulk import for corpus: test has failed."
     )
     assert (
-        "Rolling back transaction due to the following error: bad collection repo"
+        "Rolling back transaction due to the following error: No organisation associated with corpus test"
         in caplog.text
     )
 
 
-@pytest.mark.skip("This code tests S3 and needs reinstating when that work is done.")
 @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"})
-def test_request_json_saved_to_s3_on_ingest(basic_s3_client):
-    bucket_name = "test_bucket"
-    json_data = {"key": "value"}
-
-    ingest_service.import_data({"key": "value"}, "test_corpus_id")
-
-    response = basic_s3_client.list_objects_v2(
-        Bucket=bucket_name, Prefix="1111-1111-request-test_corpus_id"
-    )
-
-    assert "Contents" in response
-    objects = response["Contents"]
-    assert len(objects) == 1
-
-    key = objects[0]["Key"]
-    response = basic_s3_client.get_object(Bucket=bucket_name, Key=key)
-    body = response["Body"].read().decode("utf-8")
-    assert json.loads(body) == json_data
+def test_import_data_when_data_invalid(caplog, basic_s3_client):
+    test_data = {
+        "collections": [
+            {
+                "import_id": "invalid",
+                "title": "Test title",
+                "description": "Test description",
+            }
+        ]
+    }
 
     with caplog.at_level(logging.ERROR):
         ingest_service.import_data(test_data, "test")
 
     assert "The import id invalid is invalid!" in caplog.text
 
 
 def test_save_families_when_corpus_invalid(corpus_repo_mock, validation_service_mock):
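
The re-enabled backup path calls `upload_ingest_json_to_s3` from `app/clients/aws/s3bucket.py`, whose implementation is not part of this diff. Below is a minimal sketch of what the helper plausibly does, assuming boto3 and the `BULK_IMPORT_BUCKET` environment variable that the tests patch; the key shape is an assumption inferred from the prefixes the tests list objects under (e.g. `1111-1111-result-test_corpus_id`):

```python
import json
import os

import boto3


def upload_ingest_json_to_s3(prefix: str, corpus_import_id: str, data: dict) -> None:
    # Sketch only: the real helper lives in app/clients/aws/s3bucket.py and is
    # not shown in this diff. The "{prefix}-{corpus_import_id}.json" key shape
    # is an assumption inferred from the test prefixes.
    s3 = boto3.client("s3")
    s3.put_object(
        Bucket=os.environ["BULK_IMPORT_BUCKET"],
        Key=f"{prefix}-{corpus_import_id}.json",
        Body=json.dumps(data).encode("utf-8"),
        ContentType="application/json",
    )
```

`import_data` calls it twice per run: once with `f"{ingest_uuid}-request"` before any database work begins, and once with `f"{ingest_uuid}-result"` after all four save steps succeed, so the incoming payload is captured even when the transaction is later rolled back.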
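`validate_entity_relationships`, now housed in `app/service/validation.py`, only inspects reference keys: families point at collections through `collections`, while documents and events point at families through `family_import_id`. A short usage sketch against the moved function; the minimal dicts are illustrative, and real payloads carry the full fields shown in the tests:

```python
from app.errors import ValidationError
from app.service.validation import validate_entity_relationships

# A family referencing a collection that is absent from data["collections"].
data = {
    "collections": [{"import_id": "test.new.collection.0"}],
    "families": [
        {
            "import_id": "test.new.family.0",
            "collections": ["test.missing.collection.0"],
        }
    ],
}

try:
    validate_entity_relationships(data)
except ValidationError as err:
    print(err)  # No entity with id test.missing.collection.0 found
```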