Skip to content

Commit

Permalink
Comment s3 backup logic back in + test refactor (#246)
Browse files Browse the repository at this point in the history
* Comment s3 backup logic back in + test refactor

* Bump patch version

* Bump patch version again

* Move validation functions to the validation service
  • Loading branch information
annaCPR authored Nov 14, 2024
1 parent 7ebeec7 commit 91cca75
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 224 deletions.
2 changes: 1 addition & 1 deletion app/api/api_v1/routers/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
get_event_template,
get_family_template,
import_data,
validate_ingest_data,
)
from app.service.validation import validate_ingest_data

ingest_router = r = APIRouter()

Expand Down
119 changes: 5 additions & 114 deletions app/service/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
"""

import logging
from enum import Enum
from typing import Any, Optional, Type, TypeVar
from uuid import uuid4

from db_client.models.dfce.collection import Collection
from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent
from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
from db_client.models.organisation.counters import CountedEntity
from fastapi import HTTPException, status
from pydantic import ConfigDict, validate_call
from sqlalchemy.ext.declarative import DeclarativeMeta
from sqlalchemy.orm import Session
Expand All @@ -28,6 +27,7 @@
import app.service.notification as notification_service
import app.service.taxonomy as taxonomy
import app.service.validation as validation
from app.clients.aws.s3bucket import upload_ingest_json_to_s3
from app.errors import ValidationError
from app.model.ingest import (
IngestCollectionDTO,
Expand All @@ -46,15 +46,6 @@
_LOGGER.setLevel(logging.DEBUG)


class IngestEntityList(str, Enum):
    """
    Names of the entity lists that can appear in data to be ingested.

    Each member's value is the key under which that list of entities is
    looked up in the ingest data dictionary.
    """

    Collections = "collections"
    Families = "families"
    Documents = "documents"
    Events = "events"


class BaseModel(DeclarativeMeta):
import_id: str

Expand Down Expand Up @@ -332,8 +323,8 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
)
end_message = ""

# ingest_uuid = uuid4()
# upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)
ingest_uuid = uuid4()
upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data)

_LOGGER.info("Getting DB session")

Expand Down Expand Up @@ -362,7 +353,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
_LOGGER.info("Saving events")
result["events"] = save_events(event_data, corpus_import_id, db)

# upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)
upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result)

end_message = (
f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed."
Expand All @@ -376,103 +367,3 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None:
end_message = f"💥 Bulk import for corpus: {corpus_import_id} has failed."
finally:
notification_service.send_notification(end_message)


def _collect_import_ids(
    entity_list_name: IngestEntityList,
    data: dict[str, Any],
    import_id_type_name: Optional[str] = None,
) -> list[str]:
    """
    Gather one identifier from every entity in the named entity list of data.

    :param IngestEntityList entity_list_name: The entity list in data to read from.
    :param dict[str, Any] data: The data structure holding the entity lists.
    :param Optional[str] import_id_type_name: The key to read from each entity;
        "import_id" is used when this is None or empty.
    :return list[str]: The identifiers in list order, or an empty list when
        the entity list is not present in data.
    """
    id_key = import_id_type_name or "import_id"
    list_key = entity_list_name.value

    # A missing entity list is not an error here; there is simply nothing to collect.
    if list_key not in data:
        return []

    return [entity[id_key] for entity in data[list_key]]


def _match_import_ids(
parent_references: list[str], parent_import_ids: set[str]
) -> None:
"""
Validates that all the references to parent entities exist in the set of parent import_ids passed in
:param list[str] parent_references: List of import_ids referencing parent entities to be validated.
:param set[str] parent_import_ids: Set of parent import_ids to validate against.
:raises ValidationError: raised if a parent reference is not found in the parent_import_ids.
"""
for id in parent_references:
if id not in parent_import_ids:
raise ValidationError(f"No entity with id {id} found")


def _validate_collections_exist_for_families(data: dict[str, Any]) -> None:
    """
    Check that every collection referenced by a family is present in data.

    The import_ids listed under each family's "collections" key are matched
    against the import_ids of the collections in data.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    known_collections = set(_collect_import_ids(IngestEntityList.Collections, data))

    # Flatten the per-family collection references into a single list.
    referenced_collections = [
        collection_id
        for family in data.get("families", [])
        for collection_id in family["collections"]
    ]

    _match_import_ids(referenced_collections, known_collections)


def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> None:
    """
    Check that every family referenced by a document or an event is present in data.

    The "family_import_id" of each document and event is matched against the
    import_ids of the families in data.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    parent_key = "family_import_id"
    known_families = set(_collect_import_ids(IngestEntityList.Families, data))

    # Collect both reference lists before matching either of them, so that
    # documents are always checked ahead of events.
    document_refs = _collect_import_ids(IngestEntityList.Documents, data, parent_key)
    event_refs = _collect_import_ids(IngestEntityList.Events, data, parent_key)

    for family_refs in (document_refs, event_refs):
        _match_import_ids(family_refs, known_families)


def validate_entity_relationships(data: dict[str, Any]) -> None:
    """
    Validate the parent links between the entities contained in data.

    Families are checked against the collections they reference first, then
    documents and events are checked against the families they reference.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    relationship_checks = (
        _validate_collections_exist_for_families,
        _validate_families_exist_for_events_and_documents,
    )
    for run_check in relationship_checks:
        run_check(data)


def validate_ingest_data(data: dict[str, Any]) -> None:
    """
    Reject empty ingest data, then validate the relationships between its entities.

    :param dict[str, Any] data: The data object to be validated.
    :raises HTTPException: raised with status 204 (No Content) if data is
        empty or None.
    """
    nothing_to_ingest = not data
    if nothing_to_ingest:
        raise HTTPException(status_code=status.HTTP_204_NO_CONTENT)

    validate_entity_relationships(data)
113 changes: 112 additions & 1 deletion app/service/validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Any
from enum import Enum
from typing import Any, Optional

from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
from fastapi import HTTPException, status

import app.clients.db.session as db_session
import app.service.category as category
Expand All @@ -12,6 +14,15 @@
from app.service.event import create_event_metadata_object


class IngestEntityList(str, Enum):
    """
    Names of the entity lists that can appear in data to be ingested.

    Each member's value is the key under which that list of entities is
    looked up in the ingest data dictionary.
    """

    Collections = "collections"
    Families = "families"
    Documents = "documents"
    Events = "events"


def validate_collection(collection: dict[str, Any]) -> None:
"""
Validates a collection.
Expand Down Expand Up @@ -129,3 +140,103 @@ def validate_events(events: list[dict[str, Any]], corpus_import_id: str) -> None
"""
for ev in events:
validate_event(ev, corpus_import_id)


def _collect_import_ids(
    entity_list_name: IngestEntityList,
    data: dict[str, Any],
    import_id_type_name: Optional[str] = None,
) -> list[str]:
    """
    Gather one identifier from every entity in the named entity list of data.

    :param IngestEntityList entity_list_name: The entity list in data to read from.
    :param dict[str, Any] data: The data structure holding the entity lists.
    :param Optional[str] import_id_type_name: The key to read from each entity;
        "import_id" is used when this is None or empty.
    :return list[str]: The identifiers in list order, or an empty list when
        the entity list is not present in data.
    """
    id_key = import_id_type_name or "import_id"
    list_key = entity_list_name.value

    # A missing entity list is not an error here; there is simply nothing to collect.
    if list_key not in data:
        return []

    return [entity[id_key] for entity in data[list_key]]


def _match_import_ids(
parent_references: list[str], parent_import_ids: set[str]
) -> None:
"""
Validates that all the references to parent entities exist in the set of parent import_ids passed in
:param list[str] parent_references: List of import_ids referencing parent entities to be validated.
:param set[str] parent_import_ids: Set of parent import_ids to validate against.
:raises ValidationError: raised if a parent reference is not found in the parent_import_ids.
"""
for id in parent_references:
if id not in parent_import_ids:
raise ValidationError(f"No entity with id {id} found")


def _validate_collections_exist_for_families(data: dict[str, Any]) -> None:
    """
    Check that every collection referenced by a family is present in data.

    The import_ids listed under each family's "collections" key are matched
    against the import_ids of the collections in data.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    known_collections = set(_collect_import_ids(IngestEntityList.Collections, data))

    # Flatten the per-family collection references into a single list.
    referenced_collections = [
        collection_id
        for family in data.get("families", [])
        for collection_id in family["collections"]
    ]

    _match_import_ids(referenced_collections, known_collections)


def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> None:
    """
    Check that every family referenced by a document or an event is present in data.

    The "family_import_id" of each document and event is matched against the
    import_ids of the families in data.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    parent_key = "family_import_id"
    known_families = set(_collect_import_ids(IngestEntityList.Families, data))

    # Collect both reference lists before matching either of them, so that
    # documents are always checked ahead of events.
    document_refs = _collect_import_ids(IngestEntityList.Documents, data, parent_key)
    event_refs = _collect_import_ids(IngestEntityList.Events, data, parent_key)

    for family_refs in (document_refs, event_refs):
        _match_import_ids(family_refs, known_families)


def validate_entity_relationships(data: dict[str, Any]) -> None:
    """
    Validate the parent links between the entities contained in data.

    Families are checked against the collections they reference first, then
    documents and events are checked against the families they reference.

    :param dict[str, Any] data: The data object containing entities to be validated.
    """
    relationship_checks = (
        _validate_collections_exist_for_families,
        _validate_families_exist_for_events_and_documents,
    )
    for run_check in relationship_checks:
        run_check(data)


def validate_ingest_data(data: dict[str, Any]) -> None:
    """
    Reject empty ingest data, then validate the relationships between its entities.

    :param dict[str, Any] data: The data object to be validated.
    :raises HTTPException: raised with status 204 (No Content) if data is
        empty or None.
    """
    nothing_to_ingest = not data
    if nothing_to_ingest:
        raise HTTPException(status_code=status.HTTP_204_NO_CONTENT)

    validate_entity_relationships(data)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "admin_backend"
version = "2.17.9"
version = "2.17.10"
description = ""
authors = ["CPR-dev-team <[email protected]>"]
packages = [{ include = "app" }, { include = "tests" }]
Expand Down
11 changes: 8 additions & 3 deletions tests/mocks/repos/corpus_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@

from pytest import MonkeyPatch

from app.errors import ValidationError


def mock_corpus_repo(corpus_repo, monkeypatch: MonkeyPatch, mocker):
corpus_repo.error = False
corpus_repo.valid = True

def mock_get_corpus_org_id(_, __) -> Optional[int]:
if not corpus_repo.error:
return 1
def mock_get_corpus_org_id(_, corpus_org_id) -> Optional[int]:
if corpus_repo.error:
raise ValidationError(
f"No organisation associated with corpus {corpus_org_id}"
)
return 1

def mock_verify_corpus_exists(_, __) -> bool:
return corpus_repo.valid
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/routers/ingest/test_bulk_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from fastapi.testclient import TestClient

from app.errors import ValidationError
from app.service.ingest import validate_entity_relationships
from app.service.validation import validate_entity_relationships


def test_ingest_when_not_authenticated(client: TestClient):
Expand Down
Loading

0 comments on commit 91cca75

Please sign in to comment.