Skip to content

Commit

Permalink
Fix ingest route permissions (#244)
Browse files Browse the repository at this point in the history
* Update CorpusData to handle entity specific taxonomies

* Add corpora get, all, search and update endpoints

* Add searching within corpus_text

* Fix corpus update

* Remove rogue debug

* Add search tests for /corpora

* Add get tests for /corpora

* Add all tests for /corpora

* Remove debug

* Add corpora info to setup_db

* Change function signatures from doc to corpus

* Linting fixes

* Bump to 2.17.8

* Use validate instead of verify

* Fix repo function name

* Remove unused code

* Bump ruff

* Remove created and last modified for now

* Add corpus auth

* Ingest auth access should be admin not user

* Make test tokens non admin

* Driveby fix tests for ingest

* Make user accounts match test tokens

* Move ingest business logic out of router

* Add todos for timestamps

* Revert "Ingest auth access should be admin not user"

This reverts commit 347a6eb.

* Add tests for checking user auth

* Update article based on next word starting vowel

* Update test_ingest.py

* Fix import path

* Bump to 2.17.9

* Make ingest route for superusers only

* Update article

* Update ingest routes to be super not admin
  • Loading branch information
katybaulch authored Nov 14, 2024
1 parent 86cc83b commit 7ebeec7
Show file tree
Hide file tree
Showing 12 changed files with 375 additions and 249 deletions.
242 changes: 27 additions & 215 deletions app/api/api_v1/routers/ingest.py
Original file line number Diff line number Diff line change
@@ -1,115 +1,31 @@
import json
import logging
from enum import Enum
from typing import Any, Optional

from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
from db_client.models.organisation.counters import CountedEntity
from fastapi import APIRouter, BackgroundTasks, HTTPException, UploadFile, status
from fastapi import (
APIRouter,
BackgroundTasks,
HTTPException,
Request,
UploadFile,
status,
)

import app.service.taxonomy as taxonomy
from app.errors import ValidationError
from app.model.general import Json
from app.model.ingest import (
IngestCollectionDTO,
IngestDocumentDTO,
IngestEventDTO,
IngestFamilyDTO,
from app.service.ingest import (
get_collection_template,
get_document_template,
get_event_template,
get_family_template,
import_data,
validate_ingest_data,
)
from app.service.ingest import import_data

ingest_router = r = APIRouter()

_LOGGER = logging.getLogger(__name__)


def _get_collection_template() -> dict:
    """
    Build the collection template from the collection DTO schema.

    :return dict: The "properties" section of the serialised
        IngestCollectionDTO JSON schema.
    """
    schema = IngestCollectionDTO.model_json_schema(mode="serialization")
    return schema["properties"]


def _get_event_template(corpus_type: str) -> dict:
    """
    Build the event template for a given corpus type.

    :param str corpus_type: Corpus type whose taxonomy supplies the event
        metadata.
    :raises ValidationError: If the taxonomy has no "event_type" entry.
    :return dict: The event template.
    """
    template = IngestEventDTO.model_json_schema(mode="serialization")["properties"]
    metadata = _get_metadata_template(corpus_type, CountedEntity.Event)

    # TODO: Replace with template["metadata"] in PDCT-1622
    if "event_type" not in metadata:
        raise ValidationError("Bad taxonomy in database")
    template["event_type_value"] = metadata["event_type"]

    return template


def _get_document_template(corpus_type: str) -> dict:
    """
    Build the document template for a given corpus type.

    :param str corpus_type: The corpus type used to look up document metadata.
    :return dict: The document template.
    """
    template = IngestDocumentDTO.model_json_schema(mode="serialization")[
        "properties"
    ]
    template["metadata"] = _get_metadata_template(corpus_type, CountedEntity.Document)
    return template


def _get_metadata_template(corpus_type: str, metadata_type: CountedEntity) -> dict:
    """
    Look up the metadata template for a corpus type and entity kind.

    :param str corpus_type: The corpus type whose taxonomy is fetched.
    :param CountedEntity metadata_type: Which entity's metadata to return.
    :return dict: The metadata template, or {} when the corpus type has no
        taxonomy.
    """
    taxonomy_data = taxonomy.get(corpus_type)
    if not taxonomy_data:
        return {}

    if metadata_type == CountedEntity.Document:
        return taxonomy_data.pop(EntitySpecificTaxonomyKeys.DOCUMENT.value)
    if metadata_type == CountedEntity.Event:
        return taxonomy_data.pop(EntitySpecificTaxonomyKeys.EVENT.value)
    if metadata_type == CountedEntity.Family:
        # Family metadata is whatever remains once the entity-specific
        # keys have been stripped out.
        taxonomy_data.pop(EntitySpecificTaxonomyKeys.DOCUMENT.value)
        taxonomy_data.pop(EntitySpecificTaxonomyKeys.EVENT.value)
        taxonomy_data.pop("event_type")  # TODO: Remove as part of PDCT-1622
        return taxonomy_data
    # NOTE(review): any other CountedEntity falls through and returns None,
    # despite the -> dict annotation — presumably callers never pass one.


def _get_family_template(corpus_type: str) -> dict:
    """
    Build the family template for a given corpus type.

    :param str corpus_type: The corpus type used to look up family metadata.
    :return dict: The family template.
    """
    template = IngestFamilyDTO.model_json_schema(mode="serialization")["properties"]

    # corpus_import_id is excluded — presumably it is supplied via the
    # ingest URL rather than the uploaded template; confirm against caller.
    del template["corpus_import_id"]

    template["metadata"] = _get_metadata_template(corpus_type, CountedEntity.Family)
    return template


@r.get(
"/ingest/template/{corpus_type}",
response_model=Json,
Expand All @@ -127,145 +43,41 @@ async def get_ingest_template(corpus_type: str) -> Json:

try:
return {
"collections": [_get_collection_template()],
"families": [_get_family_template(corpus_type)],
"documents": [_get_document_template(corpus_type)],
"events": [_get_event_template(corpus_type)],
"collections": [get_collection_template()],
"families": [get_family_template(corpus_type)],
"documents": [get_document_template(corpus_type)],
"events": [get_event_template(corpus_type)],
}
except ValidationError as e:
_LOGGER.error(e)
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=e.message)


class IngestEntityList(str, Enum):
    """Name of the list of entities that can be ingested."""

    # Values match the top-level keys of the uploaded ingest JSON payload.
    Collections = "collections"
    Families = "families"
    Documents = "documents"
    Events = "events"


def _collect_import_ids(
    entity_list_name: IngestEntityList,
    data: dict[str, Any],
    import_id_type_name: Optional[str] = None,
) -> list[str]:
    """
    Extract the import_ids for one entity list in data.

    :param IngestEntityList entity_list_name: The entity list to read from.
    :param dict[str, Any] data: The payload containing the entity lists.
    :param Optional[str] import_id_type_name: Alternative key holding the
        import_id (e.g. "family_import_id"), or None to use "import_id".
    :return list[str]: The import_ids found, or [] when the list is absent.
    """
    key = import_id_type_name or "import_id"
    entities = data.get(entity_list_name.value, [])
    return [entity[key] for entity in entities]


def _match_import_ids(
parent_references: list[str], parent_import_ids: set[str]
) -> None:
"""
Validates that all the references to parent entities exist in the set of parent import_ids passed in
:param list[str] parent_references: List of import_ids referencing parent entities to be validated.
:param set[str] parent_import_ids: Set of parent import_ids to validate against.
:raises ValidationError: raised if a parent reference is not found in the parent_import_ids.
"""
for id in parent_references:
if id not in parent_import_ids:
raise ValidationError(f"No entity with id {id} found")


def _validate_collections_exist_for_families(data: dict[str, Any]) -> None:
    """
    Check that every collection referenced by a family exists in data.

    :param dict[str, Any] data: The payload containing the entity lists.
    :raises ValidationError: If a family references an unknown collection.
    """
    known_collections = set(_collect_import_ids(IngestEntityList.Collections, data))

    referenced: list[str] = []
    for family in data.get("families", []):
        referenced.extend(family["collections"])

    _match_import_ids(referenced, known_collections)


def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> None:
    """
    Check that every family referenced by a document or event exists in data.

    :param dict[str, Any] data: The payload containing the entity lists.
    :raises ValidationError: If a document or event references an unknown
        family. Document references are checked before event references.
    """
    known_families = set(_collect_import_ids(IngestEntityList.Families, data))

    for entity_list in (IngestEntityList.Documents, IngestEntityList.Events):
        references = _collect_import_ids(entity_list, data, "family_import_id")
        _match_import_ids(references, known_families)


def validate_entity_relationships(data: dict[str, Any]) -> None:
    """
    Validates relationships between entities contained in data.

    Checks that families only reference collections present in data, and
    that documents and events only reference families present in data.

    :param dict[str, Any] data: The data object containing entities to be validated.
    :raises ValidationError: If any referenced parent entity is missing.
    """

    _validate_collections_exist_for_families(data)
    _validate_families_exist_for_events_and_documents(data)


def _validate_ingest_data(data: dict[str, Any]) -> None:
    """
    Validates data to be ingested.

    :param dict[str, Any] data: The data object to be validated.
    :raises HTTPException: raised if data is empty or None.
    :raises ValidationError: raised if entity relationships are invalid.
    """

    if not data:
        # NOTE(review): raising 204 (No Content) here is unusual —
        # presumably it signals an empty payload to the caller; confirm.
        raise HTTPException(status_code=status.HTTP_204_NO_CONTENT)

    validate_entity_relationships(data)


@r.post(
"/ingest/{corpus_import_id}",
response_model=Json,
status_code=status.HTTP_202_ACCEPTED,
)
async def ingest(
new_data: UploadFile, corpus_import_id: str, background_tasks: BackgroundTasks
request: Request,
new_data: UploadFile,
corpus_import_id: str,
background_tasks: BackgroundTasks,
) -> Json:
"""
Bulk import endpoint.
:param UploadFile new_data: file containing json representation of data to ingest.
:return Json: json representation of the data to ingest.
"""
_LOGGER.info(f"Received bulk import request for corpus: {corpus_import_id}")
_LOGGER.info(
f"User {request.state.user} triggered bulk import for corpus: {corpus_import_id}"
)

try:
content = await new_data.read()
data_dict = json.loads(content)
_validate_ingest_data(data_dict)
validate_ingest_data(data_dict)

background_tasks.add_task(import_data, data_dict, corpus_import_id)

Expand Down
4 changes: 2 additions & 2 deletions app/model/authorisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ class AuthEndpoint(str, enum.Enum):
},
# Ingest
AuthEndpoint.INGEST: {
AuthOperation.CREATE: AuthAccess.USER,
AuthOperation.READ: AuthAccess.USER,
AuthOperation.CREATE: AuthAccess.SUPER,
AuthOperation.READ: AuthAccess.SUPER,
},
# Corpus
AuthEndpoint.CORPUS: {
Expand Down
11 changes: 10 additions & 1 deletion app/service/authorisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,13 @@ def is_authorised(user: UserContext, entity: AuthEndpoint, op: AuthOperation) ->
if _has_access(required_access, _get_user_access(user)):
return

raise AuthorisationError(f"User {user.email} is not authorised to {op} a {entity}")
raise AuthorisationError(
f"User {user.email} is not authorised to {op} {_get_article(entity.value)} {entity}"
)


def _get_article(word: str) -> str:
vowels = ["a", "e", "i", "o", "u", "y"]
if word.lower()[0] in vowels:
return "an"
return "a"
Loading

0 comments on commit 7ebeec7

Please sign in to comment.