From 277e67430e3ec6c80ec07503ae997a9c0569b89f Mon Sep 17 00:00:00 2001
From: Katy Baulch <46493669+katybaulch@users.noreply.github.com>
Date: Mon, 4 Nov 2024 15:06:59 +0000
Subject: [PATCH 01/10] Make documents router dependent on app token

---
 app/api/api_v1/routers/documents.py           |  20 +-
 app/api/api_v1/routers/search.py              |   2 -
 app/repository/document.py                    |  27 +-
 .../routers/documents/setup_doc_fam_lookup.py |  24 +
 .../documents/test_admin_doc_routes.py        | 936 +++++++++++++++++
 .../documents/test_document_families.py       | 973 +-----------------
 .../documents/test_get_document_families.py   | 132 +--
 7 files changed, 1070 insertions(+), 1044 deletions(-)
 create mode 100644 tests/non_search/routers/documents/setup_doc_fam_lookup.py
 create mode 100644 tests/non_search/routers/documents/test_admin_doc_routes.py

diff --git a/app/api/api_v1/routers/documents.py b/app/api/api_v1/routers/documents.py
index 0cf92c19..1d48c278 100644
--- a/app/api/api_v1/routers/documents.py
+++ b/app/api/api_v1/routers/documents.py
@@ -1,8 +1,8 @@
 import logging
 from http.client import NOT_FOUND
-from typing import Union
+from typing import Annotated, Union
 
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, Header, HTTPException, Request
 
 from app.clients.db.session import get_db
 from app.models.document import (
@@ -14,6 +14,7 @@
     get_family_document_and_context,
     get_slugged_objects,
 )
+from app.service.custom_app import AppTokenFactory
 
 _LOGGER = logging.getLogger(__file__)
 
@@ -28,20 +29,23 @@
     ],
 )
 async def family_or_document_detail(
-    slug: str,
-    db=Depends(get_db),
+    slug: str, request: Request, app_token: Annotated[str, Header()], db=Depends(get_db)
 ):
     """Get details of the family or document associated with the slug."""
     _LOGGER.info(
         f"Getting detailed information for family or document '{slug}'",
         extra={
-            "props": {
-                "import_id_or_slug": slug,
-            },
+            "props": {"import_id_or_slug": slug, "app_token": str(app_token)},
         },
     )
 
-    family_document_import_id, family_import_id = get_slugged_objects(db, slug)
+    # Decode the app token and validate it.
+    token = AppTokenFactory()
+    token.decode_and_validate(db, request, app_token)
+
+    family_document_import_id, family_import_id = get_slugged_objects(
+        db, slug, token.allowed_corpora_ids
+    )
     if family_document_import_id is None and family_import_id is None:
         raise HTTPException(status_code=NOT_FOUND, detail=f"Nothing found for {slug}")
 
diff --git a/app/api/api_v1/routers/search.py b/app/api/api_v1/routers/search.py
index cb5ebe87..a7749c33 100644
--- a/app/api/api_v1/routers/search.py
+++ b/app/api/api_v1/routers/search.py
@@ -277,8 +277,6 @@ def download_all_search_documents(
     request: Request, app_token: Annotated[str, Header()], db=Depends(get_db)
 ) -> RedirectResponse:
     """Download a CSV containing details of all the documents in the corpus."""
-    token = AppTokenFactory()
-
     _LOGGER.info(
         "Whole data download request",
         extra={
diff --git a/app/repository/document.py b/app/repository/document.py
index 519735dd..c97d9744 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -40,20 +40,27 @@
 _LOGGER = logging.getLogger(__file__)
 
 
-def get_slugged_objects(db: Session, slug: str) -> tuple[Optional[str], Optional[str]]:
-    """
-    Matches the slug name to a FamilyDocument or Family import_id
+def get_slugged_objects(
+    db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
+) -> tuple[Optional[str], Optional[str]]:
+    """Match the slug name to a FamilyDocument or Family import ID.
 
     :param Session db: connection to db
     :param str slug: slug name to match
-    :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or
-        the Family import_id
+    :param Optional[list[str]] allowed_corpora: The corpora IDs to look
+        for the slugged object in.
+    :return tuple[Optional[str], Optional[str]]: the FamilyDocument
+        import id or the Family import_id.
     """
-    result = (
-        db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
-            Slug.name == slug
-        )
-    ).one_or_none()
+    query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
+        Slug.name == slug
+    )
+
+    # # Only get slug for the fam/doc if it belongs to the list of allowed corpora.
+    # if allowed_corpora is not None:
+    #     query = query.filter()
+
+    result = query.one_or_none()
     if result is None:
         return (None, None)
     return result
diff --git a/tests/non_search/routers/documents/setup_doc_fam_lookup.py b/tests/non_search/routers/documents/setup_doc_fam_lookup.py
new file mode 100644
index 00000000..7530e88c
--- /dev/null
+++ b/tests/non_search/routers/documents/setup_doc_fam_lookup.py
@@ -0,0 +1,24 @@
+from typing import Optional
+
+from fastapi import status
+
+DOCUMENTS_ENDPOINT = "/api/v1/documents"
+TEST_HOST = "http://localhost:3000/"
+
+
+def _make_doc_fam_lookup_request(
+    client,
+    token,
+    slug: str,
+    expected_status_code: int = status.HTTP_200_OK,
+    origin: Optional[str] = TEST_HOST,
+):
+    headers = (
+        {"app-token": token}
+        if origin is None
+        else {"app-token": token, "origin": origin}
+    )
+
+    response = client.get(f"{DOCUMENTS_ENDPOINT}/{slug}", headers=headers)
+    assert response.status_code == expected_status_code, response.text
+    return response.json()
diff --git a/tests/non_search/routers/documents/test_admin_doc_routes.py b/tests/non_search/routers/documents/test_admin_doc_routes.py
new file mode 100644
index 00000000..4674620c
--- /dev/null
+++ b/tests/non_search/routers/documents/test_admin_doc_routes.py
@@ -0,0 +1,936 @@
+import logging
+from typing import Optional
+
+import pytest
+from db_client.models.dfce.family import FamilyDocument
+from db_client.models.document.physical_document import (
+    Language,
+    LanguageSource,
+    PhysicalDocumentLanguage,
+)
+from fastapi import status
+from fastapi.testclient import TestClient
+from sqlalchemy.orm import Session
+
+from tests.non_search.setup_helpers import (
+    setup_docs_with_two_orgs,
+    setup_docs_with_two_orgs_no_langs,
+    setup_with_two_docs,
+    setup_with_two_docs_bad_ids,
+    setup_with_two_unpublished_docs,
+)
+
+
+def test_update_document_status__is_secure(
+    data_client: TestClient,
+    data_db: Session,
+):
+    setup_with_two_docs(data_db)
+
+    import_id = "CCLW.executive.1.2"
+    response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed")
+    assert response.status_code == 401
+
+
+@pytest.mark.parametrize(
+    "import_id",
+    [
+        "CCLW.executive.12",
+        "UNFCCC.s.ill.y.2.2",
+    ],
+)
+def test_update_document_status__fails_on_non_matching_import_id(
+    data_client: TestClient,
+    data_superuser_token_headers: dict[str, str],
+    data_db: Session,
+    import_id: str,
+):
+    setup_with_two_docs_bad_ids(data_db)
+
+    response = data_client.post(
+        f"/api/v1/admin/documents/{import_id}/processed",
+        headers=data_superuser_token_headers,
+    )
+
+    assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
+
+
+def test_update_document_status__publishes_document(
+    data_client: TestClient,
+    data_superuser_token_headers: dict[str, str],
+    data_db: Session,
+):
+    """Test that we can send a payload to the backend
to update family_document.document_status""" + setup_with_two_unpublished_docs(data_db) + UPDATE_IMPORT_ID = "CCLW.executive.1.2" + UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" + + # State of db beforehand + pre_family_status = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + .document_status + ) + + response = data_client.post( + f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", + headers=data_superuser_token_headers, + ) + + assert response.status_code == 200 + json_object = response.json() + + assert json_object["import_id"] == UPDATE_IMPORT_ID + assert json_object["document_status"] == "Published" + + # Now Check the db + updated_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + ) + assert updated_family.document_status == "Published" + assert updated_family.document_status != pre_family_status + + unchanged_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) + .one() + ) + assert unchanged_family.document_status == "Deleted" + + +def test_update_document__is_secure( + data_client: TestClient, + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5sum": "abc123", + "content_type": "content_type", + "source_url": "source_url", + } + + response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) + + assert response.status_code == 401 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.12", + "UNFCCC.s.ill.y.2.2", + ], +) +def test_update_document__fails_on_non_matching_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_with_two_docs_bad_ids(data_db) + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_docs_with_two_orgs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, 
str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["eng"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["eng"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "eng" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["fra"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["eng", "fra"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": 
["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["el"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = set(["ell", "bod"]) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +# TODO Parametrize this test with the two languages as parameters +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using two letter iso codes. + + Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + two-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_3_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using three letter iso codes. + + Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + three-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__logs_warning_on_four_letter_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + caplog, +): + """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # Payload with a four letter language code that won't exist in the db + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["boda"], + } + + with caplog.at_level(logging.WARNING): + response = 
data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + assert ( + "Retrieved no language from database for meta_data object language" + in caplog.text + ) + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 0 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "languages", + [[], None], +) +def test_update_document__works_with_no_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + languages: Optional[list[str]], +): + """Test that we can send a payload to the backend with no languages to assert that none are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + db_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(db_languages) == 0 + assert set([lang.language_id for lang in db_languages]) == set() + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "existing_languages", + [[], ["eng"], ["aaa"], ["aaa", "aab"]], +) +def test_update_document__works_existing_languages( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + existing_languages: list[str], +): + """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + for lang_code in existing_languages: + existing_doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + 
existing_lang = ( + data_db.query(Language).filter(Language.language_code == lang_code).one() + ) + existing_doc_lang = PhysicalDocumentLanguage( + language_id=existing_lang.id, + document_id=existing_doc.id, + source=LanguageSource.MODEL, + ) + data_db.add(existing_doc_lang) + data_db.flush() + data_db.commit() + + languages_to_add = ["eng", "fra"] + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages_to_add, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + expected_languages = set(languages_to_add + existing_languages) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == len(expected_languages) + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +def test_update_document__idempotent( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__works_on_slug( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + slug = "DocSlug1" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{slug}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert 
json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + import_id = "CCLW.executive.1.2" + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__status_422_when_not_found( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + "/api/v1/admin/documents/nothing", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py index c46bc4a3..8478fc50 100644 --- a/tests/non_search/routers/documents/test_document_families.py +++ b/tests/non_search/routers/documents/test_document_families.py @@ -1,999 +1,48 @@ -import logging -from typing import Optional - -import pytest -from db_client.models.dfce.family import FamilyDocument -from db_client.models.document.physical_document import ( - Language, - LanguageSource, - PhysicalDocumentLanguage, -) -from fastapi import status +from db_client.models.document.physical_document import PhysicalDocumentLanguage from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( - setup_docs_with_two_orgs, - setup_docs_with_two_orgs_no_langs, setup_with_two_docs, - setup_with_two_docs_bad_ids, setup_with_two_docs_multiple_languages, - setup_with_two_unpublished_docs, ) -def test_physical_doc_languages( - data_client: TestClient, - data_db: Session, -): +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): setup_with_two_docs(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert document["languages"] == ["eng"] - response = data_client.get( - "/api/v1/documents/DocSlug2", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") document = json_response["document"] - - assert response.status_code == 200 assert document["languages"] == [] def test_physical_doc_languages_not_visible( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert 
document["languages"] == [] def test_physical_doc_multiple_languages( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_multiple_languages(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert set(document["languages"]) == set(["fra", "eng"]) - - -def test_update_document_status__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed") - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document_status__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - - response = data_client.post( - f"/api/v1/admin/documents/{import_id}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -def test_update_document_status__publishes_document( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - """Test that we can send a payload to the backend to update family_document.document_status""" - setup_with_two_unpublished_docs(data_db) - UPDATE_IMPORT_ID = "CCLW.executive.1.2" - UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" - - # State of db beforehand - pre_family_status = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - .document_status - ) - - response = data_client.post( - f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == 200 - json_object = response.json() - - assert json_object["import_id"] == UPDATE_IMPORT_ID - assert json_object["document_status"] == "Published" - - # Now Check the db - updated_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - ) - assert updated_family.document_status == "Published" - assert updated_family.document_status != pre_family_status - - unchanged_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) - .one() - ) - assert unchanged_family.document_status == "Deleted" - - -def test_update_document__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5sum": "abc123", - "content_type": "content_type", - "source_url": "source_url", - } - - response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) - - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - 
- response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_docs_with_two_orgs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["eng"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["eng"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "eng" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["fra"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == 
"c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["eng", "fra"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["el"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = set(["ell", "bod"]) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - 
.physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -# TODO Parametrize this test with the two languages as parameters -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using two letter iso codes. - - Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - two-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - 
assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_3_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using three letter iso codes. - - Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - three-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert 
doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__logs_warning_on_four_letter_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - caplog, -): - """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # Payload with a four letter language code that won't exist in the db - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["boda"], - } - - with caplog.at_level(logging.WARNING): - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - assert ( - "Retrieved no language from database for meta_data object language" - in caplog.text - ) - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 0 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "languages", - [[], None], -) -def test_update_document__works_with_no_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - languages: Optional[list[str]], -): - """Test that we can send a payload to the backend with no languages to assert that none are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - 
.one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - db_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(db_languages) == 0 - assert set([lang.language_id for lang in db_languages]) == set() - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "existing_languages", - [[], ["eng"], ["aaa"], ["aaa", "aab"]], -) -def test_update_document__works_existing_languages( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - existing_languages: list[str], -): - """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - for lang_code in existing_languages: - existing_doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - existing_lang = ( - data_db.query(Language).filter(Language.language_code == lang_code).one() - ) - existing_doc_lang = PhysicalDocumentLanguage( - language_id=existing_lang.id, - document_id=existing_doc.id, - source=LanguageSource.MODEL, - ) - data_db.add(existing_doc_lang) - data_db.flush() - data_db.commit() - - languages_to_add = ["eng", "fra"] - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages_to_add, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - expected_languages = set(languages_to_add + existing_languages) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == len(expected_languages) - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -def test_update_document__idempotent( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - - response = 
data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__works_on_slug( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - slug = "DocSlug1" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{slug}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - import_id = "CCLW.executive.1.2" - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__status_422_when_not_found( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - "/api/v1/admin/documents/nothing", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_document_families.py index 828ba603..e34f6051 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_document_families.py @@ -1,9 +1,13 @@ import pytest from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from fastapi import status from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( setup_with_docs, setup_with_two_docs, @@ -16,43 +20,41 @@ def test_documents_family_slug_returns_not_found( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - 
assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for FamSlug100" + assert json_response["detail"] == "Nothing found for FamSlug100" def test_documents_family_slug_returns_correct_family( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_two_docs(data_db) # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.1001.0" # Ensure a different family is returned - response = data_client.get( - "/api/v1/documents/FamSlug2", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug2", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.2002.0" @@ -124,17 +126,21 @@ def test_documents_family_slug_returns_correct_family( ], ) def test_documents_family_slug_returns_correct_json( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) # Test associations - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - - assert response.status_code == 200 # Verify family data correct. assert len(json_response) == N_FAMILY_KEYS @@ -170,23 +176,20 @@ def test_documents_family_slug_returns_correct_json( def test_documents_family_slug_returns_multiple_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 2 def test_documents_family_slug_returns_only_published_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -196,18 +199,16 @@ def test_documents_family_slug_returns_only_published_docs( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 1 def test_documents_family_slug_returns_404_when_all_docs_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -222,29 +223,30 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - - assert response.status_code == 404 assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" def test_documents_doc_slug_returns_not_found( - 
data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test associations - response = data_client.get( - "/api/v1/documents/DocSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for DocSlug100" + assert json_response["detail"] == "Nothing found for DocSlug100" @pytest.mark.parametrize( @@ -307,15 +309,20 @@ def test_documents_doc_slug_returns_not_found( ], ) def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - assert response.status_code == 200 assert len(json_response) == 2 family = json_response["family"] @@ -330,8 +337,7 @@ def test_documents_doc_slug_preexisting_objects( def test_documents_doc_slug_when_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute( @@ -339,9 +345,11 @@ def test_documents_doc_slug_when_deleted( .where(FamilyDocument.import_id == "CCLW.executive.2.2") .values(document_status="Deleted") ) - response = data_client.get( - "/api/v1/documents/DocSlug2", + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - assert response.status_code == 404 assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 0539d7400bffbb04b7bf750e69a0c5e682a6374a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:32:48 +0000 Subject: [PATCH 02/10] Driveby: Add CORS tests for MCF --- tests/non_search/routers/lookups/test_cors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/non_search/routers/lookups/test_cors.py b/tests/non_search/routers/lookups/test_cors.py index b7e15002..3a67e483 100644 --- a/tests/non_search/routers/lookups/test_cors.py +++ b/tests/non_search/routers/lookups/test_cors.py @@ -19,6 +19,13 @@ ("https://app.devclimatepolicyradar.com", False), # prefixed wrong domain ("https://app-climatepolicyradar.com", False), # prefixed wrong domain ("https://prefix-climate-laws.org", False), # climate laws prefixed domain + ("https://.climateprojectexplorer.org", False), # empty subdomain + ("https://prefix-climateprojectexplorer.org", False), # MCF prefixed domain + ("https://climateprojectexplorer.org", True), # base MCF URL + ( + "https://preview.climateprojectexplorer.org", + True, + ), # MCF subdomain URL ], ) def test_cors_regex(test_client, origin, should_be_allowed): From 4a164d8c3a7e50fc9dab5d25fa0fc1d26f83af93 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:13 +0000 Subject: [PATCH 03/10] Update slug lookup query to respect allowed corpora --- app/repository/document.py | 55 +++++++++++++++++++++++++----- 
 app/repository/sql/slug_lookup.sql | 20 +++++++++++
 2 files changed, 67 insertions(+), 8 deletions(-)
 create mode 100644 app/repository/sql/slug_lookup.sql

diff --git a/app/repository/document.py b/app/repository/document.py
index c97d9744..e5e53ed1 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -5,7 +5,9 @@
 """

 import logging
+import os
 from datetime import datetime
+from functools import lru_cache
 from typing import Optional, Sequence, cast

 from db_client.models.dfce.collection import Collection, CollectionFamily
@@ -40,11 +42,38 @@
 _LOGGER = logging.getLogger(__file__)


+@lru_cache()
+def _get_query_template():
+    """Read and cache the SQL template for the slug lookup query."""
+    with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file:
+        return file.read()
+
+
+def get_slugged_object_from_allowed_corpora_query(
+    template_query, slug_name: str, allowed_corpora_ids: list[str]
+) -> str:
+    """Create a slug lookup query limited to the allowed corpora, replacing variables.
+
+    :param str slug_name: The slug to look up.
+    :param list[str] allowed_corpora_ids: The corpora in which the
+        slugged object may be found.
+    :return str: The SQL query to perform on the database session.
+    """
+    corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
+    return template_query.replace("{slug_name}", slug_name).replace(  # type: ignore
+        "{allowed_corpora_ids}", corpora_ids
+    )  # type: ignore
+
+
 def get_slugged_objects(
     db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
 ) -> tuple[Optional[str], Optional[str]]:
     """Match the slug name to a FamilyDocument or Family import ID.

+    If a list of allowed corpora is given, the import ID is only
+    returned when the slug is associated with a family that belongs
+    to one of those corpora.
+
     :param Session db: connection to db
     :param str slug: slug name to match
     :param Optional[list[str]] allowed_corpora: The corpora IDs to look
@@ -52,18 +81,28 @@
     import id or the Family import_id.
     """
-    query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
-        Slug.name == slug
-    )
-
-    # # Only get slug for the fam/doc if it belongs to the list of allowed corpora.
- # if allowed_corpora is not None: - # query = query.filter() + if allowed_corpora is not None: + query_template = _get_query_template() + query = get_slugged_object_from_allowed_corpora_query( + query_template, slug, allowed_corpora + ) + query = db.execute(query) + else: + query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( + Slug.name == slug + ) result = query.one_or_none() if result is None: return (None, None) - return result + + DOC_INDEX = 0 + doc_id = cast(str, result[DOC_INDEX]) if result[DOC_INDEX] is not None else None + + FAM_INDEX = 1 + fam_id = cast(str, result[FAM_INDEX]) if result[FAM_INDEX] is not None else None + + return doc_id, fam_id def get_family_document_and_context( diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql new file mode 100644 index 00000000..9d649067 --- /dev/null +++ b/app/repository/sql/slug_lookup.sql @@ -0,0 +1,20 @@ +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family ON family.import_id = slug.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}) + +UNION + +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id +LEFT JOIN family ON family.import_id = family_document.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}); From b43aabf8ec6f73426098cd41031297318b4448b1 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:38 +0000 Subject: [PATCH 04/10] Include actual CCLW corpus ID in test token --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index d897290d..02c2d85c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,7 +117,7 @@ def valid_token(monkeypatch): def mock_return(_, __, ___): return True - corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0" + corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000" subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" From e2e2d5cbf10a4cc0ff382a62babbdce0b50741c7 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:46:12 +0000 Subject: [PATCH 05/10] Bump to 1.19.11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85e6896a..0a2c8f23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.10" +version = "1.19.11" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 45fa716fc11bac6c70c7a3f4fdac39d081370a97 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:50:52 +0000 Subject: [PATCH 06/10] Refactor _get_query_template --- app/repository/document.py | 13 ++++--------- app/repository/download.py | 8 -------- app/repository/helpers.py | 14 ++++++++++++++ app/repository/pipeline.py | 11 ++--------- 
 app/service/download.py            |  8 ++++++--
 5 files changed, 26 insertions(+), 28 deletions(-)
 create mode 100644 app/repository/helpers.py

diff --git a/app/repository/document.py b/app/repository/document.py
index e5e53ed1..57579a26 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -7,7 +7,6 @@

 import logging
 import os
 from datetime import datetime
-from functools import lru_cache
 from typing import Optional, Sequence, cast

 from db_client.models.dfce.collection import Collection, CollectionFamily
@@ -36,19 +35,13 @@
     LinkableFamily,
 )
 from app.repository.geography import get_geo_subquery
+from app.repository.helpers import get_query_template
 from app.repository.lookups import doc_type_from_family_document_metadata
 from app.service.util import to_cdn_url

 _LOGGER = logging.getLogger(__file__)


-@lru_cache()
-def _get_query_template():
-    """Read and cache the SQL template for the slug lookup query."""
-    with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file:
-        return file.read()
-
-
 def get_slugged_object_from_allowed_corpora_query(
     template_query, slug_name: str, allowed_corpora_ids: list[str]
 ) -> str:
@@ -82,7 +75,9 @@ def get_slugged_objects(
     import id or the Family import_id.
     """
     if allowed_corpora is not None:
-        query_template = _get_query_template()
+        query_template = get_query_template(
+            os.path.join("app", "repository", "sql", "slug_lookup.sql")
+        )
         query = get_slugged_object_from_allowed_corpora_query(
             query_template, slug, allowed_corpora
         )
diff --git a/app/repository/download.py b/app/repository/download.py
index 69938182..b0381e8d 100644
--- a/app/repository/download.py
+++ b/app/repository/download.py
@@ -1,7 +1,5 @@
 """Functions to support browsing the RDS document structure"""

-import os
-from functools import lru_cache
 from logging import getLogger

 import pandas as pd
@@ -12,12 +10,6 @@
 _LOGGER = getLogger(__name__)


-@lru_cache()
-def _get_query_template():
-    with open(os.path.join("app", "repository", "sql", "download.sql"), "r") as file:
-        return file.read()
-
-
 def get_whole_database_dump(query, db=Depends(get_db)):
     with db.connection() as conn:
         df = pd.read_sql(query, conn.connection)
diff --git a/app/repository/helpers.py b/app/repository/helpers.py
new file mode 100644
index 00000000..e976683b
--- /dev/null
+++ b/app/repository/helpers.py
@@ -0,0 +1,14 @@
+"""
+Functions to support the repository modules.
+
+Currently holds shared logic for reading SQL query templates from disk.
+""" + +from functools import lru_cache + + +@lru_cache() +def get_query_template(filepath: str) -> str: + """Read query for non-deleted docs and their associated data.""" + with open(filepath, "r") as file: + return file.read() diff --git a/app/repository/pipeline.py b/app/repository/pipeline.py index 0eebaff4..d9eac447 100644 --- a/app/repository/pipeline.py +++ b/app/repository/pipeline.py @@ -1,7 +1,6 @@ import logging import os from datetime import datetime, timezone -from functools import lru_cache from typing import Sequence, cast import pandas as pd @@ -11,19 +10,13 @@ from app.clients.db.session import get_db from app.models.document import DocumentParserInput +from app.repository.helpers import get_query_template _LOGGER = logging.getLogger(__name__) MetadataType = dict[str, list[str]] -@lru_cache() -def generate_pipeline_ingest_input_query(): - """Read query for non-deleted docs and their associated data.""" - with open(os.path.join("app", "repository", "sql", "pipeline.sql"), "r") as file: - return file.read() - - def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: """Get non-deleted docs and their associated data from the db. @@ -39,7 +32,7 @@ def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: in database. """ _LOGGER.info("Running pipeline query") - query = generate_pipeline_ingest_input_query() + query = get_query_template(os.path.join("app", "repository", "sql", "pipeline.sql")) df = pd.read_sql(query, db.connection().connection) return df diff --git a/app/service/download.py b/app/service/download.py index dad6078d..189b70fb 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,5 +1,6 @@ """Functions to support browsing the RDS document structure""" +import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -9,7 +10,8 @@ from fastapi import Depends from app.clients.db.session import get_db -from app.repository.download import _get_query_template, get_whole_database_dump +from app.repository.download import get_whole_database_dump +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) @@ -90,7 +92,9 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = _get_query_template() + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) csv_buffer = generate_data_dump_as_csv(query, db) From 6010fce97e0ace4aefead684f9d248d1a716d52f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:53:51 +0000 Subject: [PATCH 07/10] Refactor doc and fam lookup tests --- .../documents/test_document_families.py | 48 ----- .../routers/documents/test_get_document.py | 178 ++++++++++++++++++ ...ocument_families.py => test_get_family.py} | 124 ------------ 3 files changed, 178 insertions(+), 172 deletions(-) delete mode 100644 tests/non_search/routers/documents/test_document_families.py create mode 100644 tests/non_search/routers/documents/test_get_document.py rename tests/non_search/routers/documents/{test_get_document_families.py => test_get_family.py} (65%) diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py deleted file mode 100644 index 8478fc50..00000000 --- 
a/tests/non_search/routers/documents/test_document_families.py +++ /dev/null @@ -1,48 +0,0 @@ -from db_client.models.document.physical_document import PhysicalDocumentLanguage -from fastapi.testclient import TestClient -from sqlalchemy import update -from sqlalchemy.orm import Session - -from tests.non_search.routers.documents.setup_doc_fam_lookup import ( - _make_doc_fam_lookup_request, -) -from tests.non_search.setup_helpers import ( - setup_with_two_docs, - setup_with_two_docs_multiple_languages, -) - - -def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == ["eng"] - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") - document = json_response["document"] - assert document["languages"] == [] - - -def test_physical_doc_languages_not_visible( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == [] - - -def test_physical_doc_multiple_languages( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs_multiple_languages(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert set(document["languages"]) == set(["fra", "eng"]) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py new file mode 100644 index 00000000..f7b0f665 --- /dev/null +++ b/tests/non_search/routers/documents/test_get_document.py @@ -0,0 +1,178 @@ +import pytest +from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from db_client.models.document.physical_document import PhysicalDocumentLanguage +from fastapi import status +from fastapi.testclient import TestClient +from sqlalchemy import update +from sqlalchemy.orm import Session + +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) +from tests.non_search.setup_helpers import ( + setup_with_docs, + setup_with_two_docs, + setup_with_two_docs_multiple_languages, +) + +N_FAMILY_OVERVIEW_KEYS = 8 +N_DOCUMENT_KEYS = 12 + + +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == ["eng"] + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") + document = json_response["document"] + assert document["languages"] == [] + + +def test_physical_doc_languages_not_visible( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == [] + + +def 
test_physical_doc_multiple_languages( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs_multiple_languages(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert set(document["languages"]) == set(["fra", "eng"]) + + +def test_documents_doc_slug_returns_not_found( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_docs(data_db) + assert data_db.query(Family).count() == 1 + assert data_db.query(FamilyEvent).count() == 1 + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug100" + + +@pytest.mark.parametrize( + ("slug", "expected_fam", "expected_doc"), + [ + ( + "DocSlug1", + { + "title": "Fam1", + "import_id": "CCLW.family.1001.0", + "geographies": ["South Asia"], + "category": "Executive", + "slug": "FamSlug1", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.1.2", + "variant": "Original Language", + "slug": "DocSlug1", + "title": "Document1", + "md5_sum": "111", + "cdn_object": None, + "content_type": "application/pdf", + "source_url": "http://somewhere1", + "language": "eng", + "languages": ["eng"], + "document_type": "Plan", + "document_role": "MAIN", + }, + ), + ( + "DocSlug2", + { + "title": "Fam2", + "import_id": "CCLW.family.2002.0", + "geographies": ["AFG", "IND"], + "category": "Executive", + "slug": "FamSlug2", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.2.2", + "variant": None, + "slug": "DocSlug2", + "title": "Document2", + "md5_sum": None, + "cdn_object": None, + "content_type": None, + "source_url": "http://another_somewhere", + "language": "", + "languages": [], + "document_type": "Order", + "document_role": "MAIN", + }, + ), + ], +) +def test_documents_doc_slug_preexisting_objects( + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, +): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, + ) + assert len(json_response) == 2 + + family = json_response["family"] + assert family + assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS + assert family == expected_fam + + doc = json_response["document"] + assert doc + assert len(doc) == N_DOCUMENT_KEYS + assert doc == expected_doc + + +def test_documents_doc_slug_when_deleted( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute( + update(FamilyDocument) + .where(FamilyDocument.import_id == "CCLW.executive.2.2") + .values(document_status="Deleted") + ) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_family.py similarity index 65% rename from tests/non_search/routers/documents/test_get_document_families.py rename to 
tests/non_search/routers/documents/test_get_family.py index e34f6051..ecdc620f 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -15,7 +15,6 @@ ) N_FAMILY_KEYS = 15 -N_FAMILY_OVERVIEW_KEYS = 8 N_DOCUMENT_KEYS = 12 @@ -230,126 +229,3 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" - - -def test_documents_doc_slug_returns_not_found( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_docs(data_db) - assert data_db.query(Family).count() == 1 - assert data_db.query(FamilyEvent).count() == 1 - - # Test associations - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug100", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "Nothing found for DocSlug100" - - -@pytest.mark.parametrize( - ("slug", "expected_fam", "expected_doc"), - [ - ( - "DocSlug1", - { - "title": "Fam1", - "import_id": "CCLW.family.1001.0", - "geographies": ["South Asia"], - "category": "Executive", - "slug": "FamSlug1", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.1.2", - "variant": "Original Language", - "slug": "DocSlug1", - "title": "Document1", - "md5_sum": "111", - "cdn_object": None, - "content_type": "application/pdf", - "source_url": "http://somewhere1", - "language": "eng", - "languages": ["eng"], - "document_type": "Plan", - "document_role": "MAIN", - }, - ), - ( - "DocSlug2", - { - "title": "Fam2", - "import_id": "CCLW.family.2002.0", - "geographies": ["AFG", "IND"], - "category": "Executive", - "slug": "FamSlug2", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.2.2", - "variant": None, - "slug": "DocSlug2", - "title": "Document2", - "md5_sum": None, - "cdn_object": None, - "content_type": None, - "source_url": "http://another_somewhere", - "language": "", - "languages": [], - "document_type": "Order", - "document_role": "MAIN", - }, - ), - ], -) -def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, - data_db: Session, - slug, - expected_fam, - expected_doc, - valid_token, -): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - slug, - ) - assert len(json_response) == 2 - - family = json_response["family"] - assert family - assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS - assert family == expected_fam - - doc = json_response["document"] - assert doc - assert len(doc) == N_DOCUMENT_KEYS - assert doc == expected_doc - - -def test_documents_doc_slug_when_deleted( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute( - update(FamilyDocument) - .where(FamilyDocument.import_id == "CCLW.executive.2.2") - .values(document_status="Deleted") - ) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug2", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 7fee18f2a90a9c00fce394e013ca431c7c9025a6 Mon Sep 17 00:00:00 2001 From: Katy Baulch 
<46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:19 +0000 Subject: [PATCH 08/10] Add integration tests for doc/fam lookup when corpora mismatch --- .../routers/documents/test_get_document.py | 15 +++++++++++++++ .../routers/documents/test_get_family.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py index f7b0f665..dafa1642 100644 --- a/tests/non_search/routers/documents/test_get_document.py +++ b/tests/non_search/routers/documents/test_get_document.py @@ -176,3 +176,18 @@ def test_documents_doc_slug_when_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" + + +def test_documents_doc_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "DocSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug1" diff --git a/tests/non_search/routers/documents/test_get_family.py b/tests/non_search/routers/documents/test_get_family.py index ecdc620f..e149f2ca 100644 --- a/tests/non_search/routers/documents/test_get_family.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -229,3 +229,18 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" + + +def test_documents_family_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs_one_family(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for FamSlug1" From 2a830dbe9e8d1b208f3592d9b55d9eef04f87e11 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:31 +0000 Subject: [PATCH 09/10] Add alternative corpora token --- tests/conftest.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 02c2d85c..6d6fb4cb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,6 +127,30 @@ def mock_return(_, __, ___): return af.create_configuration_token(input_str) +@pytest.fixture +def alternative_token(monkeypatch): + """Generate a valid alternative config token using TOKEN_SECRET_KEY. + + Need to generate the config token using the token secret key from + your local env file. For tests in CI, this will be the secret key in + the .env.example file, but for local development this secret key + might be different (e.g., the one for staging). This fixture works + around this. 
+ """ + + def mock_return(_, __, ___): + return True + + corpora_ids = "UNFCCC.corpus.i00000001.n0000" + subject = "CPR" + audience = "localhost" + input_str = f"{corpora_ids};{subject};{audience}" + + af = AppTokenFactory() + monkeypatch.setattr(custom_app.AppTokenFactory, "validate", mock_return) + return af.create_configuration_token(input_str) + + @pytest.fixture def create_test_db(): """Create a test database and use it for the whole test session.""" From 024d60bbbbb266b57bcc5a48f4f2ac23e64808b0 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:21:14 +0000 Subject: [PATCH 10/10] Refactor download code --- app/repository/download.py | 29 ++++++++++++++++++++++++++++- app/service/download.py | 33 ++++++--------------------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/app/repository/download.py b/app/repository/download.py index b0381e8d..1ed90396 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -1,16 +1,43 @@ """Functions to support browsing the RDS document structure""" +import os from logging import getLogger import pandas as pd from fastapi import Depends from app.clients.db.session import get_db +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def get_whole_database_dump(query, db=Depends(get_db)): +def create_query( + template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] +) -> str: + """Create download whole database query, replacing variables. + + :param str ingest_cycle_start: The current ingest cycle date. + :param list[str] allowed_corpora_ids: The corpora from which we + should allow the data to be dumped. + :return str: The SQL query to perform on the database session. + """ + corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" + return template_query.replace( # type: ignore + "{ingest_cycle_start}", ingest_cycle_start + ).replace( + "{allowed_corpora_ids}", corpora_ids + ) # type: ignore + + +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) + query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) + with db.connection() as conn: df = pd.read_sql(query, conn.connection) return df diff --git a/app/service/download.py b/app/service/download.py index 189b70fb..d5110b01 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,6 +1,5 @@ """Functions to support browsing the RDS document structure""" -import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -11,29 +10,10 @@ from app.clients.db.session import get_db from app.repository.download import get_whole_database_dump -from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. 
- """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def replace_slug_with_qualified_url( df: pd.DataFrame, public_app_url: str, @@ -63,8 +43,10 @@ def convert_dump_to_csv(df: pd.DataFrame): return csv_buffer -def generate_data_dump_as_csv(query, db=Depends(get_db)): - df = get_whole_database_dump(query, db) +def generate_data_dump_as_csv( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + df = get_whole_database_dump(ingest_cycle_start, allowed_corpora_ids, db) csv = convert_dump_to_csv(df) csv.seek(0) return csv @@ -92,11 +74,8 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") - ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) - csv_buffer = generate_data_dump_as_csv(query, db) + + csv_buffer = generate_data_dump_as_csv(ingest_cycle_start, allowed_corpora_ids, db) zip_buffer = BytesIO() with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: