From 277e67430e3ec6c80ec07503ae997a9c0569b89f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:06:59 +0000 Subject: [PATCH 01/26] Make documents router dependent on app token --- app/api/api_v1/routers/documents.py | 20 +- app/api/api_v1/routers/search.py | 2 - app/repository/document.py | 27 +- .../routers/documents/setup_doc_fam_lookup.py | 24 + .../documents/test_admin_doc_routes.py | 936 +++++++++++++++++ .../documents/test_document_families.py | 973 +----------------- .../documents/test_get_document_families.py | 132 +-- 7 files changed, 1070 insertions(+), 1044 deletions(-) create mode 100644 tests/non_search/routers/documents/setup_doc_fam_lookup.py create mode 100644 tests/non_search/routers/documents/test_admin_doc_routes.py diff --git a/app/api/api_v1/routers/documents.py b/app/api/api_v1/routers/documents.py index 0cf92c19..1d48c278 100644 --- a/app/api/api_v1/routers/documents.py +++ b/app/api/api_v1/routers/documents.py @@ -1,8 +1,8 @@ import logging from http.client import NOT_FOUND -from typing import Union +from typing import Annotated, Union -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, Header, HTTPException, Request from app.clients.db.session import get_db from app.models.document import ( @@ -14,6 +14,7 @@ get_family_document_and_context, get_slugged_objects, ) +from app.service.custom_app import AppTokenFactory _LOGGER = logging.getLogger(__file__) @@ -28,20 +29,23 @@ ], ) async def family_or_document_detail( - slug: str, - db=Depends(get_db), + slug: str, request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): """Get details of the family or document associated with the slug.""" _LOGGER.info( f"Getting detailed information for family or document '{slug}'", extra={ - "props": { - "import_id_or_slug": slug, - }, + "props": {"import_id_or_slug": slug, "app_token": str(app_token)}, }, ) - family_document_import_id, family_import_id = get_slugged_objects(db, slug) + # Decode the app token and validate it. + token = AppTokenFactory() + token.decode_and_validate(db, request, app_token) + + family_document_import_id, family_import_id = get_slugged_objects( + db, slug, token.allowed_corpora_ids + ) if family_document_import_id is None and family_import_id is None: raise HTTPException(status_code=NOT_FOUND, detail=f"Nothing found for {slug}") diff --git a/app/api/api_v1/routers/search.py b/app/api/api_v1/routers/search.py index cb5ebe87..a7749c33 100644 --- a/app/api/api_v1/routers/search.py +++ b/app/api/api_v1/routers/search.py @@ -277,8 +277,6 @@ def download_all_search_documents( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ) -> RedirectResponse: """Download a CSV containing details of all the documents in the corpus.""" - token = AppTokenFactory() - _LOGGER.info( "Whole data download request", extra={ diff --git a/app/repository/document.py b/app/repository/document.py index 519735dd..c97d9744 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -40,20 +40,27 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_objects(db: Session, slug: str) -> tuple[Optional[str], Optional[str]]: - """ - Matches the slug name to a FamilyDocument or Family import_id +def get_slugged_objects( + db: Session, slug: str, allowed_corpora: Optional[list[str]] = None +) -> tuple[Optional[str], Optional[str]]: + """Match the slug name to a FamilyDocument or Family import ID. 
:param Session db: connection to db :param str slug: slug name to match - :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or - the Family import_id + :param Optional[list[str]] allowed_corpora: The corpora IDs to look + for the slugged object in. + :return tuple[Optional[str], Optional[str]]: the FamilyDocument + import id or the Family import_id. """ - result = ( - db.query(Slug.family_document_import_id, Slug.family_import_id).filter( - Slug.name == slug - ) - ).one_or_none() + query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( + Slug.name == slug + ) + + # # Only get slug for the fam/doc if it belongs to the list of allowed corpora. + # if allowed_corpora is not None: + # query = query.filter() + + result = query.one_or_none() if result is None: return (None, None) return result diff --git a/tests/non_search/routers/documents/setup_doc_fam_lookup.py b/tests/non_search/routers/documents/setup_doc_fam_lookup.py new file mode 100644 index 00000000..7530e88c --- /dev/null +++ b/tests/non_search/routers/documents/setup_doc_fam_lookup.py @@ -0,0 +1,24 @@ +from typing import Optional + +from fastapi import status + +DOCUMENTS_ENDPOINT = "/api/v1/documents" +TEST_HOST = "http://localhost:3000/" + + +def _make_doc_fam_lookup_request( + client, + token, + slug: str, + expected_status_code: int = status.HTTP_200_OK, + origin: Optional[str] = TEST_HOST, +): + headers = ( + {"app-token": token} + if origin is None + else {"app-token": token, "origin": origin} + ) + + response = client.get(f"{DOCUMENTS_ENDPOINT}/{slug}", headers=headers) + assert response.status_code == expected_status_code, response.text + return response.json() diff --git a/tests/non_search/routers/documents/test_admin_doc_routes.py b/tests/non_search/routers/documents/test_admin_doc_routes.py new file mode 100644 index 00000000..4674620c --- /dev/null +++ b/tests/non_search/routers/documents/test_admin_doc_routes.py @@ -0,0 +1,936 @@ +import logging +from typing import Optional + +import pytest +from db_client.models.dfce.family import FamilyDocument +from db_client.models.document.physical_document import ( + Language, + LanguageSource, + PhysicalDocumentLanguage, +) +from fastapi import status +from fastapi.testclient import TestClient +from sqlalchemy.orm import Session + +from tests.non_search.setup_helpers import ( + setup_docs_with_two_orgs, + setup_docs_with_two_orgs_no_langs, + setup_with_two_docs, + setup_with_two_docs_bad_ids, + setup_with_two_unpublished_docs, +) + + +def test_update_document_status__is_secure( + data_client: TestClient, + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed") + assert response.status_code == 401 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.12", + "UNFCCC.s.ill.y.2.2", + ], +) +def test_update_document_status__fails_on_non_matching_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_with_two_docs_bad_ids(data_db) + + response = data_client.post( + f"/api/v1/admin/documents/{import_id}/processed", + headers=data_superuser_token_headers, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +def test_update_document_status__publishes_document( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + """Test that we can send a payload to the backend 
to update family_document.document_status""" + setup_with_two_unpublished_docs(data_db) + UPDATE_IMPORT_ID = "CCLW.executive.1.2" + UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" + + # State of db beforehand + pre_family_status = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + .document_status + ) + + response = data_client.post( + f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", + headers=data_superuser_token_headers, + ) + + assert response.status_code == 200 + json_object = response.json() + + assert json_object["import_id"] == UPDATE_IMPORT_ID + assert json_object["document_status"] == "Published" + + # Now Check the db + updated_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + ) + assert updated_family.document_status == "Published" + assert updated_family.document_status != pre_family_status + + unchanged_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) + .one() + ) + assert unchanged_family.document_status == "Deleted" + + +def test_update_document__is_secure( + data_client: TestClient, + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5sum": "abc123", + "content_type": "content_type", + "source_url": "source_url", + } + + response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) + + assert response.status_code == 401 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.12", + "UNFCCC.s.ill.y.2.2", + ], +) +def test_update_document__fails_on_non_matching_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_with_two_docs_bad_ids(data_db) + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_docs_with_two_orgs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, 
str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["eng"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["eng"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "eng" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["fra"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["eng", "fra"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": 
["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["el"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = set(["ell", "bod"]) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +# TODO Parametrize this test with the two languages as parameters +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using two letter iso codes. + + Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + two-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_3_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using three letter iso codes. + + Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + three-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__logs_warning_on_four_letter_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + caplog, +): + """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # Payload with a four letter language code that won't exist in the db + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["boda"], + } + + with caplog.at_level(logging.WARNING): + response = 
data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + assert ( + "Retrieved no language from database for meta_data object language" + in caplog.text + ) + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 0 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "languages", + [[], None], +) +def test_update_document__works_with_no_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + languages: Optional[list[str]], +): + """Test that we can send a payload to the backend with no languages to assert that none are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + db_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(db_languages) == 0 + assert set([lang.language_id for lang in db_languages]) == set() + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "existing_languages", + [[], ["eng"], ["aaa"], ["aaa", "aab"]], +) +def test_update_document__works_existing_languages( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + existing_languages: list[str], +): + """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + for lang_code in existing_languages: + existing_doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + 
existing_lang = ( + data_db.query(Language).filter(Language.language_code == lang_code).one() + ) + existing_doc_lang = PhysicalDocumentLanguage( + language_id=existing_lang.id, + document_id=existing_doc.id, + source=LanguageSource.MODEL, + ) + data_db.add(existing_doc_lang) + data_db.flush() + data_db.commit() + + languages_to_add = ["eng", "fra"] + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages_to_add, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + expected_languages = set(languages_to_add + existing_languages) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == len(expected_languages) + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +def test_update_document__idempotent( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__works_on_slug( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + slug = "DocSlug1" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{slug}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert 
json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + import_id = "CCLW.executive.1.2" + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__status_422_when_not_found( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + "/api/v1/admin/documents/nothing", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py index c46bc4a3..8478fc50 100644 --- a/tests/non_search/routers/documents/test_document_families.py +++ b/tests/non_search/routers/documents/test_document_families.py @@ -1,999 +1,48 @@ -import logging -from typing import Optional - -import pytest -from db_client.models.dfce.family import FamilyDocument -from db_client.models.document.physical_document import ( - Language, - LanguageSource, - PhysicalDocumentLanguage, -) -from fastapi import status +from db_client.models.document.physical_document import PhysicalDocumentLanguage from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( - setup_docs_with_two_orgs, - setup_docs_with_two_orgs_no_langs, setup_with_two_docs, - setup_with_two_docs_bad_ids, setup_with_two_docs_multiple_languages, - setup_with_two_unpublished_docs, ) -def test_physical_doc_languages( - data_client: TestClient, - data_db: Session, -): +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): setup_with_two_docs(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert document["languages"] == ["eng"] - response = data_client.get( - "/api/v1/documents/DocSlug2", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") document = json_response["document"] - - assert response.status_code == 200 assert document["languages"] == [] def test_physical_doc_languages_not_visible( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert 
document["languages"] == [] def test_physical_doc_multiple_languages( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_multiple_languages(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert set(document["languages"]) == set(["fra", "eng"]) - - -def test_update_document_status__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed") - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document_status__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - - response = data_client.post( - f"/api/v1/admin/documents/{import_id}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -def test_update_document_status__publishes_document( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - """Test that we can send a payload to the backend to update family_document.document_status""" - setup_with_two_unpublished_docs(data_db) - UPDATE_IMPORT_ID = "CCLW.executive.1.2" - UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" - - # State of db beforehand - pre_family_status = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - .document_status - ) - - response = data_client.post( - f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == 200 - json_object = response.json() - - assert json_object["import_id"] == UPDATE_IMPORT_ID - assert json_object["document_status"] == "Published" - - # Now Check the db - updated_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - ) - assert updated_family.document_status == "Published" - assert updated_family.document_status != pre_family_status - - unchanged_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) - .one() - ) - assert unchanged_family.document_status == "Deleted" - - -def test_update_document__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5sum": "abc123", - "content_type": "content_type", - "source_url": "source_url", - } - - response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) - - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - 
- response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_docs_with_two_orgs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["eng"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["eng"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "eng" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["fra"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == 
"c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["eng", "fra"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["el"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = set(["ell", "bod"]) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - 
.physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -# TODO Parametrize this test with the two languages as parameters -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using two letter iso codes. - - Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - two-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - 
assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_3_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using three letter iso codes. - - Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - three-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert 
doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__logs_warning_on_four_letter_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - caplog, -): - """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # Payload with a four letter language code that won't exist in the db - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["boda"], - } - - with caplog.at_level(logging.WARNING): - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - assert ( - "Retrieved no language from database for meta_data object language" - in caplog.text - ) - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 0 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "languages", - [[], None], -) -def test_update_document__works_with_no_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - languages: Optional[list[str]], -): - """Test that we can send a payload to the backend with no languages to assert that none are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - 
.one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - db_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(db_languages) == 0 - assert set([lang.language_id for lang in db_languages]) == set() - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "existing_languages", - [[], ["eng"], ["aaa"], ["aaa", "aab"]], -) -def test_update_document__works_existing_languages( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - existing_languages: list[str], -): - """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - for lang_code in existing_languages: - existing_doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - existing_lang = ( - data_db.query(Language).filter(Language.language_code == lang_code).one() - ) - existing_doc_lang = PhysicalDocumentLanguage( - language_id=existing_lang.id, - document_id=existing_doc.id, - source=LanguageSource.MODEL, - ) - data_db.add(existing_doc_lang) - data_db.flush() - data_db.commit() - - languages_to_add = ["eng", "fra"] - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages_to_add, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - expected_languages = set(languages_to_add + existing_languages) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == len(expected_languages) - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -def test_update_document__idempotent( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - - response = 
data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__works_on_slug( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - slug = "DocSlug1" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{slug}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - import_id = "CCLW.executive.1.2" - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__status_422_when_not_found( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - "/api/v1/admin/documents/nothing", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_document_families.py index 828ba603..e34f6051 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_document_families.py @@ -1,9 +1,13 @@ import pytest from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from fastapi import status from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( setup_with_docs, setup_with_two_docs, @@ -16,43 +20,41 @@ def test_documents_family_slug_returns_not_found( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - 
assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for FamSlug100" + assert json_response["detail"] == "Nothing found for FamSlug100" def test_documents_family_slug_returns_correct_family( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_two_docs(data_db) # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.1001.0" # Ensure a different family is returned - response = data_client.get( - "/api/v1/documents/FamSlug2", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug2", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.2002.0" @@ -124,17 +126,21 @@ def test_documents_family_slug_returns_correct_family( ], ) def test_documents_family_slug_returns_correct_json( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) # Test associations - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - - assert response.status_code == 200 # Verify family data correct. assert len(json_response) == N_FAMILY_KEYS @@ -170,23 +176,20 @@ def test_documents_family_slug_returns_correct_json( def test_documents_family_slug_returns_multiple_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 2 def test_documents_family_slug_returns_only_published_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -196,18 +199,16 @@ def test_documents_family_slug_returns_only_published_docs( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 1 def test_documents_family_slug_returns_404_when_all_docs_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -222,29 +223,30 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - - assert response.status_code == 404 assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" def test_documents_doc_slug_returns_not_found( - 
data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test associations - response = data_client.get( - "/api/v1/documents/DocSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for DocSlug100" + assert json_response["detail"] == "Nothing found for DocSlug100" @pytest.mark.parametrize( @@ -307,15 +309,20 @@ def test_documents_doc_slug_returns_not_found( ], ) def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - assert response.status_code == 200 assert len(json_response) == 2 family = json_response["family"] @@ -330,8 +337,7 @@ def test_documents_doc_slug_preexisting_objects( def test_documents_doc_slug_when_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute( @@ -339,9 +345,11 @@ def test_documents_doc_slug_when_deleted( .where(FamilyDocument.import_id == "CCLW.executive.2.2") .values(document_status="Deleted") ) - response = data_client.get( - "/api/v1/documents/DocSlug2", + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - assert response.status_code == 404 assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 0539d7400bffbb04b7bf750e69a0c5e682a6374a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:32:48 +0000 Subject: [PATCH 02/26] Driveby: Add CORS tests for MCF --- tests/non_search/routers/lookups/test_cors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/non_search/routers/lookups/test_cors.py b/tests/non_search/routers/lookups/test_cors.py index b7e15002..3a67e483 100644 --- a/tests/non_search/routers/lookups/test_cors.py +++ b/tests/non_search/routers/lookups/test_cors.py @@ -19,6 +19,13 @@ ("https://app.devclimatepolicyradar.com", False), # prefixed wrong domain ("https://app-climatepolicyradar.com", False), # prefixed wrong domain ("https://prefix-climate-laws.org", False), # climate laws prefixed domain + ("https://.climateprojectexplorer.org", False), # empty subdomain + ("https://prefix-climateprojectexplorer.org", False), # MCF prefixed domain + ("https://climateprojectexplorer.org", True), # base MCF URL + ( + "https://preview.climateprojectexplorer.org", + True, + ), # MCF subdomain URL ], ) def test_cors_regex(test_client, origin, should_be_allowed): From 4a164d8c3a7e50fc9dab5d25fa0fc1d26f83af93 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:13 +0000 Subject: [PATCH 03/26] Update slug lookup query to respect allowed corpora --- app/repository/document.py | 55 +++++++++++++++++++++++++----- 
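A rough sketch of the substitution this commit performs when restricting the slug lookup to allowed corpora; the template string is abbreviated and the slug and corpus ID are invented for illustration (the full query is added in app/repository/sql/slug_lookup.sql later in this patch):

# Hypothetical, abbreviated template standing in for slug_lookup.sql.
template = (
    "SELECT slug.family_document_import_id, slug.family_import_id "
    "FROM slug ... WHERE slug.name = '{slug_name}' "
    "AND corpus.import_id IN ({allowed_corpora_ids})"
)

allowed_corpora_ids = ["CCLW.corpus.i00000001.n0000"]
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"

query = template.replace("{slug_name}", "FamSlug1").replace(
    "{allowed_corpora_ids}", corpora_ids
)
# The rendered query only matches the slug if its family (or its document's
# family) belongs to one of the allowed corpora.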
app/repository/sql/slug_lookup.sql | 20 +++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) create mode 100644 app/repository/sql/slug_lookup.sql diff --git a/app/repository/document.py b/app/repository/document.py index c97d9744..e5e53ed1 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -5,7 +5,9 @@ """ import logging +import os from datetime import datetime +from functools import lru_cache from typing import Optional, Sequence, cast from db_client.models.dfce.collection import Collection, CollectionFamily @@ -40,11 +42,38 @@ _LOGGER = logging.getLogger(__file__) +@lru_cache() +def _get_query_template(): + """Read query for non-deleted docs and their associated data.""" + with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file: + return file.read() + + +def get_slugged_object_from_allowed_corpora_query( + template_query, slug_name: str, allowed_corpora_ids: list[str] +) -> str: + """Create download whole database query, replacing variables. + + :param str ingest_cycle_start: The current ingest cycle date. + :param list[str] allowed_corpora_ids: The corpora from which we + should allow the data to be dumped. + :return str: The SQL query to perform on the database session. + """ + corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" + return template_query.replace("{slug_name}", slug_name).replace( # type: ignore + "{allowed_corpora_ids}", corpora_ids + ) # type: ignore + + def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: """Match the slug name to a FamilyDocument or Family import ID. + This function also contains logic to only get the import ID for the + family or document if the slug given is associated with a family + that belongs to the list of allowed corpora. + :param Session db: connection to db :param str slug: slug name to match :param Optional[list[str]] allowed_corpora: The corpora IDs to look @@ -52,18 +81,28 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( - Slug.name == slug - ) - - # # Only get slug for the fam/doc if it belongs to the list of allowed corpora. 
- # if allowed_corpora is not None: - # query = query.filter() + if allowed_corpora is not None: + query_template = _get_query_template() + query = get_slugged_object_from_allowed_corpora_query( + query_template, slug, allowed_corpora + ) + query = db.execute(query) + else: + query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( + Slug.name == slug + ) result = query.one_or_none() if result is None: return (None, None) - return result + + DOC_INDEX = 0 + doc_id = cast(str, result[DOC_INDEX]) if result[DOC_INDEX] is not None else None + + FAM_INDEX = 1 + fam_id = cast(str, result[FAM_INDEX]) if result[FAM_INDEX] is not None else None + + return doc_id, fam_id def get_family_document_and_context( diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql new file mode 100644 index 00000000..9d649067 --- /dev/null +++ b/app/repository/sql/slug_lookup.sql @@ -0,0 +1,20 @@ +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family ON family.import_id = slug.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}) + +UNION + +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id +LEFT JOIN family ON family.import_id = family_document.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}); From b43aabf8ec6f73426098cd41031297318b4448b1 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:38 +0000 Subject: [PATCH 04/26] Include actual CCLW corpus ID in test token --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index d897290d..02c2d85c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,7 +117,7 @@ def valid_token(monkeypatch): def mock_return(_, __, ___): return True - corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0" + corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000" subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" From e2e2d5cbf10a4cc0ff382a62babbdce0b50741c7 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:46:12 +0000 Subject: [PATCH 05/26] Bump to 1.19.11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85e6896a..0a2c8f23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.10" +version = "1.19.11" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 45fa716fc11bac6c70c7a3f4fdac39d081370a97 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:50:52 +0000 Subject: [PATCH 06/26] Refactor _get_query_template --- app/repository/document.py | 13 ++++--------- app/repository/download.py | 8 -------- app/repository/helpers.py | 14 ++++++++++++++ app/repository/pipeline.py | 11 ++--------- 
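A minimal, self-contained sketch of the caching behaviour the shared get_query_template helper relies on (the helper itself is added below in app/repository/helpers.py; the temporary file here just stands in for the real SQL templates):

import tempfile
from functools import lru_cache


@lru_cache()
def get_query_template(filepath: str) -> str:
    """Read a SQL template once and cache it for subsequent callers."""
    with open(filepath, "r") as file:
        return file.read()


# Write a throwaway template so the sketch runs on its own.
with tempfile.NamedTemporaryFile("w", suffix=".sql", delete=False) as tmp:
    tmp.write("SELECT 1;")
    path = tmp.name

first = get_query_template(path)
second = get_query_template(path)
assert first is second  # second call is served from the cache, not from disk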
app/service/download.py | 8 ++++++-- 5 files changed, 26 insertions(+), 28 deletions(-) create mode 100644 app/repository/helpers.py diff --git a/app/repository/document.py b/app/repository/document.py index e5e53ed1..57579a26 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -7,7 +7,6 @@ import logging import os from datetime import datetime -from functools import lru_cache from typing import Optional, Sequence, cast from db_client.models.dfce.collection import Collection, CollectionFamily @@ -36,19 +35,13 @@ LinkableFamily, ) from app.repository.geography import get_geo_subquery +from app.repository.helpers import get_query_template from app.repository.lookups import doc_type_from_family_document_metadata from app.service.util import to_cdn_url _LOGGER = logging.getLogger(__file__) -@lru_cache() -def _get_query_template(): - """Read query for non-deleted docs and their associated data.""" - with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file: - return file.read() - - def get_slugged_object_from_allowed_corpora_query( template_query, slug_name: str, allowed_corpora_ids: list[str] ) -> str: @@ -82,7 +75,9 @@ def get_slugged_objects( import id or the Family import_id. """ if allowed_corpora is not None: - query_template = _get_query_template() + query_template = get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) query = get_slugged_object_from_allowed_corpora_query( query_template, slug, allowed_corpora ) diff --git a/app/repository/download.py b/app/repository/download.py index 69938182..b0381e8d 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -1,7 +1,5 @@ """Functions to support browsing the RDS document structure""" -import os -from functools import lru_cache from logging import getLogger import pandas as pd @@ -12,12 +10,6 @@ _LOGGER = getLogger(__name__) -@lru_cache() -def _get_query_template(): - with open(os.path.join("app", "repository", "sql", "download.sql"), "r") as file: - return file.read() - - def get_whole_database_dump(query, db=Depends(get_db)): with db.connection() as conn: df = pd.read_sql(query, conn.connection) diff --git a/app/repository/helpers.py b/app/repository/helpers.py new file mode 100644 index 00000000..e976683b --- /dev/null +++ b/app/repository/helpers.py @@ -0,0 +1,14 @@ +""" +Functions to support the documents endpoints + +old functions (non DFC) are moved to the deprecated_documents.py file. 
+""" + +from functools import lru_cache + + +@lru_cache() +def get_query_template(filepath: str) -> str: + """Read query for non-deleted docs and their associated data.""" + with open(filepath, "r") as file: + return file.read() diff --git a/app/repository/pipeline.py b/app/repository/pipeline.py index 0eebaff4..d9eac447 100644 --- a/app/repository/pipeline.py +++ b/app/repository/pipeline.py @@ -1,7 +1,6 @@ import logging import os from datetime import datetime, timezone -from functools import lru_cache from typing import Sequence, cast import pandas as pd @@ -11,19 +10,13 @@ from app.clients.db.session import get_db from app.models.document import DocumentParserInput +from app.repository.helpers import get_query_template _LOGGER = logging.getLogger(__name__) MetadataType = dict[str, list[str]] -@lru_cache() -def generate_pipeline_ingest_input_query(): - """Read query for non-deleted docs and their associated data.""" - with open(os.path.join("app", "repository", "sql", "pipeline.sql"), "r") as file: - return file.read() - - def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: """Get non-deleted docs and their associated data from the db. @@ -39,7 +32,7 @@ def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: in database. """ _LOGGER.info("Running pipeline query") - query = generate_pipeline_ingest_input_query() + query = get_query_template(os.path.join("app", "repository", "sql", "pipeline.sql")) df = pd.read_sql(query, db.connection().connection) return df diff --git a/app/service/download.py b/app/service/download.py index dad6078d..189b70fb 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,5 +1,6 @@ """Functions to support browsing the RDS document structure""" +import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -9,7 +10,8 @@ from fastapi import Depends from app.clients.db.session import get_db -from app.repository.download import _get_query_template, get_whole_database_dump +from app.repository.download import get_whole_database_dump +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) @@ -90,7 +92,9 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = _get_query_template() + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) csv_buffer = generate_data_dump_as_csv(query, db) From 6010fce97e0ace4aefead684f9d248d1a716d52f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:53:51 +0000 Subject: [PATCH 07/26] Refactor doc and fam lookup tests --- .../documents/test_document_families.py | 48 ----- .../routers/documents/test_get_document.py | 178 ++++++++++++++++++ ...ocument_families.py => test_get_family.py} | 124 ------------ 3 files changed, 178 insertions(+), 172 deletions(-) delete mode 100644 tests/non_search/routers/documents/test_document_families.py create mode 100644 tests/non_search/routers/documents/test_get_document.py rename tests/non_search/routers/documents/{test_get_document_families.py => test_get_family.py} (65%) diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py deleted file mode 100644 index 8478fc50..00000000 --- 
a/tests/non_search/routers/documents/test_document_families.py +++ /dev/null @@ -1,48 +0,0 @@ -from db_client.models.document.physical_document import PhysicalDocumentLanguage -from fastapi.testclient import TestClient -from sqlalchemy import update -from sqlalchemy.orm import Session - -from tests.non_search.routers.documents.setup_doc_fam_lookup import ( - _make_doc_fam_lookup_request, -) -from tests.non_search.setup_helpers import ( - setup_with_two_docs, - setup_with_two_docs_multiple_languages, -) - - -def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == ["eng"] - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") - document = json_response["document"] - assert document["languages"] == [] - - -def test_physical_doc_languages_not_visible( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == [] - - -def test_physical_doc_multiple_languages( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs_multiple_languages(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert set(document["languages"]) == set(["fra", "eng"]) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py new file mode 100644 index 00000000..f7b0f665 --- /dev/null +++ b/tests/non_search/routers/documents/test_get_document.py @@ -0,0 +1,178 @@ +import pytest +from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from db_client.models.document.physical_document import PhysicalDocumentLanguage +from fastapi import status +from fastapi.testclient import TestClient +from sqlalchemy import update +from sqlalchemy.orm import Session + +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) +from tests.non_search.setup_helpers import ( + setup_with_docs, + setup_with_two_docs, + setup_with_two_docs_multiple_languages, +) + +N_FAMILY_OVERVIEW_KEYS = 8 +N_DOCUMENT_KEYS = 12 + + +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == ["eng"] + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") + document = json_response["document"] + assert document["languages"] == [] + + +def test_physical_doc_languages_not_visible( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == [] + + +def 
test_physical_doc_multiple_languages( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs_multiple_languages(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert set(document["languages"]) == set(["fra", "eng"]) + + +def test_documents_doc_slug_returns_not_found( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_docs(data_db) + assert data_db.query(Family).count() == 1 + assert data_db.query(FamilyEvent).count() == 1 + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug100" + + +@pytest.mark.parametrize( + ("slug", "expected_fam", "expected_doc"), + [ + ( + "DocSlug1", + { + "title": "Fam1", + "import_id": "CCLW.family.1001.0", + "geographies": ["South Asia"], + "category": "Executive", + "slug": "FamSlug1", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.1.2", + "variant": "Original Language", + "slug": "DocSlug1", + "title": "Document1", + "md5_sum": "111", + "cdn_object": None, + "content_type": "application/pdf", + "source_url": "http://somewhere1", + "language": "eng", + "languages": ["eng"], + "document_type": "Plan", + "document_role": "MAIN", + }, + ), + ( + "DocSlug2", + { + "title": "Fam2", + "import_id": "CCLW.family.2002.0", + "geographies": ["AFG", "IND"], + "category": "Executive", + "slug": "FamSlug2", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.2.2", + "variant": None, + "slug": "DocSlug2", + "title": "Document2", + "md5_sum": None, + "cdn_object": None, + "content_type": None, + "source_url": "http://another_somewhere", + "language": "", + "languages": [], + "document_type": "Order", + "document_role": "MAIN", + }, + ), + ], +) +def test_documents_doc_slug_preexisting_objects( + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, +): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, + ) + assert len(json_response) == 2 + + family = json_response["family"] + assert family + assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS + assert family == expected_fam + + doc = json_response["document"] + assert doc + assert len(doc) == N_DOCUMENT_KEYS + assert doc == expected_doc + + +def test_documents_doc_slug_when_deleted( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute( + update(FamilyDocument) + .where(FamilyDocument.import_id == "CCLW.executive.2.2") + .values(document_status="Deleted") + ) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_family.py similarity index 65% rename from tests/non_search/routers/documents/test_get_document_families.py rename to 
tests/non_search/routers/documents/test_get_family.py index e34f6051..ecdc620f 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -15,7 +15,6 @@ ) N_FAMILY_KEYS = 15 -N_FAMILY_OVERVIEW_KEYS = 8 N_DOCUMENT_KEYS = 12 @@ -230,126 +229,3 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" - - -def test_documents_doc_slug_returns_not_found( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_docs(data_db) - assert data_db.query(Family).count() == 1 - assert data_db.query(FamilyEvent).count() == 1 - - # Test associations - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug100", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "Nothing found for DocSlug100" - - -@pytest.mark.parametrize( - ("slug", "expected_fam", "expected_doc"), - [ - ( - "DocSlug1", - { - "title": "Fam1", - "import_id": "CCLW.family.1001.0", - "geographies": ["South Asia"], - "category": "Executive", - "slug": "FamSlug1", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.1.2", - "variant": "Original Language", - "slug": "DocSlug1", - "title": "Document1", - "md5_sum": "111", - "cdn_object": None, - "content_type": "application/pdf", - "source_url": "http://somewhere1", - "language": "eng", - "languages": ["eng"], - "document_type": "Plan", - "document_role": "MAIN", - }, - ), - ( - "DocSlug2", - { - "title": "Fam2", - "import_id": "CCLW.family.2002.0", - "geographies": ["AFG", "IND"], - "category": "Executive", - "slug": "FamSlug2", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.2.2", - "variant": None, - "slug": "DocSlug2", - "title": "Document2", - "md5_sum": None, - "cdn_object": None, - "content_type": None, - "source_url": "http://another_somewhere", - "language": "", - "languages": [], - "document_type": "Order", - "document_role": "MAIN", - }, - ), - ], -) -def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, - data_db: Session, - slug, - expected_fam, - expected_doc, - valid_token, -): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - slug, - ) - assert len(json_response) == 2 - - family = json_response["family"] - assert family - assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS - assert family == expected_fam - - doc = json_response["document"] - assert doc - assert len(doc) == N_DOCUMENT_KEYS - assert doc == expected_doc - - -def test_documents_doc_slug_when_deleted( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute( - update(FamilyDocument) - .where(FamilyDocument.import_id == "CCLW.executive.2.2") - .values(document_status="Deleted") - ) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug2", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 7fee18f2a90a9c00fce394e013ca431c7c9025a6 Mon Sep 17 00:00:00 2001 From: Katy Baulch 
<46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:19 +0000 Subject: [PATCH 08/26] Add integration tests for doc/fam lookup when corpora mismatch --- .../routers/documents/test_get_document.py | 15 +++++++++++++++ .../routers/documents/test_get_family.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py index f7b0f665..dafa1642 100644 --- a/tests/non_search/routers/documents/test_get_document.py +++ b/tests/non_search/routers/documents/test_get_document.py @@ -176,3 +176,18 @@ def test_documents_doc_slug_when_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" + + +def test_documents_doc_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "DocSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug1" diff --git a/tests/non_search/routers/documents/test_get_family.py b/tests/non_search/routers/documents/test_get_family.py index ecdc620f..e149f2ca 100644 --- a/tests/non_search/routers/documents/test_get_family.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -229,3 +229,18 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" + + +def test_documents_family_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs_one_family(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for FamSlug1" From 2a830dbe9e8d1b208f3592d9b55d9eef04f87e11 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:31 +0000 Subject: [PATCH 09/26] Add alternative corpora token --- tests/conftest.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 02c2d85c..6d6fb4cb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,6 +127,30 @@ def mock_return(_, __, ___): return af.create_configuration_token(input_str) +@pytest.fixture +def alternative_token(monkeypatch): + """Generate a valid alternative config token using TOKEN_SECRET_KEY. + + Need to generate the config token using the token secret key from + your local env file. For tests in CI, this will be the secret key in + the .env.example file, but for local development this secret key + might be different (e.g., the one for staging). This fixture works + around this. 
+ """ + + def mock_return(_, __, ___): + return True + + corpora_ids = "UNFCCC.corpus.i00000001.n0000" + subject = "CPR" + audience = "localhost" + input_str = f"{corpora_ids};{subject};{audience}" + + af = AppTokenFactory() + monkeypatch.setattr(custom_app.AppTokenFactory, "validate", mock_return) + return af.create_configuration_token(input_str) + + @pytest.fixture def create_test_db(): """Create a test database and use it for the whole test session.""" From 024d60bbbbb266b57bcc5a48f4f2ac23e64808b0 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:21:14 +0000 Subject: [PATCH 10/26] Refactor download code --- app/repository/download.py | 29 ++++++++++++++++++++++++++++- app/service/download.py | 33 ++++++--------------------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/app/repository/download.py b/app/repository/download.py index b0381e8d..1ed90396 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -1,16 +1,43 @@ """Functions to support browsing the RDS document structure""" +import os from logging import getLogger import pandas as pd from fastapi import Depends from app.clients.db.session import get_db +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def get_whole_database_dump(query, db=Depends(get_db)): +def create_query( + template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] +) -> str: + """Create download whole database query, replacing variables. + + :param str ingest_cycle_start: The current ingest cycle date. + :param list[str] allowed_corpora_ids: The corpora from which we + should allow the data to be dumped. + :return str: The SQL query to perform on the database session. + """ + corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" + return template_query.replace( # type: ignore + "{ingest_cycle_start}", ingest_cycle_start + ).replace( + "{allowed_corpora_ids}", corpora_ids + ) # type: ignore + + +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) + query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) + with db.connection() as conn: df = pd.read_sql(query, conn.connection) return df diff --git a/app/service/download.py b/app/service/download.py index 189b70fb..d5110b01 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,6 +1,5 @@ """Functions to support browsing the RDS document structure""" -import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -11,29 +10,10 @@ from app.clients.db.session import get_db from app.repository.download import get_whole_database_dump -from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. 
- """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def replace_slug_with_qualified_url( df: pd.DataFrame, public_app_url: str, @@ -63,8 +43,10 @@ def convert_dump_to_csv(df: pd.DataFrame): return csv_buffer -def generate_data_dump_as_csv(query, db=Depends(get_db)): - df = get_whole_database_dump(query, db) +def generate_data_dump_as_csv( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + df = get_whole_database_dump(ingest_cycle_start, allowed_corpora_ids, db) csv = convert_dump_to_csv(df) csv.seek(0) return csv @@ -92,11 +74,8 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") - ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) - csv_buffer = generate_data_dump_as_csv(query, db) + + csv_buffer = generate_data_dump_as_csv(ingest_cycle_start, allowed_corpora_ids, db) zip_buffer = BytesIO() with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: From c4a50b5d8b1615f46accc1cb448dccc53988b559 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:17:24 +0000 Subject: [PATCH 11/26] Geo --- app/api/api_v1/routers/geographies.py | 21 ++- app/repository/document.py | 24 ++-- app/repository/geography.py | 75 +++++++++- app/repository/sql/world_map.sql | 86 +++++++++++ makefile-docker.defs | 2 +- tests/conftest.py | 4 +- tests/non_search/conftest.py | 45 ------ .../geographies/setup_world_map_helpers.py | 61 +++++++- .../geographies/test_world_map_summary.py | 136 +++++++++++++----- 9 files changed, 348 insertions(+), 106 deletions(-) create mode 100644 app/repository/sql/world_map.sql delete mode 100644 tests/non_search/conftest.py diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/geographies.py index 6eff0cda..b92084d3 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/geographies.py @@ -1,11 +1,13 @@ import logging +from typing import Annotated -from fastapi import APIRouter, Depends, HTTPException, status +from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from app.clients.db.session import get_db from app.errors import RepositoryError from app.models.geography import GeographyStatsDTO from app.repository.geography import get_world_map_stats +from app.service.custom_app import AppTokenFactory _LOGGER = logging.getLogger(__file__) @@ -13,12 +15,23 @@ @geographies_router.get("/geographies", response_model=list[GeographyStatsDTO]) -async def geographies(db=Depends(get_db)): +async def geographies( + request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) +): """Get a summary of fam stats for all geographies for world map.""" - _LOGGER.info("Getting detailed information on all geographies") + _LOGGER.info( + "Getting world map counts for all geographies", + extra={ + "props": {"app_token": str(app_token)}, + }, + ) + + # Decode the app token and validate it. 
+ token = AppTokenFactory() + token.decode_and_validate(db, request, app_token) try: - world_map_stats = get_world_map_stats(db) + world_map_stats = get_world_map_stats(db, token.allowed_corpora_ids) if world_map_stats == []: _LOGGER.error("No stats for world map found") diff --git a/app/repository/document.py b/app/repository/document.py index 57579a26..8a3075f2 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -23,6 +23,7 @@ from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation from sqlalchemy import func +from sqlalchemy.engine import Result from sqlalchemy.orm import Session from app.models.document import ( @@ -43,8 +44,8 @@ def get_slugged_object_from_allowed_corpora_query( - template_query, slug_name: str, allowed_corpora_ids: list[str] -) -> str: + db: Session, slug_name: str, allowed_corpora_ids: list[str] +) -> Result: """Create download whole database query, replacing variables. :param str ingest_cycle_start: The current ingest cycle date. @@ -52,10 +53,13 @@ def get_slugged_object_from_allowed_corpora_query( should allow the data to be dumped. :return str: The SQL query to perform on the database session. """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace("{slug_name}", slug_name).replace( # type: ignore - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore + query_template = get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + return db.execute( + query_template, + {"slug_name": slug_name, "allowed_corpora_ids": allowed_corpora_ids}, + ) def get_slugged_objects( @@ -75,13 +79,7 @@ def get_slugged_objects( import id or the Family import_id. """ if allowed_corpora is not None: - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") - ) - query = get_slugged_object_from_allowed_corpora_query( - query_template, slug, allowed_corpora - ) - query = db.execute(query) + query = get_slugged_object_from_allowed_corpora_query(db, slug, allowed_corpora) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/geography.py b/app/repository/geography.py index e4b23a37..d0eea962 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -4,13 +4,16 @@ from typing import Optional, Sequence from db_client.models.dfce.family import ( + Corpus, Family, + FamilyCorpus, FamilyDocument, FamilyGeography, FamilyStatus, ) from db_client.models.dfce.geography import Geography from sqlalchemy import func +from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session @@ -63,7 +66,9 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _db_count_fams_in_category_and_geo(db: Session) -> Query: +def _db_count_fams_in_category_and_geo( + db: Session, allowed_corpora: Optional[list[str]] +) -> Query: """ Query the database for the fam count per category per geo. @@ -72,6 +77,8 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: returned. :param Session db: DB Session to perform query on. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. :return Query: A Query object containing the queries to perform. 
""" # Get the required Geography information and cross join each with all of the unique @@ -94,10 +101,18 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: func.count().label("records_count"), ) .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) + # .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) + # .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) .filter(Family.family_status == FamilyStatus.PUBLISHED) .group_by(Family.family_category, FamilyGeography.geography_id) .subquery("counts") ) + # if allowed_corpora is not None: + # counts = counts.where(Corpus.import_id.in_(allowed_corpora)) + # counts = counts.group_by( + # Family.family_category, FamilyGeography.geography_id + # ).subquery("counts") + # _LOGGER.info(counts) # Aggregate family_category counts per geography into a JSONB object, and if a # family_category count is missing, set the count for that category to 0 so each @@ -130,6 +145,9 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: ) .order_by(geo_family_combinations.c.display_value) ) + print(query.statement.compile(dialect=postgresql.dialect())) + # print(str(query.statement.compile(compile_kwargs={"literal_binds": True}))) + print(render_query(query, db)) return query @@ -150,15 +168,19 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: ) -def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]: +def get_world_map_stats( + db: Session, allowed_corpora: Optional[list[str]] +) -> list[GeographyStatsDTO]: """ Get a count of fam per category per geography for all geographies. :param db Session: The database session. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. :return list[GeographyStatsDTO]: A list of Geography stats objects """ try: - family_geo_stats = _db_count_fams_in_category_and_geo(db).all() + family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora).all() except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") @@ -168,3 +190,50 @@ def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]: result = [_to_dto(fgs) for fgs in family_geo_stats] return result + + +from datetime import date, datetime, timedelta + +from sqlalchemy.orm import Query + + +def render_query(statement, db_session): + """ + Generate an SQL expression string with bound parameters rendered inline + for the given SQLAlchemy statement. + WARNING: This method of escaping is insecure, incomplete, and for debugging + purposes only. Executing SQL statements with inline-rendered user values is + extremely insecure. 
+ Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query + """ + if isinstance(statement, Query): + statement = statement.statement + dialect = db_session.bind.dialect + + class LiteralCompiler(dialect.statement_compiler): + def visit_bindparam( + self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs + ): + return self.render_literal_value(bindparam.value, bindparam.type) + + def render_array_value(self, val, item_type): + if isinstance(val, list): + return "{}".format( + ",".join([self.render_array_value(x, item_type) for x in val]) + ) + return self.render_literal_value(val, item_type) + + def render_literal_value(self, value, type_): + if isinstance(value, int): + return str(value) + elif isinstance(value, (str, date, datetime, timedelta)): + return "'{}'".format(str(value).replace("'", "''")) + elif isinstance(value, list): + return "'{{{}}}'".format( + ",".join( + [self.render_array_value(x, type_.item_type) for x in value] + ) + ) + return super(LiteralCompiler, self).render_literal_value(value, type_) + + return LiteralCompiler(dialect, statement).process(statement) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql new file mode 100644 index 00000000..09a9db7e --- /dev/null +++ b/app/repository/sql/world_map.sql @@ -0,0 +1,86 @@ +SELECT + geo_family_combinations.display_value AS display_value, + geo_family_combinations.slug AS slug, + geo_family_combinations.value AS value, + jsonb_object_agg ( + geo_family_combinations.family_category, + coalesce(counts.records_count, 0) + ) AS counts +FROM + ( + SELECT + geography.id AS geography_id, + geography.display_value AS display_value, + geography.slug AS slug, + geography.value AS value, + anon_1.family_category AS family_category + FROM + geography, + ( + SELECT DISTINCT + family.family_category AS family_category + FROM + family + ) AS anon_1 + ) AS geo_family_combinations + LEFT OUTER JOIN ( + SELECT + family.family_category AS family_category, + family_geography.geography_id AS geography_id, + count(*) AS records_count + FROM + family + JOIN family_geography ON family.import_id = family_geography.family_import_id + WHERE + CASE + WHEN ( + NOT ( + EXISTS ( + SELECT + 1 + FROM + family_document + WHERE + family.import_id = family_document.family_import_id + ) + ) + ) THEN 'Created' + ELSE CASE + WHEN ( + ( + SELECT + count(family_document.document_status) AS count_1 + FROM + family_document + WHERE + family_document.family_import_id = family.import_id + AND family_document.document_status = 'PUBLISHED' + ) > 0 + ) THEN 'Published' + ELSE CASE + WHEN ( + ( + SELECT + count(family_document.document_status) AS count_2 + FROM + family_document + WHERE + family_document.family_import_id = family.import_id + AND family_document.document_status = 'CREATED' + ) > 0 + ) THEN 'Created' + ELSE 'Deleted' + END + END + END = 'Published' + GROUP BY + family.family_category, + family_geography.geography_id + ) AS counts ON geo_family_combinations.geography_id = counts.geography_id + AND geo_family_combinations.family_category = counts.family_category +GROUP BY + geo_family_combinations.display_value, + geo_family_combinations.slug, + geo_family_combinations.value +ORDER BY + geo_family_combinations.display_value; diff --git a/makefile-docker.defs b/makefile-docker.defs index 276a67f0..bcf2a0a2 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend 
pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/geographies/test_world_map_summary.py ${ARGS} # ---------------------------------- # tasks diff --git a/tests/conftest.py b/tests/conftest.py index 6d6fb4cb..a60cfa3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,7 +117,7 @@ def valid_token(monkeypatch): def mock_return(_, __, ___): return True - corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000" + corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000" subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" @@ -142,7 +142,7 @@ def mock_return(_, __, ___): return True corpora_ids = "UNFCCC.corpus.i00000001.n0000" - subject = "CPR" + subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" diff --git a/tests/non_search/conftest.py b/tests/non_search/conftest.py deleted file mode 100644 index 114d68d9..00000000 --- a/tests/non_search/conftest.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest -from db_client.models.dfce import Geography -from db_client.models.organisation import Organisation - - -@pytest.fixture -def summary_geography_family_data(test_db): - geos = [ - Geography( - display_value="A place on the land", slug="a-place-on-the-land", value="XXX" - ), - Geography( - display_value="A place in the sea", slug="a-place-in-the-sea", value="YYY" - ), - ] - organisations = [Organisation(name="test org")] - - test_db.add_all(geos) - test_db.add_all(organisations) - test_db.flush() - - # Now setup the Documents/Families - - ## WORKING HERE - documents = [] - families = [] - - test_db.add_all(documents) - test_db.add_all(families) - test_db.flush() - - # Now some events - events = [] - - test_db.add_all(events) - - test_db.commit() - yield { - "db": test_db, - "docs": documents, - "families": documents, - "geos": geos, - "organisations": organisations, - "events": events, - } diff --git a/tests/non_search/routers/geographies/setup_world_map_helpers.py b/tests/non_search/routers/geographies/setup_world_map_helpers.py index 1752d0e2..b46f9f91 100644 --- a/tests/non_search/routers/geographies/setup_world_map_helpers.py +++ b/tests/non_search/routers/geographies/setup_world_map_helpers.py @@ -1,4 +1,7 @@ +from typing import Optional + from db_client.functions.dfce_helpers import add_collections, add_document, add_families +from fastapi import status from sqlalchemy.orm import Session from tests.non_search.setup_helpers import ( @@ -6,6 +9,26 @@ get_default_documents, ) +WORLD_MAP_STATS_ENDPOINT = "/api/v1/geographies" +TEST_HOST = "http://localhost:3000/" + + +def _make_world_map_lookup_request( + client, + token, + expected_status_code: int = status.HTTP_200_OK, + origin: Optional[str] = TEST_HOST, +): + headers = ( + {"app-token": token} + if origin is None + else {"app-token": token, "origin": origin} + ) + + response = client.get(f"{WORLD_MAP_STATS_ENDPOINT}", headers=headers) + assert response.status_code == expected_status_code, response.text + return response.json() + def _add_published_fams_and_docs(db: Session): # Collection @@ -13,6 +36,27 @@ def _add_published_fams_and_docs(db: Session): add_collections(db, collections=[collection1]) # Family + Document + events + document0 = { + "title": "UnfcccDocument0", + "slug": 
"UnfcccDocSlug0", + "md5_sum": None, + "url": "http://another_somewhere", + "content_type": None, + "import_id": "UNFCCC.executive.3.3", + "language_variant": None, + "status": "PUBLISHED", + "metadata": {"role": ["MAIN"], "type": ["Order"]}, + "languages": [], + "events": [ + { + "import_id": "UNFCCC.Event.3.0", + "title": "Published", + "date": "2019-12-25", + "type": "Passed/Approved", + "status": "OK", + } + ], + } document1, document2 = get_default_documents() document3 = { "title": "Document3", @@ -36,6 +80,20 @@ def _add_published_fams_and_docs(db: Session): ], } + unfccc_fam = { + "import_id": "UNFCCC.family.0000.0", + "corpus_import_id": "UNFCCC.corpus.i00000001.n0000", + "title": "UnfcccFam0", + "slug": "UnfcccFamSlug0", + "description": "Summary0", + "geography_id": [2, 5], + "category": "UNFCCC", + "documents": [], + "metadata": { + "size": "small", + "color": "blue", + }, + } family0 = { "import_id": "CCLW.family.0000.0", "corpus_import_id": "CCLW.corpus.i00000001.n0000", @@ -93,11 +151,12 @@ def _add_published_fams_and_docs(db: Session): }, } + unfccc_fam["documents"] = [document0] family0["documents"] = [] family1["documents"] = [document1] family2["documents"] = [document2] family3["documents"] = [document3] - add_families(db, families=[family0, family1, family2, family3]) + add_families(db, families=[family0, family1, family2, family3, unfccc_fam]) def setup_all_docs_published_world_map(db: Session): diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index 92bea770..c60c6d48 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -1,9 +1,16 @@ import pytest -from db_client.models.dfce.family import Family, FamilyGeography, FamilyStatus +from db_client.models.dfce.family import ( + Corpus, + Family, + FamilyCorpus, + FamilyGeography, + FamilyStatus, +) from db_client.models.dfce.geography import Geography from fastapi import status from tests.non_search.routers.geographies.setup_world_map_helpers import ( + _make_world_map_lookup_request, setup_all_docs_published_world_map, setup_mixed_doc_statuses_world_map, ) @@ -11,12 +18,8 @@ EXPECTED_NUM_FAM_CATEGORIES = 3 -def _get_expected_keys(): - return ["display_name", "iso_code", "slug", "family_counts"] - - -def _url_under_test() -> str: - return "/api/v1/geographies" +def _test_has_expected_keys(keys: list[str]) -> bool: + return set(["display_name", "iso_code", "slug", "family_counts"]) == set(keys) def _find_geography_index(lst, key, value): @@ -34,8 +37,8 @@ def test_geo_table_populated(data_db): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 1), - ("Afghanistan", 0, 0, 1), + ("India", 1, 1, 2), + ("Afghanistan", 0, 0, 2), ], ) def test_endpoint_returns_ok_all_docs_per_family_published( @@ -45,20 +48,18 @@ def test_endpoint_returns_ok_all_docs_per_family_published( expected_exec, expected_leg, expected_unfccc, + valid_token, ): """Check endpoint returns 200 on success""" setup_all_docs_published_world_map(data_db) - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_200_OK - resp_json = response.json() + + resp_json = _make_world_map_lookup_request(data_client, valid_token) assert len(resp_json) > 1 idx = _find_geography_index(resp_json, "display_name", geo_display_value) resp = resp_json[idx] - assert set(["display_name", 
"iso_code", "slug", "family_counts"]) == set( - resp.keys() - ) + assert _test_has_expected_keys(resp.keys()) family_geos = ( data_db.query(Family) @@ -72,20 +73,20 @@ def test_endpoint_returns_ok_all_docs_per_family_published( assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(family_geos) + assert resp["family_counts"]["EXECUTIVE"] == expected_exec + assert resp["family_counts"]["LEGISLATIVE"] == expected_leg + assert resp["family_counts"]["UNFCCC"] == expected_unfccc assert ( sum(resp["family_counts"].values()) == expected_exec + expected_leg + expected_unfccc ) - assert resp["family_counts"]["EXECUTIVE"] == expected_exec - assert resp["family_counts"]["LEGISLATIVE"] == expected_leg - assert resp["family_counts"]["UNFCCC"] == expected_unfccc @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 2), - ("Afghanistan", 0, 0, 1), + ("India", 0, 0, 2), + ("Afghanistan", 0, 0, 2), ], ) def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( @@ -95,20 +96,18 @@ def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( expected_exec, expected_leg, expected_unfccc, + valid_token, ): """Check endpoint returns 200 & discounts CREATED & DELETED docs""" setup_mixed_doc_statuses_world_map(data_db) - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_200_OK - resp_json = response.json() + + resp_json = _make_world_map_lookup_request(data_client, valid_token) assert len(resp_json) > 1 idx = _find_geography_index(resp_json, "display_name", geo_display_value) resp = resp_json[idx] - assert set(["display_name", "iso_code", "slug", "family_counts"]) == set( - resp.keys() - ) + assert _test_has_expected_keys(resp.keys()) fams = ( data_db.query(Family) @@ -122,27 +121,90 @@ def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(fams) + assert resp["family_counts"]["EXECUTIVE"] == expected_exec + assert resp["family_counts"]["LEGISLATIVE"] == expected_leg + assert resp["family_counts"]["UNFCCC"] == expected_unfccc assert ( sum(resp["family_counts"].values()) == expected_exec + expected_leg + expected_unfccc ) - assert resp["family_counts"]["EXECUTIVE"] == expected_exec - assert resp["family_counts"]["LEGISLATIVE"] == expected_leg - assert resp["family_counts"]["UNFCCC"] == expected_unfccc -def test_endpoint_returns_404_when_not_found(data_client): +def test_endpoint_returns_404_when_not_found(data_client, valid_token): """Test the endpoint returns a 404 when no world map stats found""" - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_404_NOT_FOUND - data = response.json() + data = _make_world_map_lookup_request( + data_client, valid_token, expected_status_code=status.HTTP_404_NOT_FOUND + ) assert data["detail"] == "No stats for world map found" @pytest.mark.skip(reason="Bad repo and rollback mocks need rewriting") -def test_endpoint_returns_503_when_error(data_client): +def test_endpoint_returns_503_when_error(data_client, valid_token): """Test the endpoint returns a 503 on db error""" - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE - data = response.json() + data = _make_world_map_lookup_request( + data_client, + valid_token, + 
expected_status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+    )
     assert data["detail"] == "Database error"
+
+
+@pytest.mark.parametrize(
+    ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"),
+    [
+        ("India", 0, 0, 1),
+        ("Afghanistan", 0, 0, 1),
+    ],
+)
+def test_endpoint_returns_different_results_with_alt_token(
+    data_db,
+    data_client,
+    geo_display_value,
+    expected_exec,
+    expected_leg,
+    expected_unfccc,
+    alternative_token,
+):
+    """Check endpoint returns 200 & only counts UNFCCC docs"""
+    setup_all_docs_published_world_map(data_db)
+
+    fam = (
+        data_db.query(Family, FamilyCorpus.corpus_import_id)
+        .filter(Family.import_id == "UNFCCC.family.0000.0")
+        .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id)
+        .one()
+    )
+    assert fam
+    print(fam)
+
+    resp_json = _make_world_map_lookup_request(data_client, alternative_token)
+    assert len(resp_json) > 1
+
+    idx = _find_geography_index(resp_json, "display_name", geo_display_value)
+    resp = resp_json[idx]
+
+    assert _test_has_expected_keys(resp.keys())
+
+    fams = (
+        data_db.query(Family.import_id)
+        .filter(Family.family_status == FamilyStatus.PUBLISHED)
+        .filter(Geography.display_value == geo_display_value)
+        .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id)
+        .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id)
+        .join(Corpus, Corpus.import_id == FamilyCorpus.corpus_import_id)
+        .join(Geography, Geography.id == FamilyGeography.geography_id)
+        .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000")
+        .all()
+    )
+    print(fams)
+
+    assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES
+    assert sum(resp["family_counts"].values()) == len(fams)
+
+    assert resp["family_counts"]["EXECUTIVE"] == expected_exec
+    assert resp["family_counts"]["LEGISLATIVE"] == expected_leg
+    assert resp["family_counts"]["UNFCCC"] == expected_unfccc
+    assert (
+        sum(resp["family_counts"].values())
+        == expected_exec + expected_leg + expected_unfccc
+    )
From 84800988649d4332de738a884fc4048dcbd422dd Mon Sep 17 00:00:00 2001
From: Katy Baulch <46493669+katybaulch@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:59:33 +0000
Subject: [PATCH 12/26] Merge in main

---
 .git-blame-ignore-revs | 4 +
 .trunk/configs/.sqlfluff | 30 +
 .trunk/trunk.yaml | 50 ++
 app/repository/document.py | 47 +-
 app/repository/download.py | 46 +-
 app/repository/geography.py | 51 --
 app/repository/helpers.py | 6 +-
 app/repository/sql/download.sql | 596 +++++++++++-------
 app/repository/sql/pipeline.sql | 503 ++++++++-------
 app/repository/sql/slug_lookup.sql | 43 +-
 app/repository/sql/world_map.sql | 82 +--
 makefile-docker.defs | 2 +-
 poetry.lock | 10 +-
 pyproject.toml | 4 +-
 .../schemas/document_passage.sd | 66 +-
 .../schemas/family_document.sd | 92 +++
 16 files changed, 979 insertions(+), 653 deletions(-)
 create mode 100644 .trunk/configs/.sqlfluff

diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 9a66c0e8..8c4c0200 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -23,3 +23,7 @@
 
 # Updating the test data file for document passages to be indent=2
 44624dcd1fa0835708bd9187a39bb0da8a31cd03
+
+# Fix SQL query formatting
+047766a85f086fc0986a6f2b49fee9d73fa219e8
+ab3476708920c5760f058ec40d14d008f94f5bad
diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff
new file mode 100644
index 00000000..de7aacd3
--- /dev/null
+++ b/.trunk/configs/.sqlfluff
@@ -0,0 +1,30 @@
+[sqlfluff]
+dialect = postgres
+exclude_rules = LT02, LT09
+
+[sqlfluff:indentation]
+indented_ctes = True + +[sqlfluff:layout:type:colon] +spacing_before = single +spacing_after = single + +[sqlfluff:layout:type:parameter] +spacing_before = touch +spacing_after = any + +[sqlfluff:rules:references.special_chars] +allow_space_in_identifier = True +additional_allowed_characters = ["/", "_", "-", "(", ")"] + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 6d746ff9..31ff5439 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,14 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sqlfluff + runtime: python + package: sqlfluff + shims: [sqlfluff] + known_good_version: 1.4.5 + # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -27,6 +35,7 @@ lint: disabled: - hadolint - oxipng + definitions: - name: bandit direct_configs: [bandit.yaml] @@ -34,6 +43,45 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sqlfluff + files: [sql, sql-j2, dml, ddl] + tools: [sqlfluff] + description: A dialect-flexible and configurable SQL linter + known_good_version: 1.4.5 + direct_configs: + - .sqlfluff + affects_cache: + - pyproject.toml + suggest_if: config_present + commands: + - name: lint + run: sqlfluff lint ${target} --format json --nofail + output: sarif + success_codes: [0] + read_output_from: stdout + parser: + runtime: python + run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + + - name: fix + version: ">=3.0.0" + run: sqlfluff fix ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + + - name: format + run: sqlfluff format ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + ignore: - linters: [ALL] paths: @@ -45,6 +93,8 @@ lint: - LICENSE.md enabled: + - sqlfluff@3.2.5: + commands: [lint, fix, format] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 diff --git a/app/repository/document.py b/app/repository/document.py index 8a3075f2..73d19d47 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Database helper functions for the documents entity.""" import logging import os @@ -22,9 +18,9 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import func -from sqlalchemy.engine import Result +from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session +from sqlalchemy.types import ARRAY, String from app.models.document import ( CollectionOverviewResponse, @@ -43,25 +39,6 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_object_from_allowed_corpora_query( - db: Session, slug_name: str, allowed_corpora_ids: list[str] -) -> Result: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. 
- :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. - """ - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") - ) - return db.execute( - query_template, - {"slug_name": slug_name, "allowed_corpora_ids": allowed_corpora_ids}, - ) - - def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: @@ -78,8 +55,22 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - if allowed_corpora is not None: - query = get_slugged_object_from_allowed_corpora_query(db, slug, allowed_corpora) + if allowed_corpora not in [None, []]: + query_template = text( + get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + ) + + query_template = query_template.bindparams( + bindparam("slug_name", type_=String), + bindparam( + "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String) + ), + ) + query = db.execute( + query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} + ) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/download.py b/app/repository/download.py index 1ed90396..33592cc0 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -5,6 +5,8 @@ import pandas as pd from fastapi import Depends +from sqlalchemy import bindparam, text +from sqlalchemy.types import ARRAY, DATETIME, String from app.clients.db.session import get_db from app.repository.helpers import get_query_template @@ -12,32 +14,34 @@ _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we + :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. - :return str: The SQL query to perform on the database session. + :return pd.DataFrame: A DataFrame containing the results of the SQL + query that gets the whole database dump in our desired format. 
""" - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - -def get_whole_database_dump( - ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) -): - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") + query = text( + get_query_template(os.path.join("app", "repository", "sql", "download.sql")) + ).bindparams( + bindparam("ingest_cycle_start", type_=DATETIME), + bindparam( + "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String) + ), ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) with db.connection() as conn: - df = pd.read_sql(query, conn.connection) + result = conn.execute( + query, + { + "ingest_cycle_start": ingest_cycle_start, + "allowed_corpora_ids": allowed_corpora_ids, + }, + ) + columns = result.keys() + df = pd.DataFrame(result.fetchall(), columns=columns) return df diff --git a/app/repository/geography.py b/app/repository/geography.py index d0eea962..b035fd70 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -4,9 +4,7 @@ from typing import Optional, Sequence from db_client.models.dfce.family import ( - Corpus, Family, - FamilyCorpus, FamilyDocument, FamilyGeography, FamilyStatus, @@ -146,8 +144,6 @@ def _db_count_fams_in_category_and_geo( .order_by(geo_family_combinations.c.display_value) ) print(query.statement.compile(dialect=postgresql.dialect())) - # print(str(query.statement.compile(compile_kwargs={"literal_binds": True}))) - print(render_query(query, db)) return query @@ -190,50 +186,3 @@ def get_world_map_stats( result = [_to_dto(fgs) for fgs in family_geo_stats] return result - - -from datetime import date, datetime, timedelta - -from sqlalchemy.orm import Query - - -def render_query(statement, db_session): - """ - Generate an SQL expression string with bound parameters rendered inline - for the given SQLAlchemy statement. - WARNING: This method of escaping is insecure, incomplete, and for debugging - purposes only. Executing SQL statements with inline-rendered user values is - extremely insecure. 
- Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query - """ - if isinstance(statement, Query): - statement = statement.statement - dialect = db_session.bind.dialect - - class LiteralCompiler(dialect.statement_compiler): - def visit_bindparam( - self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs - ): - return self.render_literal_value(bindparam.value, bindparam.type) - - def render_array_value(self, val, item_type): - if isinstance(val, list): - return "{}".format( - ",".join([self.render_array_value(x, item_type) for x in val]) - ) - return self.render_literal_value(val, item_type) - - def render_literal_value(self, value, type_): - if isinstance(value, int): - return str(value) - elif isinstance(value, (str, date, datetime, timedelta)): - return "'{}'".format(str(value).replace("'", "''")) - elif isinstance(value, list): - return "'{{{}}}'".format( - ",".join( - [self.render_array_value(x, type_.item_type) for x in value] - ) - ) - return super(LiteralCompiler, self).render_literal_value(value, type_) - - return LiteralCompiler(dialect, statement).process(statement) diff --git a/app/repository/helpers.py b/app/repository/helpers.py index e976683b..958e0b38 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Helper functions for the repository layer.""" from functools import lru_cache diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 8a807080..15bbf8ac 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,243 +1,355 @@ -WITH -deduplicated_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc -), -unique_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_family_slugs as ( - SELECT - deduplicated_family_slugs.family_import_id as "family_import_id", - deduplicated_family_slugs.created as "created", - deduplicated_family_slugs.name as "name" - FROM deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id as "family_import_id", - unique_family_slugs.created as "created", - unique_family_slugs.name as "name" - FROM unique_family_slugs - order by family_import_id desc, created desc - ), deduplicated_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by 
slug.family_document_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc -), -unique_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_doc_slugs as ( - SELECT - deduplicated_doc_slugs.family_document_import_id - as "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id as "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM unique_doc_slugs - order by family_document_import_id desc, created desc - ), event_dates as ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date, - max(family_event.date::date) last_changed - FROM - family_event - GROUP BY - family_import_id -) +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 
+ ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date, + MAX(family_event.date::DATE) AS last_changed + FROM + family_event + GROUP BY + family_event.family_import_id + ), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ), + +n1 AS ( + SELECT + collection_family.family_import_id, + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN + collection_family + ON collection.import_id = collection_family.collection_import_id + GROUP BY + collection_family.family_import_id + ) + SELECT -ds.name as "Document ID", -p.title as "Document Title", -fs.name as "Family ID", -f.title as "Family Title", -f.description as "Family Summary", -n1.collection_titles as "Collection Title(s)", -n1.collection_descriptions as "Collection Description(s)", -INITCAP(d.valid_metadata::json#>>'{ - role,0}') as -"Document Role", -d.variant_name as "Document Variant", -p.source_url as "Document Content URL", -INITCAP(d.valid_metadata::json#>>'{ - type,0}') as -"Document Type", -CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) -END "Category", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'framework')), ';') -as "Framework", -n2.language as "Language", -o.name as "Source", -fg.geo_isos as "Geography ISOs", -fg.geo_display_values as "Geographies", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'topic')), ';') -as "Topic/Response", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'hazard')), ';') -as "Hazard", -array_to_string(ARRAY( - 
SELECT jsonb_array_elements_text(fm.value->'sector')), ';') -as "Sector", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'keyword')), ';') -as "Keyword", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'instrument')), ';') -as "Instrument", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author')), ';') -as "Author", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author_type')), ';') -as "Author Type", -fp.published_date as "First event in timeline", -fp.last_changed as "Last event in timeline", -n3.event_type_names as "Full timeline of events (types)", -n3.event_dates as "Full timeline of events (dates)", -d.created::date as "Date Added to System", -f.last_modified::date as "Last ModIFied on System", -d.import_id as "Internal Document ID", -f.import_id as "Internal Family ID", -n1.collection_import_ids as "Internal Collection ID(s)" -FROM physical_document p -JOIN family_document d -ON p.id = d.physical_document_id -JOIN family f -ON d.family_import_id = f.import_id -FULL JOIN ( - SELECT - family_geography.family_import_id as "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography - ON geography.id = family_geography.geography_id - GROUP BY family_geography.family_import_id -) fg ON fg.family_import_id=f.import_id -join family_corpus fc -on f.import_id = fc.family_import_id -join corpus c -on fc.corpus_import_id = c.import_id -join organisation o -on c.organisation_id = o.id -join family_metadata fm -on fm.family_import_id = f.import_id -FULL JOIN ( - SELECT - collection_family.family_import_id as "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family - ON collection_family.collection_import_id = collection.import_id - GROUP BY collection_family.family_import_id -) n1 ON n1.family_import_id=f.import_id -left JOIN ( - SELECT - p.id as "id", - string_agg(l.name, ';' ORDER BY l.name) AS language - FROM physical_document p - left join physical_document_language pdl - on pdl.document_id = p.id - left join language l - on l.id = pdl.language_id - GROUP BY p.id -) n2 ON n2.id=d.physical_document_id -FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - string_agg(family_event.date::date::text, ';') AS event_dates - FROM family_event - INNER JOIN family ON family.import_id = family_event.family_import_id - GROUP BY family_event.family_import_id -) n3 ON n3.family_import_id=f.import_id -LEFT JOIN most_recent_doc_slugs ds -on ds.family_document_import_id = d.import_id -LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id -LEFT JOIN event_dates fp on fp.family_import_id = f.import_id -WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids}) -ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + 
n1.collection_descriptions AS "Collection Description(s)", + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + language_agg.display_name AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::DATE AS "Date Added to System", + f.last_modified::DATE AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)", + INITCAP(d.valid_metadata::JSON #>> '{ + role,0}') AS "Document Role", + INITCAP(d.valid_metadata::JSON #>> '{ + type,0}') AS "Document Type", + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END AS "Category", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework') + ), + ';' + ) AS "Framework", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author') + ), + ';' + ) AS "Author", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type" +FROM + physical_document AS p + INNER JOIN family_document AS d ON p.id = d.physical_document_id + INNER JOIN family AS f ON d.family_import_id = f.import_id + FULL JOIN fg ON f.import_id = fg.family_import_id + INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id + INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id + INNER JOIN organisation AS o ON c.organisation_id = o.id + INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id + FULL JOIN n1 ON f.import_id = n1.family_import_id + LEFT JOIN ( + SELECT + p.id, + STRING_AGG( + l.name, + ';' + ORDER BY + l.name + ) AS display_name + FROM + physical_document AS p + LEFT JOIN + physical_document_language AS pdl + ON p.id = pdl.document_id + LEFT JOIN language AS l ON pdl.language_id = l.id + GROUP BY + p.id + ) AS language_agg ON d.physical_document_id = language_agg.id + FULL JOIN ( + SELECT + family_event.family_import_id, + STRING_AGG(family_event.import_id, ';') AS event_import_ids, + STRING_AGG(family_event.title, ';') AS event_titles, + STRING_AGG(family_event.event_type_name, ';') AS event_type_names, + STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates + FROM + family_event + INNER JOIN + family + ON family_event.family_import_id = family.import_id + GROUP BY + family_event.family_import_id + ) AS n3 ON f.import_id = n3.family_import_id + LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id + LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id + LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id 
+WHERE + d.last_modified < :ingest_cycle_start + AND fc.corpus_import_id = ANY(:allowed_corpora_ids) +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id ASC diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index af6023e6..7a5d0e40 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,36 +1,43 @@ -WITH deduplicated_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - Count(*) > 1 ) duplicates - left join + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug - ON duplicates.family_import_id = slug.family_import_id + ON duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, - slug.ctid DESC ), - unique_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + slug.ctid DESC +), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -38,219 +45,235 @@ WITH deduplicated_family_slugs AS ( SELECT GROUP BY slug.family_import_id HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION - ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC +), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC ), deduplicated_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + 
slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 ) duplicates - left join slug - ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION - ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', - 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json#>>'{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" - FROM - physical_document p - join - family_document d - ON p.id = d.physical_document_id - join - family f - ON d.family_import_id = f.import_id full - join - ( - SELECT - family_geography.family_import_id AS "family_import_id", - 
string_agg(geography.value, - ';') AS geo_isos, - string_agg(geography.display_value, - ';') AS geo_display_values - FROM - geography - inner join - family_geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg - ON fg.family_import_id=f.import_id - join - family_corpus fc - ON f.import_id = fc.family_import_id - join - corpus c - ON fc.corpus_import_id = c.import_id - join - organisation o - ON c.organisation_id = o.id - join - family_metadata fm - ON fm.family_import_id = f.import_id - left outer join - ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT(LANGUAGE.name)) AS languages - FROM - family_document - join - physical_document_language - ON physical_document_language.document_id = family_document.physical_document_id - join - LANGUAGE - ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs - ON langs.family_document_import_id = d.import_id - left outer join - ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT(geography.value)) AS geographies - FROM - family_geography - join - geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos - ON geos.family_import_id = f.import_id - left join - most_recent_doc_slugs ds - ON ds.family_document_import_id = d.import_id - left join - most_recent_family_slugs fs - ON fs.family_import_id = f.import_id - left join - event_dates fp - ON fp.family_import_id = f.import_id + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC +), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN + COUNT(*) FILTER ( WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id - ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 + THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_event.family_import_id +), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id +), + +geos AS ( + SELECT + family_geography.family_import_id, + JSON_AGG(DISTINCT geography.value) AS geographies + FROM + family_geography + INNER JOIN geography ON family_geography.geography_id = geography.id + GROUP BY + 
family_geography.family_import_id +) + +SELECT + f.title AS family_title, + p.title AS physical_document_title, + f.description AS family_description, + fp.published_date AS family_published_date, + d.import_id AS family_document_import_id, + ds.name AS family_document_slug, + f.import_id AS family_import_id, + fs.name AS family_slug, + p.source_url AS physical_document_source_url, + o.name AS organisation_name, + geos.geographies, + c.import_id AS corpus_import_id, + c.corpus_type_name, + langs.languages, + fm.value AS family_metadata, + d.valid_metadata AS family_document_metadata, + CASE + WHEN + f.family_category IN ('UNFCCC', 'MCF') + THEN UPPER(f.family_category::TEXT) + ELSE INITCAP(f.family_category::TEXT) + END AS family_category, + d.valid_metadata::JSON #>> '{type,0}' AS family_document_type +FROM + physical_document AS p +INNER JOIN family_document AS d ON p.id = d.physical_document_id +INNER JOIN family AS f ON d.family_import_id = f.import_id +FULL JOIN fg ON f.import_id = fg.family_import_id +INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id +INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id +INNER JOIN organisation AS o ON c.organisation_id = o.id +INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id +LEFT OUTER JOIN ( + SELECT + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT language.name) AS languages + FROM + family_document + INNER JOIN + physical_document_language + ON + family_document.physical_document_id + = physical_document_language.document_id + INNER JOIN + language + ON physical_document_language.language_id = language.id + GROUP BY + family_document.import_id +) AS langs ON d.import_id = langs.family_document_import_id +LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id +LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id +LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id +LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id ASC diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 9d649067..09cb2e69 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,20 +1,33 @@ -SELECT - slug.family_document_import_id, slug.family_import_id +-- First query for family document slugs +SELECT DISTINCT + slug.family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family ON family.import_id = slug.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}) + INNER JOIN family_document + ON slug.family_document_import_id = family_document.import_id + INNER JOIN family + ON family_document.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION -SELECT - slug.family_document_import_id, slug.family_import_id +-- Second query for family slugs +SELECT DISTINCT + NULL AS family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family_document ON 
family_document.import_id = slug.family_document_import_id -LEFT JOIN family ON family.import_id = family_document.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}); + INNER JOIN family + ON slug.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql index 09a9db7e..51307212 100644 --- a/app/repository/sql/world_map.sql +++ b/app/repository/sql/world_map.sql @@ -1,36 +1,13 @@ -SELECT - geo_family_combinations.display_value AS display_value, - geo_family_combinations.slug AS slug, - geo_family_combinations.value AS value, - jsonb_object_agg ( - geo_family_combinations.family_category, - coalesce(counts.records_count, 0) - ) AS counts -FROM - ( - SELECT - geography.id AS geography_id, - geography.display_value AS display_value, - geography.slug AS slug, - geography.value AS value, - anon_1.family_category AS family_category - FROM - geography, - ( - SELECT DISTINCT - family.family_category AS family_category - FROM - family - ) AS anon_1 - ) AS geo_family_combinations - LEFT OUTER JOIN ( +WITH counts AS ( SELECT - family.family_category AS family_category, - family_geography.geography_id AS geography_id, - count(*) AS records_count + family.family_category, + family_geography.geography_id, + COUNT(*) AS records_count FROM family - JOIN family_geography ON family.import_id = family_geography.family_import_id + INNER JOIN + family_geography + ON family.import_id = family_geography.family_import_id WHERE CASE WHEN ( @@ -45,11 +22,10 @@ FROM ) ) ) THEN 'Created' - ELSE CASE - WHEN ( + WHEN ( ( SELECT - count(family_document.document_status) AS count_1 + COUNT(family_document.document_status) AS count_1 FROM family_document WHERE @@ -57,11 +33,10 @@ FROM AND family_document.document_status = 'PUBLISHED' ) > 0 ) THEN 'Published' - ELSE CASE - WHEN ( + WHEN ( ( SELECT - count(family_document.document_status) AS count_2 + COUNT(family_document.document_status) AS count_2 FROM family_document WHERE @@ -69,14 +44,41 @@ FROM AND family_document.document_status = 'CREATED' ) > 0 ) THEN 'Created' - ELSE 'Deleted' - END - END + ELSE 'Deleted' END = 'Published' GROUP BY family.family_category, family_geography.geography_id - ) AS counts ON geo_family_combinations.geography_id = counts.geography_id + ) + +SELECT + geo_family_combinations.display_value, + geo_family_combinations.slug, + geo_family_combinations.value, + JSONB_OBJECT_AGG( + geo_family_combinations.family_category, + COALESCE(counts.records_count, 0) + ) AS counts +FROM + ( + SELECT + geography.id AS geography_id, + geography.display_value, + geography.slug, + geography.value, + anon_1.family_category + FROM + geography, + ( + SELECT DISTINCT + family.family_category + FROM + family + ) AS anon_1 + ) AS geo_family_combinations + LEFT OUTER JOIN + counts + ON geo_family_combinations.geography_id = counts.geography_id AND geo_family_combinations.family_category = counts.family_category GROUP BY geo_family_combinations.display_value, diff --git a/makefile-docker.defs b/makefile-docker.defs index bcf2a0a2..b41a9358 100644 --- a/makefile-docker.defs +++ 
b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/geographies/test_world_map_summary.py ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS} # ---------------------------------- # tasks diff --git a/poetry.lock b/poetry.lock index 2e443fb5..6d497b9e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -702,13 +702,13 @@ files = [ [[package]] name = "cpr-sdk" -version = "1.9.3" +version = "1.9.5" description = "" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "cpr_sdk-1.9.3-py3-none-any.whl", hash = "sha256:1cd725de96d3af7a1f74c5d5eab18b207702944549f945e5392a039242c948b8"}, - {file = "cpr_sdk-1.9.3.tar.gz", hash = "sha256:f7ee60d81b2c9520cae237742582052472b5581601a8bb33862194bae8c4e0e1"}, + {file = "cpr_sdk-1.9.5-py3-none-any.whl", hash = "sha256:dd32806499b5bb44c98be1f4135b88406f1a77abcf60c7d4f61ac740de979da3"}, + {file = "cpr_sdk-1.9.5.tar.gz", hash = "sha256:addc22e557381935ac66c95721312c4a37080fda7419381971cdf6e7cb331fe0"}, ] [package.dependencies] @@ -4250,4 +4250,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "40e1911a3a0b0211027e7326d521d1fc9cdf8aa6c798279f1d5134a4bc2a5f57" +content-hash = "d96225e60602c52c47631f4a12f8519bcb5d1e2f5d2b15c2f57760db6b28c33c" diff --git a/pyproject.toml b/pyproject.toml index 11d675fa..849d3b4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.13" +version = "1.19.15" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] @@ -10,7 +10,7 @@ python = "^3.10" Authlib = "^0.15.5" bcrypt = "^3.2.0" boto3 = "^1.26" -cpr_sdk = { version = "1.9.3", extras = ["vespa"] } +cpr_sdk = { version = "1.9.5", extras = ["vespa"] } fastapi = "^0.104.1" fastapi-health = "^0.4.0" fastapi-pagination = { extras = ["sqlalchemy"], version = "^0.12.19" } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd index 29492a3a..863929db 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd @@ -1,5 +1,10 @@ schema document_passage { + field text_block_not_stemmed type string { + indexing: input text_block | summary | index + stemming: none + } + document document_passage { field search_weights_ref type reference { @@ -134,6 +139,37 @@ schema document_passage { summary concepts {} } + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object 
{} + summary document_source_url {} + summary corpus_import_id {} + summary corpus_type_name {} + summary metadata {} + summary text_block {} + summary text_block_id {} + summary text_block_type {} + summary text_block_page {} + summary text_block_coords {} + summary concepts {} + summary text_block_tokens { + source: text_block + tokens + } + } + rank-profile exact inherits default { function text_score() { expression: attribute(passage_weight) * fieldMatch(text_block) @@ -141,7 +177,17 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() fieldMatch(text_block) + } + + rank-profile exact_not_stemmed inherits default { + function text_score() { + expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed) + } + first-phase { + expression: text_score() + } + match-features: text_score() fieldMatch(text_block) } rank-profile hybrid_no_closeness inherits default { @@ -151,7 +197,7 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) } rank-profile hybrid inherits default { @@ -164,6 +210,20 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) closeness(text_embedding) + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function text_score() { + expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)) + } + first-phase { + expression: text_score() + } + match-features: text_score() bm25(text_block) closeness(text_embedding) } } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd index e62d6df5..e56963b2 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd @@ -1,5 +1,15 @@ schema family_document { + field family_name_not_stemmed type string { + indexing: input family_name_index | index + stemming: none + } + + field family_description_not_stemmed type string { + indexing: input family_description_index | index + stemming: none + } + document family_document { field search_weights_ref type reference { @@ -170,6 +180,19 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile exact_not_stemmed inherits default { + function name_score() { + expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed) + } + function description_score() { + expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } rank-profile hybrid_no_closeness inherits default { function name_score() { @@ -199,6 +222,40 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile hybrid_no_description_embedding inherits default { + inputs { + query(query_embedding) tensor(x[768]) + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + 
description_score() + } + match-features: name_score() description_score() + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } + document-summary search_summary { summary family_name {} @@ -223,4 +280,39 @@ schema family_document { summary collection_title {} summary collection_summary {} } + + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_title {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object {} + summary document_source_url {} + summary metadata {} + summary corpus_import_id {} + summary corpus_type_name {} + summary collection_title {} + summary collection_summary {} + summary family_name_index {} + summary family_name_index_tokens { + source: family_name_index + tokens + } + summary family_description_index {} + summary family_description_index_tokens { + source: family_description_index + tokens + } + from-disk + } } From a941cc19b9487147a1010d9a359de91674136d7a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:01:33 +0000 Subject: [PATCH 13/26] Bump to 1.19.16 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 849d3b4b..b5593f7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.15" +version = "1.19.16" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 7dbf911a4587fa00a51c4cfd13b294bbcfa8a245 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:24:26 +0000 Subject: [PATCH 14/26] Fix docstring param name --- app/repository/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/repository/download.py b/app/repository/download.py index 33592cc0..85dd1c47 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -20,7 +20,7 @@ def get_whole_database_dump( """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] corpora_ids: The corpora from which we + :param list[str] allowed_corpora_ids: The corpora from which we should allow the data to be dumped. :return pd.DataFrame: A DataFrame containing the results of the SQL query that gets the whole database dump in our desired format. 
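A note on the Vespa schema changes above: the new hybrid_custom_weight rank profile reads a bm25_weight value from the query, and the new search_summary_with_tokens document summary returns the stored token form of the text fields alongside the usual summary fields, so both are opt-in per request. The sketch below shows roughly how such a request could look against a locally running instance of this application package; the endpoint URL, the query text and the zero-filled embedding are placeholders for illustration only and are not part of these patches.

    # Illustrative sketch only: exercise the new rank profile and document summary
    # against a local Vespa instance loaded with this application package.
    # The endpoint, query text and zero-filled embedding are placeholders.
    import requests

    VESPA_QUERY_URL = "http://localhost:8080/search/"  # hypothetical local endpoint

    body = {
        "yql": "select * from document_passage where userQuery()",
        "query": "adaptation finance",
        "hits": 10,
        "ranking.profile": "hybrid_custom_weight",
        "presentation.summary": "search_summary_with_tokens",
        # Placeholder embedding; a real caller would embed the query text into 768 dims.
        "input.query(query_embedding)": [0.0] * 768,
        # Per-request lexical weight consumed by the rank profile's first phase.
        "input.query(bm25_weight)": 0.5,
    }

    response = requests.post(VESPA_QUERY_URL, json=body, timeout=10)
    response.raise_for_status()
    for hit in response.json().get("root", {}).get("children", []):
        # match-features now include bm25(text_block) and closeness(text_embedding).
        print(hit["relevance"], hit.get("fields", {}).get("matchfeatures"))

Because the profile's first phase is attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)), raising or lowering input.query(bm25_weight) shifts a single request between lexical and embedding similarity, which is the point of declaring it as a query input rather than a constant in the profile.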
From b53e8a46ac0c12594b92de34bc62161a2def8581 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:40:22 +0000 Subject: [PATCH 15/26] Fix capitalisation --- app/repository/sql/download.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 15bbf8ac..68f77685 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -227,7 +227,7 @@ SELECT n3.event_type_names AS "Full timeline of events (types)", n3.event_dates AS "Full timeline of events (dates)", d.created::DATE AS "Date Added to System", - f.last_modified::DATE AS "Last ModIFied on System", + f.last_modified::DATE AS "Last Modified on System", d.import_id AS "Internal Document ID", f.import_id AS "Internal Family ID", n1.collection_import_ids AS "Internal Collection ID(s)", From a9bc84edeedbac69b224e6381a57f37a62eade9b Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:49:32 +0000 Subject: [PATCH 16/26] Fix capitalisation for MCF category --- app/repository/sql/download.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 68f77685..c5ab3ab4 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -237,6 +237,7 @@ SELECT type,0}') AS "Document Type", CASE WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + WHEN f.family_category = 'MCF' THEN 'MCF' ELSE INITCAP(f.family_category::TEXT) END AS "Category", ARRAY_TO_STRING( From 6ae9ee92f2fbdd477b44962898f5c19006bb529d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:50:14 +0000 Subject: [PATCH 17/26] Fix world map query counts --- app/repository/sql/world_map.sql | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql index 51307212..42db7a4d 100644 --- a/app/repository/sql/world_map.sql +++ b/app/repository/sql/world_map.sql @@ -5,11 +5,16 @@ WITH counts AS ( COUNT(*) AS records_count FROM family + INNER JOIN + family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id INNER JOIN family_geography ON family.import_id = family_geography.family_import_id WHERE - CASE + family_corpus.corpus_import_id = ANY(:allowed_corpora_ids) + AND CASE WHEN ( NOT ( EXISTS ( From 10b9dec1678c1dc4e9b63adadda511ceb00fa1be Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:02:52 +0000 Subject: [PATCH 18/26] Update world map stats repo function --- app/repository/geography.py | 110 ++++++++---------------------------- 1 file changed, 25 insertions(+), 85 deletions(-) diff --git a/app/repository/geography.py b/app/repository/geography.py index b035fd70..112257a5 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -1,22 +1,19 @@ """Functions to support the geographies endpoint.""" import logging +import os from typing import Optional, Sequence -from db_client.models.dfce.family import ( - Family, - FamilyDocument, - FamilyGeography, - FamilyStatus, -) +from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography from db_client.models.dfce.geography import Geography -from sqlalchemy import func -from sqlalchemy.dialects import postgresql 
+from sqlalchemy import bindparam, text from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session +from sqlalchemy.types import ARRAY, String -from app.errors import RepositoryError +from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO +from app.repository.helpers import get_query_template _LOGGER = logging.getLogger(__file__) @@ -66,85 +63,30 @@ def get_geo_subquery( def _db_count_fams_in_category_and_geo( db: Session, allowed_corpora: Optional[list[str]] -) -> Query: - """ - Query the database for the fam count per category per geo. - - NOTE: SqlAlchemy will make a complete hash of query generation if - columns are used in the query() call. Therefore, entire objects are - returned. +) -> list[GeographyStatsDTO]: + """Query the database for the family count per category per geo. :param Session db: DB Session to perform query on. :param Optional[list[str]] allowed_corpora: The list of allowed corpora IDs to filter on. - :return Query: A Query object containing the queries to perform. + :return list[GeographyStatsDTO]: A list of counts of families by + category per geography. """ - # Get the required Geography information and cross join each with all of the unique - # family_category values (so if some geographies have no documents for a particular - # family_category, we can set the count for that category to 0). - family_categories = db.query(Family.family_category).distinct().subquery() - geo_family_combinations = db.query( - Geography.id.label("geography_id"), - Geography.display_value, - Geography.slug, - Geography.value, - family_categories.c.family_category, - ).subquery("geo_family_combinations") - - # Get a count of documents in each present family_category for each geography. - counts = ( - db.query( - Family.family_category, - FamilyGeography.geography_id, - func.count().label("records_count"), - ) - .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) - # .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) - # .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) - .filter(Family.family_status == FamilyStatus.PUBLISHED) - .group_by(Family.family_category, FamilyGeography.geography_id) - .subquery("counts") + if allowed_corpora in [None, []]: + raise ValidationError("No allowed corpora provided") + + query_template = text( + get_query_template(os.path.join("app", "repository", "sql", "world_map.sql")) ) - # if allowed_corpora is not None: - # counts = counts.where(Corpus.import_id.in_(allowed_corpora)) - # counts = counts.group_by( - # Family.family_category, FamilyGeography.geography_id - # ).subquery("counts") - # _LOGGER.info(counts) - - # Aggregate family_category counts per geography into a JSONB object, and if a - # family_category count is missing, set the count for that category to 0 so each - # geography will always have a count for all family_category values. 
- query = ( - db.query( - geo_family_combinations.c.display_value.label("display_value"), - geo_family_combinations.c.slug.label("slug"), - geo_family_combinations.c.value.label("value"), - func.jsonb_object_agg( - geo_family_combinations.c.family_category, - func.coalesce(counts.c.records_count, 0), - ).label("counts"), - ) - .select_from( - geo_family_combinations.join( - counts, - (geo_family_combinations.c.geography_id == counts.c.geography_id) - & ( - geo_family_combinations.c.family_category - == counts.c.family_category - ), - isouter=True, - ) - ) - .group_by( - geo_family_combinations.c.display_value, - geo_family_combinations.c.slug, - geo_family_combinations.c.value, - ) - .order_by(geo_family_combinations.c.display_value) + query_template = query_template.bindparams( + bindparam("allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)), ) - print(query.statement.compile(dialect=postgresql.dialect())) - return query + + family_geo_stats = db.execute( + query_template, {"allowed_corpora_ids": allowed_corpora} + ).all() + results = [_to_dto(fgs) for fgs in family_geo_stats] + return results def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: @@ -176,13 +118,11 @@ def get_world_map_stats( :return list[GeographyStatsDTO]: A list of Geography stats objects """ try: - family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora).all() + family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora) except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") if not family_geo_stats: return [] - - result = [_to_dto(fgs) for fgs in family_geo_stats] - return result + return family_geo_stats From b2030b40a93c8af47c82a10632733873b9fa6320 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:08:05 +0000 Subject: [PATCH 19/26] Move get_world_map_stats to service layer --- app/repository/geography.py | 33 +++++--------------------------- app/service/geography.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 28 deletions(-) create mode 100644 app/service/geography.py diff --git a/app/repository/geography.py b/app/repository/geography.py index 112257a5..3675b8b4 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -7,11 +7,10 @@ from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography from db_client.models.dfce.geography import Geography from sqlalchemy import bindparam, text -from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session from sqlalchemy.types import ARRAY, String -from app.errors import RepositoryError, ValidationError +from app.errors import ValidationError from app.models.geography import GeographyStatsDTO from app.repository.helpers import get_query_template @@ -61,14 +60,14 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _db_count_fams_in_category_and_geo( - db: Session, allowed_corpora: Optional[list[str]] +def _count_families_per_category_in_each_geo( + db: Session, allowed_corpora: list[str] ) -> list[GeographyStatsDTO]: """Query the database for the family count per category per geo. :param Session db: DB Session to perform query on. - :param Optional[list[str]] allowed_corpora: The list of allowed - corpora IDs to filter on. + :param list[str] allowed_corpora: The list of allowed corpora IDs to + filter on. 
:return list[GeographyStatsDTO]: A list of counts of families by category per geography. """ @@ -104,25 +103,3 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: slug=family_doc_geo_stats.slug, family_counts=family_doc_geo_stats.counts, ) - - -def get_world_map_stats( - db: Session, allowed_corpora: Optional[list[str]] -) -> list[GeographyStatsDTO]: - """ - Get a count of fam per category per geography for all geographies. - - :param db Session: The database session. - :param Optional[list[str]] allowed_corpora: The list of allowed - corpora IDs to filter on. - :return list[GeographyStatsDTO]: A list of Geography stats objects - """ - try: - family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora) - except OperationalError as e: - _LOGGER.error(e) - raise RepositoryError("Error querying the database for geography stats") - - if not family_geo_stats: - return [] - return family_geo_stats diff --git a/app/service/geography.py b/app/service/geography.py new file mode 100644 index 00000000..244bac0c --- /dev/null +++ b/app/service/geography.py @@ -0,0 +1,38 @@ +"""Functions to support the geographies endpoint.""" + +import logging +from typing import Optional + +from sqlalchemy.exc import OperationalError +from sqlalchemy.orm import Session + +from app.errors import RepositoryError, ValidationError +from app.models.geography import GeographyStatsDTO +from app.repository.geography import _count_families_per_category_in_each_geo + +_LOGGER = logging.getLogger(__file__) + + +def get_world_map_stats( + db: Session, allowed_corpora: Optional[list[str]] +) -> list[GeographyStatsDTO]: + """ + Get a count of fam per category per geography for all geographies. + + :param db Session: The database session. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. + :return list[GeographyStatsDTO]: A list of Geography stats objects + """ + if allowed_corpora is None or allowed_corpora == []: + raise ValidationError("No allowed corpora provided") + + try: + family_geo_stats = _count_families_per_category_in_each_geo(db, allowed_corpora) + except OperationalError as e: + _LOGGER.error(e) + raise RepositoryError("Error querying the database for geography stats") + + if not family_geo_stats: + return [] + return family_geo_stats From e473a433ad2a391f4149903f2fcfd8e347c0232f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:09:35 +0000 Subject: [PATCH 20/26] Rename function and add world map service --- app/repository/geography.py | 2 +- app/service/{geography.py => world_map.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename app/service/{geography.py => world_map.py} (86%) diff --git a/app/repository/geography.py b/app/repository/geography.py index 3675b8b4..5b8c10b6 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -60,7 +60,7 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _count_families_per_category_in_each_geo( +def count_families_per_category_in_each_geo( db: Session, allowed_corpora: list[str] ) -> list[GeographyStatsDTO]: """Query the database for the family count per category per geo. 
diff --git a/app/service/geography.py b/app/service/world_map.py similarity index 86% rename from app/service/geography.py rename to app/service/world_map.py index 244bac0c..dc8735ce 100644 --- a/app/service/geography.py +++ b/app/service/world_map.py @@ -8,7 +8,7 @@ from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO -from app.repository.geography import _count_families_per_category_in_each_geo +from app.repository.geography import count_families_per_category_in_each_geo _LOGGER = logging.getLogger(__file__) @@ -28,7 +28,7 @@ def get_world_map_stats( raise ValidationError("No allowed corpora provided") try: - family_geo_stats = _count_families_per_category_in_each_geo(db, allowed_corpora) + family_geo_stats = count_families_per_category_in_each_geo(db, allowed_corpora) except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") From 83dc43e339f80493bde116a235cc89b3de6285a3 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:10:45 +0000 Subject: [PATCH 21/26] Import from world map service --- app/api/api_v1/routers/geographies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/geographies.py index b92084d3..1ec466df 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/geographies.py @@ -6,8 +6,8 @@ from app.clients.db.session import get_db from app.errors import RepositoryError from app.models.geography import GeographyStatsDTO -from app.repository.geography import get_world_map_stats from app.service.custom_app import AppTokenFactory +from app.service.world_map import get_world_map_stats _LOGGER = logging.getLogger(__file__) @@ -18,7 +18,7 @@ async def geographies( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): - """Get a summary of fam stats for all geographies for world map.""" + """Get a summary of family stats for all geographies for world map.""" _LOGGER.info( "Getting world map counts for all geographies", extra={ From ab8c7019814d180a3989a723136f5f4cbe37b67d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:46:46 +0000 Subject: [PATCH 22/26] Update counts --- .../routers/geographies/test_world_map_summary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index c60c6d48..be24ff3a 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -85,7 +85,7 @@ def test_endpoint_returns_ok_all_docs_per_family_published( @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 0, 0, 2), + ("India", 1, 1, 3), ("Afghanistan", 0, 0, 2), ], ) @@ -152,7 +152,7 @@ def test_endpoint_returns_503_when_error(data_client, valid_token): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 0, 0, 1), + ("India", 1, 1, 1), ("Afghanistan", 0, 0, 1), ], ) @@ -191,7 +191,7 @@ def test_endpoint_returns_different_results_with_alt_token( .filter(Geography.display_value == geo_display_value) .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) 
.join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) - .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) + .join(Corpus, Corpus.import_id == FamilyCorpus.corpus_import_id) .join(Geography, Geography.id == FamilyGeography.geography_id) .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000") .all() From 9078770b61f123d5fc8b08404ef3686d23a7e81e Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:48:48 +0000 Subject: [PATCH 23/26] Fix counts --- tests/non_search/routers/geographies/test_world_map_summary.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index be24ff3a..a53e1bdc 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -152,7 +152,7 @@ def test_endpoint_returns_503_when_error(data_client, valid_token): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 1), + ("India", 0, 0, 1), ("Afghanistan", 0, 0, 1), ], ) @@ -175,7 +175,6 @@ def test_endpoint_returns_different_results_with_alt_token( .one() ) assert fam - print(fam) resp_json = _make_world_map_lookup_request(data_client, alternative_token) assert len(resp_json) > 1 From b2a3671d7e2bdc8d9c35da1cbade8e4312fa186d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:04:36 +0000 Subject: [PATCH 24/26] Rename router world map --- .../api_v1/routers/{geographies.py => world_map.py} | 10 +++++++--- app/main.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) rename app/api/api_v1/routers/{geographies.py => world_map.py} (80%) diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/world_map.py similarity index 80% rename from app/api/api_v1/routers/geographies.py rename to app/api/api_v1/routers/world_map.py index 1ec466df..219b14ff 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/world_map.py @@ -4,17 +4,17 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from app.clients.db.session import get_db -from app.errors import RepositoryError +from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO from app.service.custom_app import AppTokenFactory from app.service.world_map import get_world_map_stats _LOGGER = logging.getLogger(__file__) -geographies_router = APIRouter() +world_map_router = APIRouter() -@geographies_router.get("/geographies", response_model=list[GeographyStatsDTO]) +@world_map_router.get("/geographies", response_model=list[GeographyStatsDTO]) async def geographies( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): @@ -42,6 +42,10 @@ async def geographies( return world_map_stats except RepositoryError as e: + _LOGGER.error(e) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=e.message ) + except ValidationError as e: + _LOGGER.error(e) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=e.message) diff --git a/app/main.py b/app/main.py index e3e1296f..09438b37 100644 --- a/app/main.py +++ b/app/main.py @@ -16,11 +16,11 @@ from app.api.api_v1.routers.admin import admin_document_router from app.api.api_v1.routers.auth import auth_router from 
app.api.api_v1.routers.documents import documents_router -from app.api.api_v1.routers.geographies import geographies_router from app.api.api_v1.routers.lookups import lookups_router from app.api.api_v1.routers.pipeline_trigger import pipeline_trigger_router from app.api.api_v1.routers.search import search_router from app.api.api_v1.routers.summaries import summary_router +from app.api.api_v1.routers.world_map import world_map_router from app.clients.db.session import SessionLocal, engine from app.service.auth import get_superuser_details from app.service.health import is_database_online @@ -158,7 +158,7 @@ async def root(): summary_router, prefix="/api/v1", tags=["Summaries"], include_in_schema=False ) app.include_router( - geographies_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False + world_map_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False ) # add pagination support to all routes that ask for it From 1df2e9e08fc06717d275cc1da51250efd946026e Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:44:49 +0000 Subject: [PATCH 25/26] PR comment fixes --- .../non_search/routers/geographies/test_world_map_summary.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index a53e1bdc..3155a94b 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -50,7 +50,6 @@ def test_endpoint_returns_ok_all_docs_per_family_published( expected_unfccc, valid_token, ): - """Check endpoint returns 200 on success""" setup_all_docs_published_world_map(data_db) resp_json = _make_world_map_lookup_request(data_client, valid_token) @@ -165,7 +164,7 @@ def test_endpoint_returns_different_results_with_alt_token( expected_unfccc, alternative_token, ): - """Check endpoint returns 200 & only counts CCLW docs""" + """Check endpoint returns 200 & only counts UNFCCC docs""" setup_all_docs_published_world_map(data_db) fam = ( @@ -195,7 +194,6 @@ def test_endpoint_returns_different_results_with_alt_token( .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000") .all() ) - print(fams) assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(fams) From eeeb7e109d3ca26514f347ed9e1533eaa707d42f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:45:10 +0000 Subject: [PATCH 26/26] Make representative of entity --- app/api/api_v1/routers/world_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/api/api_v1/routers/world_map.py b/app/api/api_v1/routers/world_map.py index 219b14ff..78e7fa4c 100644 --- a/app/api/api_v1/routers/world_map.py +++ b/app/api/api_v1/routers/world_map.py @@ -15,10 +15,10 @@ @world_map_router.get("/geographies", response_model=list[GeographyStatsDTO]) -async def geographies( +async def world_map_stats( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): - """Get a summary of family stats for all geographies for world map.""" + """Get a summary of family counts for all geographies for world map.""" _LOGGER.info( "Getting world map counts for all geographies", extra={