From 277e67430e3ec6c80ec07503ae997a9c0569b89f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:06:59 +0000 Subject: [PATCH 01/26] Make documents router dependent on app token --- app/api/api_v1/routers/documents.py | 20 +- app/api/api_v1/routers/search.py | 2 - app/repository/document.py | 27 +- .../routers/documents/setup_doc_fam_lookup.py | 24 + .../documents/test_admin_doc_routes.py | 936 +++++++++++++++++ .../documents/test_document_families.py | 973 +----------------- .../documents/test_get_document_families.py | 132 +-- 7 files changed, 1070 insertions(+), 1044 deletions(-) create mode 100644 tests/non_search/routers/documents/setup_doc_fam_lookup.py create mode 100644 tests/non_search/routers/documents/test_admin_doc_routes.py diff --git a/app/api/api_v1/routers/documents.py b/app/api/api_v1/routers/documents.py index 0cf92c19..1d48c278 100644 --- a/app/api/api_v1/routers/documents.py +++ b/app/api/api_v1/routers/documents.py @@ -1,8 +1,8 @@ import logging from http.client import NOT_FOUND -from typing import Union +from typing import Annotated, Union -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, Header, HTTPException, Request from app.clients.db.session import get_db from app.models.document import ( @@ -14,6 +14,7 @@ get_family_document_and_context, get_slugged_objects, ) +from app.service.custom_app import AppTokenFactory _LOGGER = logging.getLogger(__file__) @@ -28,20 +29,23 @@ ], ) async def family_or_document_detail( - slug: str, - db=Depends(get_db), + slug: str, request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): """Get details of the family or document associated with the slug.""" _LOGGER.info( f"Getting detailed information for family or document '{slug}'", extra={ - "props": { - "import_id_or_slug": slug, - }, + "props": {"import_id_or_slug": slug, "app_token": str(app_token)}, }, ) - family_document_import_id, family_import_id = get_slugged_objects(db, slug) + # Decode the app token and validate it. + token = AppTokenFactory() + token.decode_and_validate(db, request, app_token) + + family_document_import_id, family_import_id = get_slugged_objects( + db, slug, token.allowed_corpora_ids + ) if family_document_import_id is None and family_import_id is None: raise HTTPException(status_code=NOT_FOUND, detail=f"Nothing found for {slug}") diff --git a/app/api/api_v1/routers/search.py b/app/api/api_v1/routers/search.py index cb5ebe87..a7749c33 100644 --- a/app/api/api_v1/routers/search.py +++ b/app/api/api_v1/routers/search.py @@ -277,8 +277,6 @@ def download_all_search_documents( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ) -> RedirectResponse: """Download a CSV containing details of all the documents in the corpus.""" - token = AppTokenFactory() - _LOGGER.info( "Whole data download request", extra={ diff --git a/app/repository/document.py b/app/repository/document.py index 519735dd..c97d9744 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -40,20 +40,27 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_objects(db: Session, slug: str) -> tuple[Optional[str], Optional[str]]: - """ - Matches the slug name to a FamilyDocument or Family import_id +def get_slugged_objects( + db: Session, slug: str, allowed_corpora: Optional[list[str]] = None +) -> tuple[Optional[str], Optional[str]]: + """Match the slug name to a FamilyDocument or Family import ID. 
:param Session db: connection to db :param str slug: slug name to match - :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or - the Family import_id + :param Optional[list[str]] allowed_corpora: The corpora IDs to look + for the slugged object in. + :return tuple[Optional[str], Optional[str]]: the FamilyDocument + import id or the Family import_id. """ - result = ( - db.query(Slug.family_document_import_id, Slug.family_import_id).filter( - Slug.name == slug - ) - ).one_or_none() + query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( + Slug.name == slug + ) + + # # Only get slug for the fam/doc if it belongs to the list of allowed corpora. + # if allowed_corpora is not None: + # query = query.filter() + + result = query.one_or_none() if result is None: return (None, None) return result diff --git a/tests/non_search/routers/documents/setup_doc_fam_lookup.py b/tests/non_search/routers/documents/setup_doc_fam_lookup.py new file mode 100644 index 00000000..7530e88c --- /dev/null +++ b/tests/non_search/routers/documents/setup_doc_fam_lookup.py @@ -0,0 +1,24 @@ +from typing import Optional + +from fastapi import status + +DOCUMENTS_ENDPOINT = "/api/v1/documents" +TEST_HOST = "http://localhost:3000/" + + +def _make_doc_fam_lookup_request( + client, + token, + slug: str, + expected_status_code: int = status.HTTP_200_OK, + origin: Optional[str] = TEST_HOST, +): + headers = ( + {"app-token": token} + if origin is None + else {"app-token": token, "origin": origin} + ) + + response = client.get(f"{DOCUMENTS_ENDPOINT}/{slug}", headers=headers) + assert response.status_code == expected_status_code, response.text + return response.json() diff --git a/tests/non_search/routers/documents/test_admin_doc_routes.py b/tests/non_search/routers/documents/test_admin_doc_routes.py new file mode 100644 index 00000000..4674620c --- /dev/null +++ b/tests/non_search/routers/documents/test_admin_doc_routes.py @@ -0,0 +1,936 @@ +import logging +from typing import Optional + +import pytest +from db_client.models.dfce.family import FamilyDocument +from db_client.models.document.physical_document import ( + Language, + LanguageSource, + PhysicalDocumentLanguage, +) +from fastapi import status +from fastapi.testclient import TestClient +from sqlalchemy.orm import Session + +from tests.non_search.setup_helpers import ( + setup_docs_with_two_orgs, + setup_docs_with_two_orgs_no_langs, + setup_with_two_docs, + setup_with_two_docs_bad_ids, + setup_with_two_unpublished_docs, +) + + +def test_update_document_status__is_secure( + data_client: TestClient, + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed") + assert response.status_code == 401 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.12", + "UNFCCC.s.ill.y.2.2", + ], +) +def test_update_document_status__fails_on_non_matching_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_with_two_docs_bad_ids(data_db) + + response = data_client.post( + f"/api/v1/admin/documents/{import_id}/processed", + headers=data_superuser_token_headers, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +def test_update_document_status__publishes_document( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + """Test that we can send a payload to the backend 
to update family_document.document_status""" + setup_with_two_unpublished_docs(data_db) + UPDATE_IMPORT_ID = "CCLW.executive.1.2" + UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" + + # State of db beforehand + pre_family_status = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + .document_status + ) + + response = data_client.post( + f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", + headers=data_superuser_token_headers, + ) + + assert response.status_code == 200 + json_object = response.json() + + assert json_object["import_id"] == UPDATE_IMPORT_ID + assert json_object["document_status"] == "Published" + + # Now Check the db + updated_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) + .one() + ) + assert updated_family.document_status == "Published" + assert updated_family.document_status != pre_family_status + + unchanged_family = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) + .one() + ) + assert unchanged_family.document_status == "Deleted" + + +def test_update_document__is_secure( + data_client: TestClient, + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5sum": "abc123", + "content_type": "content_type", + "source_url": "source_url", + } + + response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) + + assert response.status_code == 401 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.12", + "UNFCCC.s.ill.y.2.2", + ], +) +def test_update_document__fails_on_non_matching_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_with_two_docs_bad_ids(data_db) + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_import_id( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + setup_docs_with_two_orgs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, 
str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["eng"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["eng"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "eng" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["fra"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["eng", "fra"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_new_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": 
["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["el"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = set(["ell", "bod"]) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 2 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +# TODO Parametrize this test with the two languages as parameters +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_1_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using two letter iso codes. + + Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + two-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__works_on_existing_iso_639_3_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, +): + """ + Assert that we can skip over existing languages for documents when using three letter iso codes. + + Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a + language entry for that physical document we don't throw an error. This proves that we can detect that the + three-letter iso code language already exists. 
+ """ + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bo"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == ["bod"] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 1 + lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() + assert lang.language_code == "bod" + + # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["bod"], + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + expected_languages = ["bod"] + assert json_object["languages"] == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == 1 + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +def test_update_document__logs_warning_on_four_letter_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + caplog, +): + """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # Payload with a four letter language code that won't exist in the db + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": ["boda"], + } + + with caplog.at_level(logging.WARNING): + response = 
data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + assert ( + "Retrieved no language from database for meta_data object language" + in caplog.text + ) + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(languages) == 0 + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "languages", + [[], None], +) +def test_update_document__works_with_no_language( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + languages: Optional[list[str]], +): + """Test that we can send a payload to the backend with no languages to assert that none are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + # ADD THE FIRST LANGUAGE + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + assert json_object["languages"] == [] + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + db_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(db_languages) == 0 + assert set([lang.language_id for lang in db_languages]) == set() + + +@pytest.mark.parametrize( + "import_id", + [ + "CCLW.executive.1.2", + "UNFCCC.non-party.2.2", + ], +) +@pytest.mark.parametrize( + "existing_languages", + [[], ["eng"], ["aaa"], ["aaa", "aab"]], +) +def test_update_document__works_existing_languages( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, + import_id: str, + existing_languages: list[str], +): + """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" + setup_docs_with_two_orgs_no_langs(data_db) + + for lang_code in existing_languages: + existing_doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + 
existing_lang = ( + data_db.query(Language).filter(Language.language_code == lang_code).one() + ) + existing_doc_lang = PhysicalDocumentLanguage( + language_id=existing_lang.id, + document_id=existing_doc.id, + source=LanguageSource.MODEL, + ) + data_db.add(existing_doc_lang) + data_db.flush() + data_db.commit() + + languages_to_add = ["eng", "fra"] + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + "languages": languages_to_add, + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + expected_languages = set(languages_to_add + existing_languages) + assert set(json_object["languages"]) == expected_languages + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + doc_languages = ( + data_db.query(PhysicalDocumentLanguage) + .filter(PhysicalDocumentLanguage.document_id == doc.id) + .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) + .all() + ) + assert len(doc_languages) == len(expected_languages) + for doc_lang in doc_languages: + lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() + assert lang.language_code in expected_languages + + +def test_update_document__idempotent( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + import_id = "CCLW.executive.1.2" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + + response = data_client.put( + f"/api/v1/admin/documents/{import_id}", + headers=data_superuser_token_headers, + json=payload, + ) + assert response.status_code == 200 + json_object = response.json() + assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__works_on_slug( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + slug = "DocSlug1" + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + f"/api/v1/admin/documents/{slug}", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == 200 + json_object = response.json() + assert 
json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert json_object["content_type"] == "updated/content_type" + assert json_object["cdn_object"] == "folder/file" + + # Now Check the db + import_id = "CCLW.executive.1.2" + doc = ( + data_db.query(FamilyDocument) + .filter(FamilyDocument.import_id == import_id) + .one() + .physical_document + ) + assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" + assert doc.content_type == "updated/content_type" + assert doc.cdn_object == "folder/file" + + +def test_update_document__status_422_when_not_found( + data_client: TestClient, + data_superuser_token_headers: dict[str, str], + data_db: Session, +): + setup_with_two_docs(data_db) + + payload = { + "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", + "content_type": "updated/content_type", + "cdn_object": "folder/file", + } + + response = data_client.put( + "/api/v1/admin/documents/nothing", + headers=data_superuser_token_headers, + json=payload, + ) + + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py index c46bc4a3..8478fc50 100644 --- a/tests/non_search/routers/documents/test_document_families.py +++ b/tests/non_search/routers/documents/test_document_families.py @@ -1,999 +1,48 @@ -import logging -from typing import Optional - -import pytest -from db_client.models.dfce.family import FamilyDocument -from db_client.models.document.physical_document import ( - Language, - LanguageSource, - PhysicalDocumentLanguage, -) -from fastapi import status +from db_client.models.document.physical_document import PhysicalDocumentLanguage from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( - setup_docs_with_two_orgs, - setup_docs_with_two_orgs_no_langs, setup_with_two_docs, - setup_with_two_docs_bad_ids, setup_with_two_docs_multiple_languages, - setup_with_two_unpublished_docs, ) -def test_physical_doc_languages( - data_client: TestClient, - data_db: Session, -): +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): setup_with_two_docs(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert document["languages"] == ["eng"] - response = data_client.get( - "/api/v1/documents/DocSlug2", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") document = json_response["document"] - - assert response.status_code == 200 assert document["languages"] == [] def test_physical_doc_languages_not_visible( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert 
document["languages"] == [] def test_physical_doc_multiple_languages( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_multiple_languages(data_db) - response = data_client.get( - "/api/v1/documents/DocSlug1", - ) - json_response = response.json() + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") document = json_response["document"] - - assert response.status_code == 200 print(json_response) assert set(document["languages"]) == set(["fra", "eng"]) - - -def test_update_document_status__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - response = data_client.post(f"/api/v1/admin/documents/{import_id}/processed") - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document_status__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - - response = data_client.post( - f"/api/v1/admin/documents/{import_id}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -def test_update_document_status__publishes_document( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - """Test that we can send a payload to the backend to update family_document.document_status""" - setup_with_two_unpublished_docs(data_db) - UPDATE_IMPORT_ID = "CCLW.executive.1.2" - UNCHANGED_IMPORT_ID = "CCLW.executive.2.2" - - # State of db beforehand - pre_family_status = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - .document_status - ) - - response = data_client.post( - f"/api/v1/admin/documents/{UPDATE_IMPORT_ID}/processed", - headers=data_superuser_token_headers, - ) - - assert response.status_code == 200 - json_object = response.json() - - assert json_object["import_id"] == UPDATE_IMPORT_ID - assert json_object["document_status"] == "Published" - - # Now Check the db - updated_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UPDATE_IMPORT_ID) - .one() - ) - assert updated_family.document_status == "Published" - assert updated_family.document_status != pre_family_status - - unchanged_family = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == UNCHANGED_IMPORT_ID) - .one() - ) - assert unchanged_family.document_status == "Deleted" - - -def test_update_document__is_secure( - data_client: TestClient, - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5sum": "abc123", - "content_type": "content_type", - "source_url": "source_url", - } - - response = data_client.put(f"/api/v1/admin/documents/{import_id}", json=payload) - - assert response.status_code == 401 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.12", - "UNFCCC.s.ill.y.2.2", - ], -) -def test_update_document__fails_on_non_matching_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_with_two_docs_bad_ids(data_db) - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - 
- response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_import_id( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - setup_docs_with_two_orgs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["eng"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["eng"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "eng" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["fra"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == 
"c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["eng", "fra"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_new_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """Send two payloads in series to assert that languages are additive and we don't remove existing languages.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD A NEW LANGUAGE TO CHECK THAT THE UPDATE IS ADDITIVE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["el"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = set(["ell", "bod"]) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - 
.physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 2 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -# TODO Parametrize this test with the two languages as parameters -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_1_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using two letter iso codes. - - Send two payloads in series to assert that if we add a 639 two letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - two-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - 
assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__works_on_existing_iso_639_3_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, -): - """ - Assert that we can skip over existing languages for documents when using three letter iso codes. - - Send two payloads in series to assert that if we add a 639 three letter iso code where there is already a - language entry for that physical document we don't throw an error. This proves that we can detect that the - three-letter iso code language already exists. - """ - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bo"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == ["bod"] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 1 - lang = data_db.query(Language).filter(Language.id == languages[0].language_id).one() - assert lang.language_code == "bod" - - # NOW ADD THE SAME LANGUAGE AGAIN TO CHECK THAT THE UPDATE IS ADDITIVE AND WE SKIP OVER EXISTING LANGUAGES - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["bod"], - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - expected_languages = ["bod"] - assert json_object["languages"] == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert 
doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == 1 - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -def test_update_document__logs_warning_on_four_letter_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - caplog, -): - """Send a payload to assert that languages that are too long aren't added and a warning is logged.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # Payload with a four letter language code that won't exist in the db - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": ["boda"], - } - - with caplog.at_level(logging.WARNING): - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - assert ( - "Retrieved no language from database for meta_data object language" - in caplog.text - ) - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(languages) == 0 - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "languages", - [[], None], -) -def test_update_document__works_with_no_language( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - languages: Optional[list[str]], -): - """Test that we can send a payload to the backend with no languages to assert that none are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - # ADD THE FIRST LANGUAGE - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - assert json_object["languages"] == [] - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - 
.one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - db_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(db_languages) == 0 - assert set([lang.language_id for lang in db_languages]) == set() - - -@pytest.mark.parametrize( - "import_id", - [ - "CCLW.executive.1.2", - "UNFCCC.non-party.2.2", - ], -) -@pytest.mark.parametrize( - "existing_languages", - [[], ["eng"], ["aaa"], ["aaa", "aab"]], -) -def test_update_document__works_existing_languages( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, - import_id: str, - existing_languages: list[str], -): - """Test that we can send a payload to the backend with multiple languages to assert that both are added.""" - setup_docs_with_two_orgs_no_langs(data_db) - - for lang_code in existing_languages: - existing_doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - existing_lang = ( - data_db.query(Language).filter(Language.language_code == lang_code).one() - ) - existing_doc_lang = PhysicalDocumentLanguage( - language_id=existing_lang.id, - document_id=existing_doc.id, - source=LanguageSource.MODEL, - ) - data_db.add(existing_doc_lang) - data_db.flush() - data_db.commit() - - languages_to_add = ["eng", "fra"] - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - "languages": languages_to_add, - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - expected_languages = set(languages_to_add + existing_languages) - assert set(json_object["languages"]) == expected_languages - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - doc_languages = ( - data_db.query(PhysicalDocumentLanguage) - .filter(PhysicalDocumentLanguage.document_id == doc.id) - .filter(PhysicalDocumentLanguage.source == LanguageSource.MODEL) - .all() - ) - assert len(doc_languages) == len(expected_languages) - for doc_lang in doc_languages: - lang = data_db.query(Language).filter(Language.id == doc_lang.language_id).one() - assert lang.language_code in expected_languages - - -def test_update_document__idempotent( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - import_id = "CCLW.executive.1.2" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - - response = 
data_client.put( - f"/api/v1/admin/documents/{import_id}", - headers=data_superuser_token_headers, - json=payload, - ) - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__works_on_slug( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - slug = "DocSlug1" - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - f"/api/v1/admin/documents/{slug}", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == 200 - json_object = response.json() - assert json_object["md5_sum"] == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert json_object["content_type"] == "updated/content_type" - assert json_object["cdn_object"] == "folder/file" - - # Now Check the db - import_id = "CCLW.executive.1.2" - doc = ( - data_db.query(FamilyDocument) - .filter(FamilyDocument.import_id == import_id) - .one() - .physical_document - ) - assert doc.md5_sum == "c184214e-4870-48e0-adab-3e064b1b0e76" - assert doc.content_type == "updated/content_type" - assert doc.cdn_object == "folder/file" - - -def test_update_document__status_422_when_not_found( - data_client: TestClient, - data_superuser_token_headers: dict[str, str], - data_db: Session, -): - setup_with_two_docs(data_db) - - payload = { - "md5_sum": "c184214e-4870-48e0-adab-3e064b1b0e76", - "content_type": "updated/content_type", - "cdn_object": "folder/file", - } - - response = data_client.put( - "/api/v1/admin/documents/nothing", - headers=data_superuser_token_headers, - json=payload, - ) - - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_document_families.py index 828ba603..e34f6051 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_document_families.py @@ -1,9 +1,13 @@ import pytest from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from fastapi import status from fastapi.testclient import TestClient from sqlalchemy import update from sqlalchemy.orm import Session +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) from tests.non_search.setup_helpers import ( setup_with_docs, setup_with_two_docs, @@ -16,43 +20,41 @@ def test_documents_family_slug_returns_not_found( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - 
assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for FamSlug100" + assert json_response["detail"] == "Nothing found for FamSlug100" def test_documents_family_slug_returns_correct_family( - data_db: Session, - data_client: TestClient, + data_db: Session, data_client: TestClient, valid_token ): setup_with_two_docs(data_db) # Test by slug - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.1001.0" # Ensure a different family is returned - response = data_client.get( - "/api/v1/documents/FamSlug2", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug2", ) - - json_response = response.json() - assert response.status_code == 200 assert json_response["import_id"] == "CCLW.family.2002.0" @@ -124,17 +126,21 @@ def test_documents_family_slug_returns_correct_family( ], ) def test_documents_family_slug_returns_correct_json( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) # Test associations - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - - assert response.status_code == 200 # Verify family data correct. assert len(json_response) == N_FAMILY_KEYS @@ -170,23 +176,20 @@ def test_documents_family_slug_returns_correct_json( def test_documents_family_slug_returns_multiple_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 2 def test_documents_family_slug_returns_only_published_docs( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -196,18 +199,16 @@ def test_documents_family_slug_returns_only_published_docs( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", ) - json_response = response.json() - - assert response.status_code == 200 assert len(json_response["documents"]) == 1 def test_documents_family_slug_returns_404_when_all_docs_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs_one_family(data_db) data_db.execute( @@ -222,29 +223,30 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( ) # Test associations - response = data_client.get( - "/api/v1/documents/FamSlug1", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - - assert response.status_code == 404 assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" def test_documents_doc_slug_returns_not_found( - 
data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_docs(data_db) assert data_db.query(Family).count() == 1 assert data_db.query(FamilyEvent).count() == 1 # Test associations - response = data_client.get( - "/api/v1/documents/DocSlug100", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - assert response.status_code == 404 - assert response.json()["detail"] == "Nothing found for DocSlug100" + assert json_response["detail"] == "Nothing found for DocSlug100" @pytest.mark.parametrize( @@ -307,15 +309,20 @@ def test_documents_doc_slug_returns_not_found( ], ) def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, data_db: Session, slug, expected_fam, expected_doc + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, ): setup_with_two_docs(data_db) - response = data_client.get( - f"/api/v1/documents/{slug}", + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, ) - json_response = response.json() - assert response.status_code == 200 assert len(json_response) == 2 family = json_response["family"] @@ -330,8 +337,7 @@ def test_documents_doc_slug_preexisting_objects( def test_documents_doc_slug_when_deleted( - data_client: TestClient, - data_db: Session, + data_client: TestClient, data_db: Session, valid_token ): setup_with_two_docs(data_db) data_db.execute( @@ -339,9 +345,11 @@ def test_documents_doc_slug_when_deleted( .where(FamilyDocument.import_id == "CCLW.executive.2.2") .values(document_status="Deleted") ) - response = data_client.get( - "/api/v1/documents/DocSlug2", + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, ) - json_response = response.json() - assert response.status_code == 404 assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 0539d7400bffbb04b7bf750e69a0c5e682a6374a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:32:48 +0000 Subject: [PATCH 02/26] Driveby: Add CORS tests for MCF --- tests/non_search/routers/lookups/test_cors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/non_search/routers/lookups/test_cors.py b/tests/non_search/routers/lookups/test_cors.py index b7e15002..3a67e483 100644 --- a/tests/non_search/routers/lookups/test_cors.py +++ b/tests/non_search/routers/lookups/test_cors.py @@ -19,6 +19,13 @@ ("https://app.devclimatepolicyradar.com", False), # prefixed wrong domain ("https://app-climatepolicyradar.com", False), # prefixed wrong domain ("https://prefix-climate-laws.org", False), # climate laws prefixed domain + ("https://.climateprojectexplorer.org", False), # empty subdomain + ("https://prefix-climateprojectexplorer.org", False), # MCF prefixed domain + ("https://climateprojectexplorer.org", True), # base MCF URL + ( + "https://preview.climateprojectexplorer.org", + True, + ), # MCF subdomain URL ], ) def test_cors_regex(test_client, origin, should_be_allowed): From 4a164d8c3a7e50fc9dab5d25fa0fc1d26f83af93 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:13 +0000 Subject: [PATCH 03/26] Update slug lookup query to respect allowed corpora --- app/repository/document.py | 55 +++++++++++++++++++++++++----- 
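A rough sketch of the substitution this commit performs when restricting the slug lookup to allowed corpora; the template string is abbreviated and the slug and corpus ID are invented for illustration (the full query is added in app/repository/sql/slug_lookup.sql later in this patch):

# Hypothetical, abbreviated template standing in for slug_lookup.sql.
template = (
    "SELECT slug.family_document_import_id, slug.family_import_id "
    "FROM slug ... WHERE slug.name = '{slug_name}' "
    "AND corpus.import_id IN ({allowed_corpora_ids})"
)

allowed_corpora_ids = ["CCLW.corpus.i00000001.n0000"]
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"

query = template.replace("{slug_name}", "FamSlug1").replace(
    "{allowed_corpora_ids}", corpora_ids
)
# The rendered query only matches the slug if its family (or its document's
# family) belongs to one of the allowed corpora.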
app/repository/sql/slug_lookup.sql | 20 +++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) create mode 100644 app/repository/sql/slug_lookup.sql diff --git a/app/repository/document.py b/app/repository/document.py index c97d9744..e5e53ed1 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -5,7 +5,9 @@ """ import logging +import os from datetime import datetime +from functools import lru_cache from typing import Optional, Sequence, cast from db_client.models.dfce.collection import Collection, CollectionFamily @@ -40,11 +42,38 @@ _LOGGER = logging.getLogger(__file__) +@lru_cache() +def _get_query_template(): + """Read query for non-deleted docs and their associated data.""" + with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file: + return file.read() + + +def get_slugged_object_from_allowed_corpora_query( + template_query, slug_name: str, allowed_corpora_ids: list[str] +) -> str: + """Create download whole database query, replacing variables. + + :param str ingest_cycle_start: The current ingest cycle date. + :param list[str] allowed_corpora_ids: The corpora from which we + should allow the data to be dumped. + :return str: The SQL query to perform on the database session. + """ + corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" + return template_query.replace("{slug_name}", slug_name).replace( # type: ignore + "{allowed_corpora_ids}", corpora_ids + ) # type: ignore + + def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: """Match the slug name to a FamilyDocument or Family import ID. + This function also contains logic to only get the import ID for the + family or document if the slug given is associated with a family + that belongs to the list of allowed corpora. + :param Session db: connection to db :param str slug: slug name to match :param Optional[list[str]] allowed_corpora: The corpora IDs to look @@ -52,18 +81,28 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( - Slug.name == slug - ) - - # # Only get slug for the fam/doc if it belongs to the list of allowed corpora. 
- # if allowed_corpora is not None: - # query = query.filter() + if allowed_corpora is not None: + query_template = _get_query_template() + query = get_slugged_object_from_allowed_corpora_query( + query_template, slug, allowed_corpora + ) + query = db.execute(query) + else: + query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( + Slug.name == slug + ) result = query.one_or_none() if result is None: return (None, None) - return result + + DOC_INDEX = 0 + doc_id = cast(str, result[DOC_INDEX]) if result[DOC_INDEX] is not None else None + + FAM_INDEX = 1 + fam_id = cast(str, result[FAM_INDEX]) if result[FAM_INDEX] is not None else None + + return doc_id, fam_id def get_family_document_and_context( diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql new file mode 100644 index 00000000..9d649067 --- /dev/null +++ b/app/repository/sql/slug_lookup.sql @@ -0,0 +1,20 @@ +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family ON family.import_id = slug.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}) + +UNION + +SELECT + slug.family_document_import_id, slug.family_import_id +FROM slug +LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id +LEFT JOIN family ON family.import_id = family_document.family_import_id +LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id +LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE slug.name = '{slug_name}' +AND corpus.import_id IN ({allowed_corpora_ids}); From b43aabf8ec6f73426098cd41031297318b4448b1 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:45:38 +0000 Subject: [PATCH 04/26] Include actual CCLW corpus ID in test token --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index d897290d..02c2d85c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,7 +117,7 @@ def valid_token(monkeypatch): def mock_return(_, __, ___): return True - corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0" + corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000" subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" From e2e2d5cbf10a4cc0ff382a62babbdce0b50741c7 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:46:12 +0000 Subject: [PATCH 05/26] Bump to 1.19.11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85e6896a..0a2c8f23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.10" +version = "1.19.11" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 45fa716fc11bac6c70c7a3f4fdac39d081370a97 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:50:52 +0000 Subject: [PATCH 06/26] Refactor _get_query_template --- app/repository/document.py | 13 ++++--------- app/repository/download.py | 8 -------- app/repository/helpers.py | 14 ++++++++++++++ app/repository/pipeline.py | 11 ++--------- 
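A minimal, self-contained sketch of the caching behaviour the shared get_query_template helper relies on (the helper itself is added below in app/repository/helpers.py; the temporary file here just stands in for the real SQL templates):

import tempfile
from functools import lru_cache


@lru_cache()
def get_query_template(filepath: str) -> str:
    """Read a SQL template once and cache it for subsequent callers."""
    with open(filepath, "r") as file:
        return file.read()


# Write a throwaway template so the sketch runs on its own.
with tempfile.NamedTemporaryFile("w", suffix=".sql", delete=False) as tmp:
    tmp.write("SELECT 1;")
    path = tmp.name

first = get_query_template(path)
second = get_query_template(path)
assert first is second  # second call is served from the cache, not from disk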
app/service/download.py | 8 ++++++-- 5 files changed, 26 insertions(+), 28 deletions(-) create mode 100644 app/repository/helpers.py diff --git a/app/repository/document.py b/app/repository/document.py index e5e53ed1..57579a26 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -7,7 +7,6 @@ import logging import os from datetime import datetime -from functools import lru_cache from typing import Optional, Sequence, cast from db_client.models.dfce.collection import Collection, CollectionFamily @@ -36,19 +35,13 @@ LinkableFamily, ) from app.repository.geography import get_geo_subquery +from app.repository.helpers import get_query_template from app.repository.lookups import doc_type_from_family_document_metadata from app.service.util import to_cdn_url _LOGGER = logging.getLogger(__file__) -@lru_cache() -def _get_query_template(): - """Read query for non-deleted docs and their associated data.""" - with open(os.path.join("app", "repository", "sql", "slug_lookup.sql"), "r") as file: - return file.read() - - def get_slugged_object_from_allowed_corpora_query( template_query, slug_name: str, allowed_corpora_ids: list[str] ) -> str: @@ -82,7 +75,9 @@ def get_slugged_objects( import id or the Family import_id. """ if allowed_corpora is not None: - query_template = _get_query_template() + query_template = get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) query = get_slugged_object_from_allowed_corpora_query( query_template, slug, allowed_corpora ) diff --git a/app/repository/download.py b/app/repository/download.py index 69938182..b0381e8d 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -1,7 +1,5 @@ """Functions to support browsing the RDS document structure""" -import os -from functools import lru_cache from logging import getLogger import pandas as pd @@ -12,12 +10,6 @@ _LOGGER = getLogger(__name__) -@lru_cache() -def _get_query_template(): - with open(os.path.join("app", "repository", "sql", "download.sql"), "r") as file: - return file.read() - - def get_whole_database_dump(query, db=Depends(get_db)): with db.connection() as conn: df = pd.read_sql(query, conn.connection) diff --git a/app/repository/helpers.py b/app/repository/helpers.py new file mode 100644 index 00000000..e976683b --- /dev/null +++ b/app/repository/helpers.py @@ -0,0 +1,14 @@ +""" +Functions to support the documents endpoints + +old functions (non DFC) are moved to the deprecated_documents.py file. 
+""" + +from functools import lru_cache + + +@lru_cache() +def get_query_template(filepath: str) -> str: + """Read query for non-deleted docs and their associated data.""" + with open(filepath, "r") as file: + return file.read() diff --git a/app/repository/pipeline.py b/app/repository/pipeline.py index 0eebaff4..d9eac447 100644 --- a/app/repository/pipeline.py +++ b/app/repository/pipeline.py @@ -1,7 +1,6 @@ import logging import os from datetime import datetime, timezone -from functools import lru_cache from typing import Sequence, cast import pandas as pd @@ -11,19 +10,13 @@ from app.clients.db.session import get_db from app.models.document import DocumentParserInput +from app.repository.helpers import get_query_template _LOGGER = logging.getLogger(__name__) MetadataType = dict[str, list[str]] -@lru_cache() -def generate_pipeline_ingest_input_query(): - """Read query for non-deleted docs and their associated data.""" - with open(os.path.join("app", "repository", "sql", "pipeline.sql"), "r") as file: - return file.read() - - def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: """Get non-deleted docs and their associated data from the db. @@ -39,7 +32,7 @@ def get_pipeline_data(db=Depends(get_db)) -> pd.DataFrame: in database. """ _LOGGER.info("Running pipeline query") - query = generate_pipeline_ingest_input_query() + query = get_query_template(os.path.join("app", "repository", "sql", "pipeline.sql")) df = pd.read_sql(query, db.connection().connection) return df diff --git a/app/service/download.py b/app/service/download.py index dad6078d..189b70fb 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,5 +1,6 @@ """Functions to support browsing the RDS document structure""" +import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -9,7 +10,8 @@ from fastapi import Depends from app.clients.db.session import get_db -from app.repository.download import _get_query_template, get_whole_database_dump +from app.repository.download import get_whole_database_dump +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) @@ -90,7 +92,9 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = _get_query_template() + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) csv_buffer = generate_data_dump_as_csv(query, db) From 6010fce97e0ace4aefead684f9d248d1a716d52f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:53:51 +0000 Subject: [PATCH 07/26] Refactor doc and fam lookup tests --- .../documents/test_document_families.py | 48 ----- .../routers/documents/test_get_document.py | 178 ++++++++++++++++++ ...ocument_families.py => test_get_family.py} | 124 ------------ 3 files changed, 178 insertions(+), 172 deletions(-) delete mode 100644 tests/non_search/routers/documents/test_document_families.py create mode 100644 tests/non_search/routers/documents/test_get_document.py rename tests/non_search/routers/documents/{test_get_document_families.py => test_get_family.py} (65%) diff --git a/tests/non_search/routers/documents/test_document_families.py b/tests/non_search/routers/documents/test_document_families.py deleted file mode 100644 index 8478fc50..00000000 --- 
a/tests/non_search/routers/documents/test_document_families.py +++ /dev/null @@ -1,48 +0,0 @@ -from db_client.models.document.physical_document import PhysicalDocumentLanguage -from fastapi.testclient import TestClient -from sqlalchemy import update -from sqlalchemy.orm import Session - -from tests.non_search.routers.documents.setup_doc_fam_lookup import ( - _make_doc_fam_lookup_request, -) -from tests.non_search.setup_helpers import ( - setup_with_two_docs, - setup_with_two_docs_multiple_languages, -) - - -def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == ["eng"] - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") - document = json_response["document"] - assert document["languages"] == [] - - -def test_physical_doc_languages_not_visible( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert document["languages"] == [] - - -def test_physical_doc_multiple_languages( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs_multiple_languages(data_db) - - json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") - document = json_response["document"] - print(json_response) - assert set(document["languages"]) == set(["fra", "eng"]) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py new file mode 100644 index 00000000..f7b0f665 --- /dev/null +++ b/tests/non_search/routers/documents/test_get_document.py @@ -0,0 +1,178 @@ +import pytest +from db_client.models.dfce.family import Family, FamilyDocument, FamilyEvent +from db_client.models.document.physical_document import PhysicalDocumentLanguage +from fastapi import status +from fastapi.testclient import TestClient +from sqlalchemy import update +from sqlalchemy.orm import Session + +from tests.non_search.routers.documents.setup_doc_fam_lookup import ( + _make_doc_fam_lookup_request, +) +from tests.non_search.setup_helpers import ( + setup_with_docs, + setup_with_two_docs, + setup_with_two_docs_multiple_languages, +) + +N_FAMILY_OVERVIEW_KEYS = 8 +N_DOCUMENT_KEYS = 12 + + +def test_physical_doc_languages(data_client: TestClient, data_db: Session, valid_token): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == ["eng"] + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug2") + document = json_response["document"] + assert document["languages"] == [] + + +def test_physical_doc_languages_not_visible( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute(update(PhysicalDocumentLanguage).values(visible=False)) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert document["languages"] == [] + + +def 
test_physical_doc_multiple_languages( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs_multiple_languages(data_db) + + json_response = _make_doc_fam_lookup_request(data_client, valid_token, "DocSlug1") + document = json_response["document"] + print(json_response) + assert set(document["languages"]) == set(["fra", "eng"]) + + +def test_documents_doc_slug_returns_not_found( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_docs(data_db) + assert data_db.query(Family).count() == 1 + assert data_db.query(FamilyEvent).count() == 1 + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug100", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug100" + + +@pytest.mark.parametrize( + ("slug", "expected_fam", "expected_doc"), + [ + ( + "DocSlug1", + { + "title": "Fam1", + "import_id": "CCLW.family.1001.0", + "geographies": ["South Asia"], + "category": "Executive", + "slug": "FamSlug1", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.1.2", + "variant": "Original Language", + "slug": "DocSlug1", + "title": "Document1", + "md5_sum": "111", + "cdn_object": None, + "content_type": "application/pdf", + "source_url": "http://somewhere1", + "language": "eng", + "languages": ["eng"], + "document_type": "Plan", + "document_role": "MAIN", + }, + ), + ( + "DocSlug2", + { + "title": "Fam2", + "import_id": "CCLW.family.2002.0", + "geographies": ["AFG", "IND"], + "category": "Executive", + "slug": "FamSlug2", + "corpus_id": "CCLW.corpus.i00000001.n0000", + "published_date": "2019-12-25T00:00:00Z", + "last_updated_date": "2019-12-25T00:00:00Z", + }, + { + "import_id": "CCLW.executive.2.2", + "variant": None, + "slug": "DocSlug2", + "title": "Document2", + "md5_sum": None, + "cdn_object": None, + "content_type": None, + "source_url": "http://another_somewhere", + "language": "", + "languages": [], + "document_type": "Order", + "document_role": "MAIN", + }, + ), + ], +) +def test_documents_doc_slug_preexisting_objects( + data_client: TestClient, + data_db: Session, + slug, + expected_fam, + expected_doc, + valid_token, +): + setup_with_two_docs(data_db) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + slug, + ) + assert len(json_response) == 2 + + family = json_response["family"] + assert family + assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS + assert family == expected_fam + + doc = json_response["document"] + assert doc + assert len(doc) == N_DOCUMENT_KEYS + assert doc == expected_doc + + +def test_documents_doc_slug_when_deleted( + data_client: TestClient, data_db: Session, valid_token +): + setup_with_two_docs(data_db) + data_db.execute( + update(FamilyDocument) + .where(FamilyDocument.import_id == "CCLW.executive.2.2") + .values(document_status="Deleted") + ) + + json_response = _make_doc_fam_lookup_request( + data_client, + valid_token, + "DocSlug2", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" diff --git a/tests/non_search/routers/documents/test_get_document_families.py b/tests/non_search/routers/documents/test_get_family.py similarity index 65% rename from tests/non_search/routers/documents/test_get_document_families.py rename to 
tests/non_search/routers/documents/test_get_family.py index e34f6051..ecdc620f 100644 --- a/tests/non_search/routers/documents/test_get_document_families.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -15,7 +15,6 @@ ) N_FAMILY_KEYS = 15 -N_FAMILY_OVERVIEW_KEYS = 8 N_DOCUMENT_KEYS = 12 @@ -230,126 +229,3 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" - - -def test_documents_doc_slug_returns_not_found( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_docs(data_db) - assert data_db.query(Family).count() == 1 - assert data_db.query(FamilyEvent).count() == 1 - - # Test associations - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug100", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "Nothing found for DocSlug100" - - -@pytest.mark.parametrize( - ("slug", "expected_fam", "expected_doc"), - [ - ( - "DocSlug1", - { - "title": "Fam1", - "import_id": "CCLW.family.1001.0", - "geographies": ["South Asia"], - "category": "Executive", - "slug": "FamSlug1", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.1.2", - "variant": "Original Language", - "slug": "DocSlug1", - "title": "Document1", - "md5_sum": "111", - "cdn_object": None, - "content_type": "application/pdf", - "source_url": "http://somewhere1", - "language": "eng", - "languages": ["eng"], - "document_type": "Plan", - "document_role": "MAIN", - }, - ), - ( - "DocSlug2", - { - "title": "Fam2", - "import_id": "CCLW.family.2002.0", - "geographies": ["AFG", "IND"], - "category": "Executive", - "slug": "FamSlug2", - "corpus_id": "CCLW.corpus.i00000001.n0000", - "published_date": "2019-12-25T00:00:00Z", - "last_updated_date": "2019-12-25T00:00:00Z", - }, - { - "import_id": "CCLW.executive.2.2", - "variant": None, - "slug": "DocSlug2", - "title": "Document2", - "md5_sum": None, - "cdn_object": None, - "content_type": None, - "source_url": "http://another_somewhere", - "language": "", - "languages": [], - "document_type": "Order", - "document_role": "MAIN", - }, - ), - ], -) -def test_documents_doc_slug_preexisting_objects( - data_client: TestClient, - data_db: Session, - slug, - expected_fam, - expected_doc, - valid_token, -): - setup_with_two_docs(data_db) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - slug, - ) - assert len(json_response) == 2 - - family = json_response["family"] - assert family - assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS - assert family == expected_fam - - doc = json_response["document"] - assert doc - assert len(doc) == N_DOCUMENT_KEYS - assert doc == expected_doc - - -def test_documents_doc_slug_when_deleted( - data_client: TestClient, data_db: Session, valid_token -): - setup_with_two_docs(data_db) - data_db.execute( - update(FamilyDocument) - .where(FamilyDocument.import_id == "CCLW.executive.2.2") - .values(document_status="Deleted") - ) - - json_response = _make_doc_fam_lookup_request( - data_client, - valid_token, - "DocSlug2", - expected_status_code=status.HTTP_404_NOT_FOUND, - ) - assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" From 7fee18f2a90a9c00fce394e013ca431c7c9025a6 Mon Sep 17 00:00:00 2001 From: Katy Baulch 
<46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:19 +0000 Subject: [PATCH 08/26] Add integration tests for doc/fam lookup when corpora mismatch --- .../routers/documents/test_get_document.py | 15 +++++++++++++++ .../routers/documents/test_get_family.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tests/non_search/routers/documents/test_get_document.py b/tests/non_search/routers/documents/test_get_document.py index f7b0f665..dafa1642 100644 --- a/tests/non_search/routers/documents/test_get_document.py +++ b/tests/non_search/routers/documents/test_get_document.py @@ -176,3 +176,18 @@ def test_documents_doc_slug_when_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "The document CCLW.executive.2.2 is not published" + + +def test_documents_doc_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "DocSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for DocSlug1" diff --git a/tests/non_search/routers/documents/test_get_family.py b/tests/non_search/routers/documents/test_get_family.py index ecdc620f..e149f2ca 100644 --- a/tests/non_search/routers/documents/test_get_family.py +++ b/tests/non_search/routers/documents/test_get_family.py @@ -229,3 +229,18 @@ def test_documents_family_slug_returns_404_when_all_docs_deleted( expected_status_code=status.HTTP_404_NOT_FOUND, ) assert json_response["detail"] == "Family CCLW.family.1001.0 is not published" + + +def test_documents_family_slug_returns_404_when_corpora_mismatch( + data_client: TestClient, data_db: Session, alternative_token +): + setup_with_two_docs_one_family(data_db) + + # Test associations + json_response = _make_doc_fam_lookup_request( + data_client, + alternative_token, + "FamSlug1", + expected_status_code=status.HTTP_404_NOT_FOUND, + ) + assert json_response["detail"] == "Nothing found for FamSlug1" From 2a830dbe9e8d1b208f3592d9b55d9eef04f87e11 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:11:31 +0000 Subject: [PATCH 09/26] Add alternative corpora token --- tests/conftest.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 02c2d85c..6d6fb4cb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,6 +127,30 @@ def mock_return(_, __, ___): return af.create_configuration_token(input_str) +@pytest.fixture +def alternative_token(monkeypatch): + """Generate a valid alternative config token using TOKEN_SECRET_KEY. + + Need to generate the config token using the token secret key from + your local env file. For tests in CI, this will be the secret key in + the .env.example file, but for local development this secret key + might be different (e.g., the one for staging). This fixture works + around this. 
+ """ + + def mock_return(_, __, ___): + return True + + corpora_ids = "UNFCCC.corpus.i00000001.n0000" + subject = "CPR" + audience = "localhost" + input_str = f"{corpora_ids};{subject};{audience}" + + af = AppTokenFactory() + monkeypatch.setattr(custom_app.AppTokenFactory, "validate", mock_return) + return af.create_configuration_token(input_str) + + @pytest.fixture def create_test_db(): """Create a test database and use it for the whole test session.""" From 024d60bbbbb266b57bcc5a48f4f2ac23e64808b0 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:21:14 +0000 Subject: [PATCH 10/26] Refactor download code --- app/repository/download.py | 29 ++++++++++++++++++++++++++++- app/service/download.py | 33 ++++++--------------------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/app/repository/download.py b/app/repository/download.py index b0381e8d..1ed90396 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -1,16 +1,43 @@ """Functions to support browsing the RDS document structure""" +import os from logging import getLogger import pandas as pd from fastapi import Depends from app.clients.db.session import get_db +from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def get_whole_database_dump(query, db=Depends(get_db)): +def create_query( + template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] +) -> str: + """Create download whole database query, replacing variables. + + :param str ingest_cycle_start: The current ingest cycle date. + :param list[str] allowed_corpora_ids: The corpora from which we + should allow the data to be dumped. + :return str: The SQL query to perform on the database session. + """ + corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" + return template_query.replace( # type: ignore + "{ingest_cycle_start}", ingest_cycle_start + ).replace( + "{allowed_corpora_ids}", corpora_ids + ) # type: ignore + + +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + query_template = get_query_template( + os.path.join("app", "repository", "sql", "download.sql") + ) + query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) + with db.connection() as conn: df = pd.read_sql(query, conn.connection) return df diff --git a/app/service/download.py b/app/service/download.py index 189b70fb..d5110b01 100644 --- a/app/service/download.py +++ b/app/service/download.py @@ -1,6 +1,5 @@ """Functions to support browsing the RDS document structure""" -import os import zipfile from io import BytesIO, StringIO from logging import getLogger @@ -11,29 +10,10 @@ from app.clients.db.session import get_db from app.repository.download import get_whole_database_dump -from app.repository.helpers import get_query_template _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. 
- """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def replace_slug_with_qualified_url( df: pd.DataFrame, public_app_url: str, @@ -63,8 +43,10 @@ def convert_dump_to_csv(df: pd.DataFrame): return csv_buffer -def generate_data_dump_as_csv(query, db=Depends(get_db)): - df = get_whole_database_dump(query, db) +def generate_data_dump_as_csv( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + df = get_whole_database_dump(ingest_cycle_start, allowed_corpora_ids, db) csv = convert_dump_to_csv(df) csv.seek(0) return csv @@ -92,11 +74,8 @@ def create_data_download_zip_archive( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): readme_buffer = generate_data_dump_readme(ingest_cycle_start) - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") - ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) - csv_buffer = generate_data_dump_as_csv(query, db) + + csv_buffer = generate_data_dump_as_csv(ingest_cycle_start, allowed_corpora_ids, db) zip_buffer = BytesIO() with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: From c4a50b5d8b1615f46accc1cb448dccc53988b559 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:17:24 +0000 Subject: [PATCH 11/26] Geo --- app/api/api_v1/routers/geographies.py | 21 ++- app/repository/document.py | 24 ++-- app/repository/geography.py | 75 +++++++++- app/repository/sql/world_map.sql | 86 +++++++++++ makefile-docker.defs | 2 +- tests/conftest.py | 4 +- tests/non_search/conftest.py | 45 ------ .../geographies/setup_world_map_helpers.py | 61 +++++++- .../geographies/test_world_map_summary.py | 136 +++++++++++++----- 9 files changed, 348 insertions(+), 106 deletions(-) create mode 100644 app/repository/sql/world_map.sql delete mode 100644 tests/non_search/conftest.py diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/geographies.py index 6eff0cda..b92084d3 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/geographies.py @@ -1,11 +1,13 @@ import logging +from typing import Annotated -from fastapi import APIRouter, Depends, HTTPException, status +from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from app.clients.db.session import get_db from app.errors import RepositoryError from app.models.geography import GeographyStatsDTO from app.repository.geography import get_world_map_stats +from app.service.custom_app import AppTokenFactory _LOGGER = logging.getLogger(__file__) @@ -13,12 +15,23 @@ @geographies_router.get("/geographies", response_model=list[GeographyStatsDTO]) -async def geographies(db=Depends(get_db)): +async def geographies( + request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) +): """Get a summary of fam stats for all geographies for world map.""" - _LOGGER.info("Getting detailed information on all geographies") + _LOGGER.info( + "Getting world map counts for all geographies", + extra={ + "props": {"app_token": str(app_token)}, + }, + ) + + # Decode the app token and validate it. 
+ token = AppTokenFactory() + token.decode_and_validate(db, request, app_token) try: - world_map_stats = get_world_map_stats(db) + world_map_stats = get_world_map_stats(db, token.allowed_corpora_ids) if world_map_stats == []: _LOGGER.error("No stats for world map found") diff --git a/app/repository/document.py b/app/repository/document.py index 57579a26..8a3075f2 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -23,6 +23,7 @@ from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation from sqlalchemy import func +from sqlalchemy.engine import Result from sqlalchemy.orm import Session from app.models.document import ( @@ -43,8 +44,8 @@ def get_slugged_object_from_allowed_corpora_query( - template_query, slug_name: str, allowed_corpora_ids: list[str] -) -> str: + db: Session, slug_name: str, allowed_corpora_ids: list[str] +) -> Result: """Create download whole database query, replacing variables. :param str ingest_cycle_start: The current ingest cycle date. @@ -52,10 +53,13 @@ def get_slugged_object_from_allowed_corpora_query( should allow the data to be dumped. :return str: The SQL query to perform on the database session. """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace("{slug_name}", slug_name).replace( # type: ignore - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore + query_template = get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + return db.execute( + query_template, + {"slug_name": slug_name, "allowed_corpora_ids": allowed_corpora_ids}, + ) def get_slugged_objects( @@ -75,13 +79,7 @@ def get_slugged_objects( import id or the Family import_id. """ if allowed_corpora is not None: - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") - ) - query = get_slugged_object_from_allowed_corpora_query( - query_template, slug, allowed_corpora - ) - query = db.execute(query) + query = get_slugged_object_from_allowed_corpora_query(db, slug, allowed_corpora) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/geography.py b/app/repository/geography.py index e4b23a37..d0eea962 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -4,13 +4,16 @@ from typing import Optional, Sequence from db_client.models.dfce.family import ( + Corpus, Family, + FamilyCorpus, FamilyDocument, FamilyGeography, FamilyStatus, ) from db_client.models.dfce.geography import Geography from sqlalchemy import func +from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session @@ -63,7 +66,9 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _db_count_fams_in_category_and_geo(db: Session) -> Query: +def _db_count_fams_in_category_and_geo( + db: Session, allowed_corpora: Optional[list[str]] +) -> Query: """ Query the database for the fam count per category per geo. @@ -72,6 +77,8 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: returned. :param Session db: DB Session to perform query on. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. :return Query: A Query object containing the queries to perform. 
""" # Get the required Geography information and cross join each with all of the unique @@ -94,10 +101,18 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: func.count().label("records_count"), ) .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) + # .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) + # .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) .filter(Family.family_status == FamilyStatus.PUBLISHED) .group_by(Family.family_category, FamilyGeography.geography_id) .subquery("counts") ) + # if allowed_corpora is not None: + # counts = counts.where(Corpus.import_id.in_(allowed_corpora)) + # counts = counts.group_by( + # Family.family_category, FamilyGeography.geography_id + # ).subquery("counts") + # _LOGGER.info(counts) # Aggregate family_category counts per geography into a JSONB object, and if a # family_category count is missing, set the count for that category to 0 so each @@ -130,6 +145,9 @@ def _db_count_fams_in_category_and_geo(db: Session) -> Query: ) .order_by(geo_family_combinations.c.display_value) ) + print(query.statement.compile(dialect=postgresql.dialect())) + # print(str(query.statement.compile(compile_kwargs={"literal_binds": True}))) + print(render_query(query, db)) return query @@ -150,15 +168,19 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: ) -def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]: +def get_world_map_stats( + db: Session, allowed_corpora: Optional[list[str]] +) -> list[GeographyStatsDTO]: """ Get a count of fam per category per geography for all geographies. :param db Session: The database session. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. :return list[GeographyStatsDTO]: A list of Geography stats objects """ try: - family_geo_stats = _db_count_fams_in_category_and_geo(db).all() + family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora).all() except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") @@ -168,3 +190,50 @@ def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]: result = [_to_dto(fgs) for fgs in family_geo_stats] return result + + +from datetime import date, datetime, timedelta + +from sqlalchemy.orm import Query + + +def render_query(statement, db_session): + """ + Generate an SQL expression string with bound parameters rendered inline + for the given SQLAlchemy statement. + WARNING: This method of escaping is insecure, incomplete, and for debugging + purposes only. Executing SQL statements with inline-rendered user values is + extremely insecure. 
+ Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query + """ + if isinstance(statement, Query): + statement = statement.statement + dialect = db_session.bind.dialect + + class LiteralCompiler(dialect.statement_compiler): + def visit_bindparam( + self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs + ): + return self.render_literal_value(bindparam.value, bindparam.type) + + def render_array_value(self, val, item_type): + if isinstance(val, list): + return "{}".format( + ",".join([self.render_array_value(x, item_type) for x in val]) + ) + return self.render_literal_value(val, item_type) + + def render_literal_value(self, value, type_): + if isinstance(value, int): + return str(value) + elif isinstance(value, (str, date, datetime, timedelta)): + return "'{}'".format(str(value).replace("'", "''")) + elif isinstance(value, list): + return "'{{{}}}'".format( + ",".join( + [self.render_array_value(x, type_.item_type) for x in value] + ) + ) + return super(LiteralCompiler, self).render_literal_value(value, type_) + + return LiteralCompiler(dialect, statement).process(statement) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql new file mode 100644 index 00000000..09a9db7e --- /dev/null +++ b/app/repository/sql/world_map.sql @@ -0,0 +1,86 @@ +SELECT + geo_family_combinations.display_value AS display_value, + geo_family_combinations.slug AS slug, + geo_family_combinations.value AS value, + jsonb_object_agg ( + geo_family_combinations.family_category, + coalesce(counts.records_count, 0) + ) AS counts +FROM + ( + SELECT + geography.id AS geography_id, + geography.display_value AS display_value, + geography.slug AS slug, + geography.value AS value, + anon_1.family_category AS family_category + FROM + geography, + ( + SELECT DISTINCT + family.family_category AS family_category + FROM + family + ) AS anon_1 + ) AS geo_family_combinations + LEFT OUTER JOIN ( + SELECT + family.family_category AS family_category, + family_geography.geography_id AS geography_id, + count(*) AS records_count + FROM + family + JOIN family_geography ON family.import_id = family_geography.family_import_id + WHERE + CASE + WHEN ( + NOT ( + EXISTS ( + SELECT + 1 + FROM + family_document + WHERE + family.import_id = family_document.family_import_id + ) + ) + ) THEN 'Created' + ELSE CASE + WHEN ( + ( + SELECT + count(family_document.document_status) AS count_1 + FROM + family_document + WHERE + family_document.family_import_id = family.import_id + AND family_document.document_status = 'PUBLISHED' + ) > 0 + ) THEN 'Published' + ELSE CASE + WHEN ( + ( + SELECT + count(family_document.document_status) AS count_2 + FROM + family_document + WHERE + family_document.family_import_id = family.import_id + AND family_document.document_status = 'CREATED' + ) > 0 + ) THEN 'Created' + ELSE 'Deleted' + END + END + END = 'Published' + GROUP BY + family.family_category, + family_geography.geography_id + ) AS counts ON geo_family_combinations.geography_id = counts.geography_id + AND geo_family_combinations.family_category = counts.family_category +GROUP BY + geo_family_combinations.display_value, + geo_family_combinations.slug, + geo_family_combinations.value +ORDER BY + geo_family_combinations.display_value; diff --git a/makefile-docker.defs b/makefile-docker.defs index 276a67f0..bcf2a0a2 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend 
pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/geographies/test_world_map_summary.py ${ARGS} # ---------------------------------- # tasks diff --git a/tests/conftest.py b/tests/conftest.py index 6d6fb4cb..a60cfa3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,7 +117,7 @@ def valid_token(monkeypatch): def mock_return(_, __, ___): return True - corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000" + corpora_ids = "CCLW.corpus.1.0,CCLW.corpus.2.0,CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000" subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" @@ -142,7 +142,7 @@ def mock_return(_, __, ___): return True corpora_ids = "UNFCCC.corpus.i00000001.n0000" - subject = "CPR" + subject = "CCLW" audience = "localhost" input_str = f"{corpora_ids};{subject};{audience}" diff --git a/tests/non_search/conftest.py b/tests/non_search/conftest.py deleted file mode 100644 index 114d68d9..00000000 --- a/tests/non_search/conftest.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest -from db_client.models.dfce import Geography -from db_client.models.organisation import Organisation - - -@pytest.fixture -def summary_geography_family_data(test_db): - geos = [ - Geography( - display_value="A place on the land", slug="a-place-on-the-land", value="XXX" - ), - Geography( - display_value="A place in the sea", slug="a-place-in-the-sea", value="YYY" - ), - ] - organisations = [Organisation(name="test org")] - - test_db.add_all(geos) - test_db.add_all(organisations) - test_db.flush() - - # Now setup the Documents/Families - - ## WORKING HERE - documents = [] - families = [] - - test_db.add_all(documents) - test_db.add_all(families) - test_db.flush() - - # Now some events - events = [] - - test_db.add_all(events) - - test_db.commit() - yield { - "db": test_db, - "docs": documents, - "families": documents, - "geos": geos, - "organisations": organisations, - "events": events, - } diff --git a/tests/non_search/routers/geographies/setup_world_map_helpers.py b/tests/non_search/routers/geographies/setup_world_map_helpers.py index 1752d0e2..b46f9f91 100644 --- a/tests/non_search/routers/geographies/setup_world_map_helpers.py +++ b/tests/non_search/routers/geographies/setup_world_map_helpers.py @@ -1,4 +1,7 @@ +from typing import Optional + from db_client.functions.dfce_helpers import add_collections, add_document, add_families +from fastapi import status from sqlalchemy.orm import Session from tests.non_search.setup_helpers import ( @@ -6,6 +9,26 @@ get_default_documents, ) +WORLD_MAP_STATS_ENDPOINT = "/api/v1/geographies" +TEST_HOST = "http://localhost:3000/" + + +def _make_world_map_lookup_request( + client, + token, + expected_status_code: int = status.HTTP_200_OK, + origin: Optional[str] = TEST_HOST, +): + headers = ( + {"app-token": token} + if origin is None + else {"app-token": token, "origin": origin} + ) + + response = client.get(f"{WORLD_MAP_STATS_ENDPOINT}", headers=headers) + assert response.status_code == expected_status_code, response.text + return response.json() + def _add_published_fams_and_docs(db: Session): # Collection @@ -13,6 +36,27 @@ def _add_published_fams_and_docs(db: Session): add_collections(db, collections=[collection1]) # Family + Document + events + document0 = { + "title": "UnfcccDocument0", + "slug": 
"UnfcccDocSlug0", + "md5_sum": None, + "url": "http://another_somewhere", + "content_type": None, + "import_id": "UNFCCC.executive.3.3", + "language_variant": None, + "status": "PUBLISHED", + "metadata": {"role": ["MAIN"], "type": ["Order"]}, + "languages": [], + "events": [ + { + "import_id": "UNFCCC.Event.3.0", + "title": "Published", + "date": "2019-12-25", + "type": "Passed/Approved", + "status": "OK", + } + ], + } document1, document2 = get_default_documents() document3 = { "title": "Document3", @@ -36,6 +80,20 @@ def _add_published_fams_and_docs(db: Session): ], } + unfccc_fam = { + "import_id": "UNFCCC.family.0000.0", + "corpus_import_id": "UNFCCC.corpus.i00000001.n0000", + "title": "UnfcccFam0", + "slug": "UnfcccFamSlug0", + "description": "Summary0", + "geography_id": [2, 5], + "category": "UNFCCC", + "documents": [], + "metadata": { + "size": "small", + "color": "blue", + }, + } family0 = { "import_id": "CCLW.family.0000.0", "corpus_import_id": "CCLW.corpus.i00000001.n0000", @@ -93,11 +151,12 @@ def _add_published_fams_and_docs(db: Session): }, } + unfccc_fam["documents"] = [document0] family0["documents"] = [] family1["documents"] = [document1] family2["documents"] = [document2] family3["documents"] = [document3] - add_families(db, families=[family0, family1, family2, family3]) + add_families(db, families=[family0, family1, family2, family3, unfccc_fam]) def setup_all_docs_published_world_map(db: Session): diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index 92bea770..c60c6d48 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -1,9 +1,16 @@ import pytest -from db_client.models.dfce.family import Family, FamilyGeography, FamilyStatus +from db_client.models.dfce.family import ( + Corpus, + Family, + FamilyCorpus, + FamilyGeography, + FamilyStatus, +) from db_client.models.dfce.geography import Geography from fastapi import status from tests.non_search.routers.geographies.setup_world_map_helpers import ( + _make_world_map_lookup_request, setup_all_docs_published_world_map, setup_mixed_doc_statuses_world_map, ) @@ -11,12 +18,8 @@ EXPECTED_NUM_FAM_CATEGORIES = 3 -def _get_expected_keys(): - return ["display_name", "iso_code", "slug", "family_counts"] - - -def _url_under_test() -> str: - return "/api/v1/geographies" +def _test_has_expected_keys(keys: list[str]) -> bool: + return set(["display_name", "iso_code", "slug", "family_counts"]) == set(keys) def _find_geography_index(lst, key, value): @@ -34,8 +37,8 @@ def test_geo_table_populated(data_db): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 1), - ("Afghanistan", 0, 0, 1), + ("India", 1, 1, 2), + ("Afghanistan", 0, 0, 2), ], ) def test_endpoint_returns_ok_all_docs_per_family_published( @@ -45,20 +48,18 @@ def test_endpoint_returns_ok_all_docs_per_family_published( expected_exec, expected_leg, expected_unfccc, + valid_token, ): """Check endpoint returns 200 on success""" setup_all_docs_published_world_map(data_db) - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_200_OK - resp_json = response.json() + + resp_json = _make_world_map_lookup_request(data_client, valid_token) assert len(resp_json) > 1 idx = _find_geography_index(resp_json, "display_name", geo_display_value) resp = resp_json[idx] - assert set(["display_name", 
"iso_code", "slug", "family_counts"]) == set( - resp.keys() - ) + assert _test_has_expected_keys(resp.keys()) family_geos = ( data_db.query(Family) @@ -72,20 +73,20 @@ def test_endpoint_returns_ok_all_docs_per_family_published( assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(family_geos) + assert resp["family_counts"]["EXECUTIVE"] == expected_exec + assert resp["family_counts"]["LEGISLATIVE"] == expected_leg + assert resp["family_counts"]["UNFCCC"] == expected_unfccc assert ( sum(resp["family_counts"].values()) == expected_exec + expected_leg + expected_unfccc ) - assert resp["family_counts"]["EXECUTIVE"] == expected_exec - assert resp["family_counts"]["LEGISLATIVE"] == expected_leg - assert resp["family_counts"]["UNFCCC"] == expected_unfccc @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 2), - ("Afghanistan", 0, 0, 1), + ("India", 0, 0, 2), + ("Afghanistan", 0, 0, 2), ], ) def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( @@ -95,20 +96,18 @@ def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( expected_exec, expected_leg, expected_unfccc, + valid_token, ): """Check endpoint returns 200 & discounts CREATED & DELETED docs""" setup_mixed_doc_statuses_world_map(data_db) - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_200_OK - resp_json = response.json() + + resp_json = _make_world_map_lookup_request(data_client, valid_token) assert len(resp_json) > 1 idx = _find_geography_index(resp_json, "display_name", geo_display_value) resp = resp_json[idx] - assert set(["display_name", "iso_code", "slug", "family_counts"]) == set( - resp.keys() - ) + assert _test_has_expected_keys(resp.keys()) fams = ( data_db.query(Family) @@ -122,27 +121,90 @@ def test_endpoint_returns_ok_some_docs_per_published_family_unpublished( assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(fams) + assert resp["family_counts"]["EXECUTIVE"] == expected_exec + assert resp["family_counts"]["LEGISLATIVE"] == expected_leg + assert resp["family_counts"]["UNFCCC"] == expected_unfccc assert ( sum(resp["family_counts"].values()) == expected_exec + expected_leg + expected_unfccc ) - assert resp["family_counts"]["EXECUTIVE"] == expected_exec - assert resp["family_counts"]["LEGISLATIVE"] == expected_leg - assert resp["family_counts"]["UNFCCC"] == expected_unfccc -def test_endpoint_returns_404_when_not_found(data_client): +def test_endpoint_returns_404_when_not_found(data_client, valid_token): """Test the endpoint returns a 404 when no world map stats found""" - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_404_NOT_FOUND - data = response.json() + data = _make_world_map_lookup_request( + data_client, valid_token, expected_status_code=status.HTTP_404_NOT_FOUND + ) assert data["detail"] == "No stats for world map found" @pytest.mark.skip(reason="Bad repo and rollback mocks need rewriting") -def test_endpoint_returns_503_when_error(data_client): +def test_endpoint_returns_503_when_error(data_client, valid_token): """Test the endpoint returns a 503 on db error""" - response = data_client.get(_url_under_test()) - assert response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE - data = response.json() + data = _make_world_map_lookup_request( + data_client, + valid_token, + 
expected_status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+    )
     assert data["detail"] == "Database error"
+
+
+@pytest.mark.parametrize(
+    ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"),
+    [
+        ("India", 0, 0, 1),
+        ("Afghanistan", 0, 0, 1),
+    ],
+)
+def test_endpoint_returns_different_results_with_alt_token(
+    data_db,
+    data_client,
+    geo_display_value,
+    expected_exec,
+    expected_leg,
+    expected_unfccc,
+    alternative_token,
+):
+    """Check endpoint returns 200 & only counts UNFCCC docs"""
+    setup_all_docs_published_world_map(data_db)
+
+    fam = (
+        data_db.query(Family, FamilyCorpus.corpus_import_id)
+        .filter(Family.import_id == "UNFCCC.family.0000.0")
+        .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id)
+        .one()
+    )
+    assert fam
+    print(fam)
+
+    resp_json = _make_world_map_lookup_request(data_client, alternative_token)
+    assert len(resp_json) > 1
+
+    idx = _find_geography_index(resp_json, "display_name", geo_display_value)
+    resp = resp_json[idx]
+
+    assert _test_has_expected_keys(resp.keys())
+
+    fams = (
+        data_db.query(Family.import_id)
+        .filter(Family.family_status == FamilyStatus.PUBLISHED)
+        .filter(Geography.display_value == geo_display_value)
+        .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id)
+        .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id)
+        .join(Corpus, Corpus.import_id == FamilyCorpus.corpus_import_id)
+        .join(Geography, Geography.id == FamilyGeography.geography_id)
+        .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000")
+        .all()
+    )
+    print(fams)
+
+    assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES
+    assert sum(resp["family_counts"].values()) == len(fams)
+
+    assert resp["family_counts"]["EXECUTIVE"] == expected_exec
+    assert resp["family_counts"]["LEGISLATIVE"] == expected_leg
+    assert resp["family_counts"]["UNFCCC"] == expected_unfccc
+    assert (
+        sum(resp["family_counts"].values())
+        == expected_exec + expected_leg + expected_unfccc
+    )
From 84800988649d4332de738a884fc4048dcbd422dd Mon Sep 17 00:00:00 2001
From: Katy Baulch <46493669+katybaulch@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:59:33 +0000
Subject: [PATCH 12/26] Merge in main

---
 .git-blame-ignore-revs | 4 +
 .trunk/configs/.sqlfluff | 30 +
 .trunk/trunk.yaml | 50 ++
 app/repository/document.py | 47 +-
 app/repository/download.py | 46 +-
 app/repository/geography.py | 51 --
 app/repository/helpers.py | 6 +-
 app/repository/sql/download.sql | 596 +++++++++++-------
 app/repository/sql/pipeline.sql | 503 ++++++++-------
 app/repository/sql/slug_lookup.sql | 43 +-
 app/repository/sql/world_map.sql | 82 +--
 makefile-docker.defs | 2 +-
 poetry.lock | 10 +-
 pyproject.toml | 4 +-
 .../schemas/document_passage.sd | 66 +-
 .../schemas/family_document.sd | 92 +++
 16 files changed, 979 insertions(+), 653 deletions(-)
 create mode 100644 .trunk/configs/.sqlfluff

diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 9a66c0e8..8c4c0200 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -23,3 +23,7 @@
 
 # Updating the test data file for document passages to be indent=2
 44624dcd1fa0835708bd9187a39bb0da8a31cd03
+
+# Fix SQL query formatting
+047766a85f086fc0986a6f2b49fee9d73fa219e8
+ab3476708920c5760f058ec40d14d008f94f5bad
diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff
new file mode 100644
index 00000000..de7aacd3
--- /dev/null
+++ b/.trunk/configs/.sqlfluff
@@ -0,0 +1,30 @@
+[sqlfluff]
+dialect = postgres
+exclude_rules = LT02, LT09
+
+[sqlfluff:indentation]
+indented_ctes = True + +[sqlfluff:layout:type:colon] +spacing_before = single +spacing_after = single + +[sqlfluff:layout:type:parameter] +spacing_before = touch +spacing_after = any + +[sqlfluff:rules:references.special_chars] +allow_space_in_identifier = True +additional_allowed_characters = ["/", "_", "-", "(", ")"] + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 6d746ff9..31ff5439 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,14 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sqlfluff + runtime: python + package: sqlfluff + shims: [sqlfluff] + known_good_version: 1.4.5 + # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -27,6 +35,7 @@ lint: disabled: - hadolint - oxipng + definitions: - name: bandit direct_configs: [bandit.yaml] @@ -34,6 +43,45 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sqlfluff + files: [sql, sql-j2, dml, ddl] + tools: [sqlfluff] + description: A dialect-flexible and configurable SQL linter + known_good_version: 1.4.5 + direct_configs: + - .sqlfluff + affects_cache: + - pyproject.toml + suggest_if: config_present + commands: + - name: lint + run: sqlfluff lint ${target} --format json --nofail + output: sarif + success_codes: [0] + read_output_from: stdout + parser: + runtime: python + run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + + - name: fix + version: ">=3.0.0" + run: sqlfluff fix ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + + - name: format + run: sqlfluff format ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + ignore: - linters: [ALL] paths: @@ -45,6 +93,8 @@ lint: - LICENSE.md enabled: + - sqlfluff@3.2.5: + commands: [lint, fix, format] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 diff --git a/app/repository/document.py b/app/repository/document.py index 8a3075f2..73d19d47 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Database helper functions for the documents entity.""" import logging import os @@ -22,9 +18,9 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import func -from sqlalchemy.engine import Result +from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session +from sqlalchemy.types import ARRAY, String from app.models.document import ( CollectionOverviewResponse, @@ -43,25 +39,6 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_object_from_allowed_corpora_query( - db: Session, slug_name: str, allowed_corpora_ids: list[str] -) -> Result: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. 
- :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. - """ - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") - ) - return db.execute( - query_template, - {"slug_name": slug_name, "allowed_corpora_ids": allowed_corpora_ids}, - ) - - def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: @@ -78,8 +55,22 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - if allowed_corpora is not None: - query = get_slugged_object_from_allowed_corpora_query(db, slug, allowed_corpora) + if allowed_corpora not in [None, []]: + query_template = text( + get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + ) + + query_template = query_template.bindparams( + bindparam("slug_name", type_=String), + bindparam( + "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String) + ), + ) + query = db.execute( + query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} + ) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/download.py b/app/repository/download.py index 1ed90396..33592cc0 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -5,6 +5,8 @@ import pandas as pd from fastapi import Depends +from sqlalchemy import bindparam, text +from sqlalchemy.types import ARRAY, DATETIME, String from app.clients.db.session import get_db from app.repository.helpers import get_query_template @@ -12,32 +14,34 @@ _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we + :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. - :return str: The SQL query to perform on the database session. + :return pd.DataFrame: A DataFrame containing the results of the SQL + query that gets the whole database dump in our desired format. 
""" - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - -def get_whole_database_dump( - ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) -): - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") + query = text( + get_query_template(os.path.join("app", "repository", "sql", "download.sql")) + ).bindparams( + bindparam("ingest_cycle_start", type_=DATETIME), + bindparam( + "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String) + ), ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) with db.connection() as conn: - df = pd.read_sql(query, conn.connection) + result = conn.execute( + query, + { + "ingest_cycle_start": ingest_cycle_start, + "allowed_corpora_ids": allowed_corpora_ids, + }, + ) + columns = result.keys() + df = pd.DataFrame(result.fetchall(), columns=columns) return df diff --git a/app/repository/geography.py b/app/repository/geography.py index d0eea962..b035fd70 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -4,9 +4,7 @@ from typing import Optional, Sequence from db_client.models.dfce.family import ( - Corpus, Family, - FamilyCorpus, FamilyDocument, FamilyGeography, FamilyStatus, @@ -146,8 +144,6 @@ def _db_count_fams_in_category_and_geo( .order_by(geo_family_combinations.c.display_value) ) print(query.statement.compile(dialect=postgresql.dialect())) - # print(str(query.statement.compile(compile_kwargs={"literal_binds": True}))) - print(render_query(query, db)) return query @@ -190,50 +186,3 @@ def get_world_map_stats( result = [_to_dto(fgs) for fgs in family_geo_stats] return result - - -from datetime import date, datetime, timedelta - -from sqlalchemy.orm import Query - - -def render_query(statement, db_session): - """ - Generate an SQL expression string with bound parameters rendered inline - for the given SQLAlchemy statement. - WARNING: This method of escaping is insecure, incomplete, and for debugging - purposes only. Executing SQL statements with inline-rendered user values is - extremely insecure. 
- Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query - """ - if isinstance(statement, Query): - statement = statement.statement - dialect = db_session.bind.dialect - - class LiteralCompiler(dialect.statement_compiler): - def visit_bindparam( - self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs - ): - return self.render_literal_value(bindparam.value, bindparam.type) - - def render_array_value(self, val, item_type): - if isinstance(val, list): - return "{}".format( - ",".join([self.render_array_value(x, item_type) for x in val]) - ) - return self.render_literal_value(val, item_type) - - def render_literal_value(self, value, type_): - if isinstance(value, int): - return str(value) - elif isinstance(value, (str, date, datetime, timedelta)): - return "'{}'".format(str(value).replace("'", "''")) - elif isinstance(value, list): - return "'{{{}}}'".format( - ",".join( - [self.render_array_value(x, type_.item_type) for x in value] - ) - ) - return super(LiteralCompiler, self).render_literal_value(value, type_) - - return LiteralCompiler(dialect, statement).process(statement) diff --git a/app/repository/helpers.py b/app/repository/helpers.py index e976683b..958e0b38 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Helper functions for the repository layer.""" from functools import lru_cache diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 8a807080..15bbf8ac 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,243 +1,355 @@ -WITH -deduplicated_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc -), -unique_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_family_slugs as ( - SELECT - deduplicated_family_slugs.family_import_id as "family_import_id", - deduplicated_family_slugs.created as "created", - deduplicated_family_slugs.name as "name" - FROM deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id as "family_import_id", - unique_family_slugs.created as "created", - unique_family_slugs.name as "name" - FROM unique_family_slugs - order by family_import_id desc, created desc - ), deduplicated_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by 
slug.family_document_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc -), -unique_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_doc_slugs as ( - SELECT - deduplicated_doc_slugs.family_document_import_id - as "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id as "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM unique_doc_slugs - order by family_document_import_id desc, created desc - ), event_dates as ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date, - max(family_event.date::date) last_changed - FROM - family_event - GROUP BY - family_import_id -) +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 
+ ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date, + MAX(family_event.date::DATE) AS last_changed + FROM + family_event + GROUP BY + family_event.family_import_id + ), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ), + +n1 AS ( + SELECT + collection_family.family_import_id, + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN + collection_family + ON collection.import_id = collection_family.collection_import_id + GROUP BY + collection_family.family_import_id + ) + SELECT -ds.name as "Document ID", -p.title as "Document Title", -fs.name as "Family ID", -f.title as "Family Title", -f.description as "Family Summary", -n1.collection_titles as "Collection Title(s)", -n1.collection_descriptions as "Collection Description(s)", -INITCAP(d.valid_metadata::json#>>'{ - role,0}') as -"Document Role", -d.variant_name as "Document Variant", -p.source_url as "Document Content URL", -INITCAP(d.valid_metadata::json#>>'{ - type,0}') as -"Document Type", -CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) -END "Category", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'framework')), ';') -as "Framework", -n2.language as "Language", -o.name as "Source", -fg.geo_isos as "Geography ISOs", -fg.geo_display_values as "Geographies", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'topic')), ';') -as "Topic/Response", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'hazard')), ';') -as "Hazard", -array_to_string(ARRAY( - 
SELECT jsonb_array_elements_text(fm.value->'sector')), ';') -as "Sector", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'keyword')), ';') -as "Keyword", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'instrument')), ';') -as "Instrument", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author')), ';') -as "Author", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author_type')), ';') -as "Author Type", -fp.published_date as "First event in timeline", -fp.last_changed as "Last event in timeline", -n3.event_type_names as "Full timeline of events (types)", -n3.event_dates as "Full timeline of events (dates)", -d.created::date as "Date Added to System", -f.last_modified::date as "Last ModIFied on System", -d.import_id as "Internal Document ID", -f.import_id as "Internal Family ID", -n1.collection_import_ids as "Internal Collection ID(s)" -FROM physical_document p -JOIN family_document d -ON p.id = d.physical_document_id -JOIN family f -ON d.family_import_id = f.import_id -FULL JOIN ( - SELECT - family_geography.family_import_id as "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography - ON geography.id = family_geography.geography_id - GROUP BY family_geography.family_import_id -) fg ON fg.family_import_id=f.import_id -join family_corpus fc -on f.import_id = fc.family_import_id -join corpus c -on fc.corpus_import_id = c.import_id -join organisation o -on c.organisation_id = o.id -join family_metadata fm -on fm.family_import_id = f.import_id -FULL JOIN ( - SELECT - collection_family.family_import_id as "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family - ON collection_family.collection_import_id = collection.import_id - GROUP BY collection_family.family_import_id -) n1 ON n1.family_import_id=f.import_id -left JOIN ( - SELECT - p.id as "id", - string_agg(l.name, ';' ORDER BY l.name) AS language - FROM physical_document p - left join physical_document_language pdl - on pdl.document_id = p.id - left join language l - on l.id = pdl.language_id - GROUP BY p.id -) n2 ON n2.id=d.physical_document_id -FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - string_agg(family_event.date::date::text, ';') AS event_dates - FROM family_event - INNER JOIN family ON family.import_id = family_event.family_import_id - GROUP BY family_event.family_import_id -) n3 ON n3.family_import_id=f.import_id -LEFT JOIN most_recent_doc_slugs ds -on ds.family_document_import_id = d.import_id -LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id -LEFT JOIN event_dates fp on fp.family_import_id = f.import_id -WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids}) -ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + 
n1.collection_descriptions AS "Collection Description(s)", + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + language_agg.display_name AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::DATE AS "Date Added to System", + f.last_modified::DATE AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)", + INITCAP(d.valid_metadata::JSON #>> '{ + role,0}') AS "Document Role", + INITCAP(d.valid_metadata::JSON #>> '{ + type,0}') AS "Document Type", + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END AS "Category", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework') + ), + ';' + ) AS "Framework", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author') + ), + ';' + ) AS "Author", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type" +FROM + physical_document AS p + INNER JOIN family_document AS d ON p.id = d.physical_document_id + INNER JOIN family AS f ON d.family_import_id = f.import_id + FULL JOIN fg ON f.import_id = fg.family_import_id + INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id + INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id + INNER JOIN organisation AS o ON c.organisation_id = o.id + INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id + FULL JOIN n1 ON f.import_id = n1.family_import_id + LEFT JOIN ( + SELECT + p.id, + STRING_AGG( + l.name, + ';' + ORDER BY + l.name + ) AS display_name + FROM + physical_document AS p + LEFT JOIN + physical_document_language AS pdl + ON p.id = pdl.document_id + LEFT JOIN language AS l ON pdl.language_id = l.id + GROUP BY + p.id + ) AS language_agg ON d.physical_document_id = language_agg.id + FULL JOIN ( + SELECT + family_event.family_import_id, + STRING_AGG(family_event.import_id, ';') AS event_import_ids, + STRING_AGG(family_event.title, ';') AS event_titles, + STRING_AGG(family_event.event_type_name, ';') AS event_type_names, + STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates + FROM + family_event + INNER JOIN + family + ON family_event.family_import_id = family.import_id + GROUP BY + family_event.family_import_id + ) AS n3 ON f.import_id = n3.family_import_id + LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id + LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id + LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id 
+WHERE + d.last_modified < :ingest_cycle_start + AND fc.corpus_import_id = ANY(:allowed_corpora_ids) +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id ASC diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index af6023e6..7a5d0e40 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,36 +1,43 @@ -WITH deduplicated_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - Count(*) > 1 ) duplicates - left join + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug - ON duplicates.family_import_id = slug.family_import_id + ON duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, - slug.ctid DESC ), - unique_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + slug.ctid DESC +), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -38,219 +45,235 @@ WITH deduplicated_family_slugs AS ( SELECT GROUP BY slug.family_import_id HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION - ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC +), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC ), deduplicated_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + 
slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 ) duplicates - left join slug - ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION - ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', - 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json#>>'{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" - FROM - physical_document p - join - family_document d - ON p.id = d.physical_document_id - join - family f - ON d.family_import_id = f.import_id full - join - ( - SELECT - family_geography.family_import_id AS "family_import_id", - 
string_agg(geography.value, - ';') AS geo_isos, - string_agg(geography.display_value, - ';') AS geo_display_values - FROM - geography - inner join - family_geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg - ON fg.family_import_id=f.import_id - join - family_corpus fc - ON f.import_id = fc.family_import_id - join - corpus c - ON fc.corpus_import_id = c.import_id - join - organisation o - ON c.organisation_id = o.id - join - family_metadata fm - ON fm.family_import_id = f.import_id - left outer join - ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT(LANGUAGE.name)) AS languages - FROM - family_document - join - physical_document_language - ON physical_document_language.document_id = family_document.physical_document_id - join - LANGUAGE - ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs - ON langs.family_document_import_id = d.import_id - left outer join - ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT(geography.value)) AS geographies - FROM - family_geography - join - geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos - ON geos.family_import_id = f.import_id - left join - most_recent_doc_slugs ds - ON ds.family_document_import_id = d.import_id - left join - most_recent_family_slugs fs - ON fs.family_import_id = f.import_id - left join - event_dates fp - ON fp.family_import_id = f.import_id + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC +), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN + COUNT(*) FILTER ( WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id - ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 + THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_event.family_import_id +), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id +), + +geos AS ( + SELECT + family_geography.family_import_id, + JSON_AGG(DISTINCT geography.value) AS geographies + FROM + family_geography + INNER JOIN geography ON family_geography.geography_id = geography.id + GROUP BY + 
family_geography.family_import_id +) + +SELECT + f.title AS family_title, + p.title AS physical_document_title, + f.description AS family_description, + fp.published_date AS family_published_date, + d.import_id AS family_document_import_id, + ds.name AS family_document_slug, + f.import_id AS family_import_id, + fs.name AS family_slug, + p.source_url AS physical_document_source_url, + o.name AS organisation_name, + geos.geographies, + c.import_id AS corpus_import_id, + c.corpus_type_name, + langs.languages, + fm.value AS family_metadata, + d.valid_metadata AS family_document_metadata, + CASE + WHEN + f.family_category IN ('UNFCCC', 'MCF') + THEN UPPER(f.family_category::TEXT) + ELSE INITCAP(f.family_category::TEXT) + END AS family_category, + d.valid_metadata::JSON #>> '{type,0}' AS family_document_type +FROM + physical_document AS p +INNER JOIN family_document AS d ON p.id = d.physical_document_id +INNER JOIN family AS f ON d.family_import_id = f.import_id +FULL JOIN fg ON f.import_id = fg.family_import_id +INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id +INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id +INNER JOIN organisation AS o ON c.organisation_id = o.id +INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id +LEFT OUTER JOIN ( + SELECT + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT language.name) AS languages + FROM + family_document + INNER JOIN + physical_document_language + ON + family_document.physical_document_id + = physical_document_language.document_id + INNER JOIN + language + ON physical_document_language.language_id = language.id + GROUP BY + family_document.import_id +) AS langs ON d.import_id = langs.family_document_import_id +LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id +LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id +LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id +LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id ASC diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 9d649067..09cb2e69 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,20 +1,33 @@ -SELECT - slug.family_document_import_id, slug.family_import_id +-- First query for family document slugs +SELECT DISTINCT + slug.family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family ON family.import_id = slug.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}) + INNER JOIN family_document + ON slug.family_document_import_id = family_document.import_id + INNER JOIN family + ON family_document.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION -SELECT - slug.family_document_import_id, slug.family_import_id +-- Second query for family slugs +SELECT DISTINCT + NULL AS family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family_document ON 
family_document.import_id = slug.family_document_import_id -LEFT JOIN family ON family.import_id = family_document.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}); + INNER JOIN family + ON slug.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql index 09a9db7e..51307212 100644 --- a/app/repository/sql/world_map.sql +++ b/app/repository/sql/world_map.sql @@ -1,36 +1,13 @@ -SELECT - geo_family_combinations.display_value AS display_value, - geo_family_combinations.slug AS slug, - geo_family_combinations.value AS value, - jsonb_object_agg ( - geo_family_combinations.family_category, - coalesce(counts.records_count, 0) - ) AS counts -FROM - ( - SELECT - geography.id AS geography_id, - geography.display_value AS display_value, - geography.slug AS slug, - geography.value AS value, - anon_1.family_category AS family_category - FROM - geography, - ( - SELECT DISTINCT - family.family_category AS family_category - FROM - family - ) AS anon_1 - ) AS geo_family_combinations - LEFT OUTER JOIN ( +WITH counts AS ( SELECT - family.family_category AS family_category, - family_geography.geography_id AS geography_id, - count(*) AS records_count + family.family_category, + family_geography.geography_id, + COUNT(*) AS records_count FROM family - JOIN family_geography ON family.import_id = family_geography.family_import_id + INNER JOIN + family_geography + ON family.import_id = family_geography.family_import_id WHERE CASE WHEN ( @@ -45,11 +22,10 @@ FROM ) ) ) THEN 'Created' - ELSE CASE - WHEN ( + WHEN ( ( SELECT - count(family_document.document_status) AS count_1 + COUNT(family_document.document_status) AS count_1 FROM family_document WHERE @@ -57,11 +33,10 @@ FROM AND family_document.document_status = 'PUBLISHED' ) > 0 ) THEN 'Published' - ELSE CASE - WHEN ( + WHEN ( ( SELECT - count(family_document.document_status) AS count_2 + COUNT(family_document.document_status) AS count_2 FROM family_document WHERE @@ -69,14 +44,41 @@ FROM AND family_document.document_status = 'CREATED' ) > 0 ) THEN 'Created' - ELSE 'Deleted' - END - END + ELSE 'Deleted' END = 'Published' GROUP BY family.family_category, family_geography.geography_id - ) AS counts ON geo_family_combinations.geography_id = counts.geography_id + ) + +SELECT + geo_family_combinations.display_value, + geo_family_combinations.slug, + geo_family_combinations.value, + JSONB_OBJECT_AGG( + geo_family_combinations.family_category, + COALESCE(counts.records_count, 0) + ) AS counts +FROM + ( + SELECT + geography.id AS geography_id, + geography.display_value, + geography.slug, + geography.value, + anon_1.family_category + FROM + geography, + ( + SELECT DISTINCT + family.family_category + FROM + family + ) AS anon_1 + ) AS geo_family_combinations + LEFT OUTER JOIN + counts + ON geo_family_combinations.geography_id = counts.geography_id AND geo_family_combinations.family_category = counts.family_category GROUP BY geo_family_combinations.display_value, diff --git a/makefile-docker.defs b/makefile-docker.defs index bcf2a0a2..b41a9358 100644 --- a/makefile-docker.defs +++ 
b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/geographies/test_world_map_summary.py ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS} # ---------------------------------- # tasks diff --git a/poetry.lock b/poetry.lock index 2e443fb5..6d497b9e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -702,13 +702,13 @@ files = [ [[package]] name = "cpr-sdk" -version = "1.9.3" +version = "1.9.5" description = "" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "cpr_sdk-1.9.3-py3-none-any.whl", hash = "sha256:1cd725de96d3af7a1f74c5d5eab18b207702944549f945e5392a039242c948b8"}, - {file = "cpr_sdk-1.9.3.tar.gz", hash = "sha256:f7ee60d81b2c9520cae237742582052472b5581601a8bb33862194bae8c4e0e1"}, + {file = "cpr_sdk-1.9.5-py3-none-any.whl", hash = "sha256:dd32806499b5bb44c98be1f4135b88406f1a77abcf60c7d4f61ac740de979da3"}, + {file = "cpr_sdk-1.9.5.tar.gz", hash = "sha256:addc22e557381935ac66c95721312c4a37080fda7419381971cdf6e7cb331fe0"}, ] [package.dependencies] @@ -4250,4 +4250,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "40e1911a3a0b0211027e7326d521d1fc9cdf8aa6c798279f1d5134a4bc2a5f57" +content-hash = "d96225e60602c52c47631f4a12f8519bcb5d1e2f5d2b15c2f57760db6b28c33c" diff --git a/pyproject.toml b/pyproject.toml index 11d675fa..849d3b4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.13" +version = "1.19.15" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] @@ -10,7 +10,7 @@ python = "^3.10" Authlib = "^0.15.5" bcrypt = "^3.2.0" boto3 = "^1.26" -cpr_sdk = { version = "1.9.3", extras = ["vespa"] } +cpr_sdk = { version = "1.9.5", extras = ["vespa"] } fastapi = "^0.104.1" fastapi-health = "^0.4.0" fastapi-pagination = { extras = ["sqlalchemy"], version = "^0.12.19" } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd index 29492a3a..863929db 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd @@ -1,5 +1,10 @@ schema document_passage { + field text_block_not_stemmed type string { + indexing: input text_block | summary | index + stemming: none + } + document document_passage { field search_weights_ref type reference { @@ -134,6 +139,37 @@ schema document_passage { summary concepts {} } + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object 
{} + summary document_source_url {} + summary corpus_import_id {} + summary corpus_type_name {} + summary metadata {} + summary text_block {} + summary text_block_id {} + summary text_block_type {} + summary text_block_page {} + summary text_block_coords {} + summary concepts {} + summary text_block_tokens { + source: text_block + tokens + } + } + rank-profile exact inherits default { function text_score() { expression: attribute(passage_weight) * fieldMatch(text_block) @@ -141,7 +177,17 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() fieldMatch(text_block) + } + + rank-profile exact_not_stemmed inherits default { + function text_score() { + expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed) + } + first-phase { + expression: text_score() + } + match-features: text_score() fieldMatch(text_block) } rank-profile hybrid_no_closeness inherits default { @@ -151,7 +197,7 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) } rank-profile hybrid inherits default { @@ -164,6 +210,20 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) closeness(text_embedding) + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function text_score() { + expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)) + } + first-phase { + expression: text_score() + } + match-features: text_score() bm25(text_block) closeness(text_embedding) } } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd index e62d6df5..e56963b2 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd @@ -1,5 +1,15 @@ schema family_document { + field family_name_not_stemmed type string { + indexing: input family_name_index | index + stemming: none + } + + field family_description_not_stemmed type string { + indexing: input family_description_index | index + stemming: none + } + document family_document { field search_weights_ref type reference { @@ -170,6 +180,19 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile exact_not_stemmed inherits default { + function name_score() { + expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed) + } + function description_score() { + expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } rank-profile hybrid_no_closeness inherits default { function name_score() { @@ -199,6 +222,40 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile hybrid_no_description_embedding inherits default { + inputs { + query(query_embedding) tensor(x[768]) + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + 
description_score() + } + match-features: name_score() description_score() + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } + document-summary search_summary { summary family_name {} @@ -223,4 +280,39 @@ schema family_document { summary collection_title {} summary collection_summary {} } + + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_title {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object {} + summary document_source_url {} + summary metadata {} + summary corpus_import_id {} + summary corpus_type_name {} + summary collection_title {} + summary collection_summary {} + summary family_name_index {} + summary family_name_index_tokens { + source: family_name_index + tokens + } + summary family_description_index {} + summary family_description_index_tokens { + source: family_description_index + tokens + } + from-disk + } } From a941cc19b9487147a1010d9a359de91674136d7a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:01:33 +0000 Subject: [PATCH 13/26] Bump to 1.19.16 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 849d3b4b..b5593f7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.15" +version = "1.19.16" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 7dbf911a4587fa00a51c4cfd13b294bbcfa8a245 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:24:26 +0000 Subject: [PATCH 14/26] Fix docstring param name --- app/repository/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/repository/download.py b/app/repository/download.py index 33592cc0..85dd1c47 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -20,7 +20,7 @@ def get_whole_database_dump( """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] corpora_ids: The corpora from which we + :param list[str] allowed_corpora_ids: The corpora from which we should allow the data to be dumped. :return pd.DataFrame: A DataFrame containing the results of the SQL query that gets the whole database dump in our desired format. 
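A note on the Vespa schema changes above: the new hybrid_custom_weight rank profile reads a bm25_weight value from the query, and the new search_summary_with_tokens document summary returns the stored token form of the text fields alongside the usual summary fields, so both are opt-in per request. The sketch below shows roughly how such a request could look against a locally running instance of this application package; the endpoint URL, the query text and the zero-filled embedding are placeholders for illustration only and are not part of these patches.

    # Illustrative sketch only: exercise the new rank profile and document summary
    # against a local Vespa instance loaded with this application package.
    # The endpoint, query text and zero-filled embedding are placeholders.
    import requests

    VESPA_QUERY_URL = "http://localhost:8080/search/"  # hypothetical local endpoint

    body = {
        "yql": "select * from document_passage where userQuery()",
        "query": "adaptation finance",
        "hits": 10,
        "ranking.profile": "hybrid_custom_weight",
        "presentation.summary": "search_summary_with_tokens",
        # Placeholder embedding; a real caller would embed the query text into 768 dims.
        "input.query(query_embedding)": [0.0] * 768,
        # Per-request lexical weight consumed by the rank profile's first phase.
        "input.query(bm25_weight)": 0.5,
    }

    response = requests.post(VESPA_QUERY_URL, json=body, timeout=10)
    response.raise_for_status()
    for hit in response.json().get("root", {}).get("children", []):
        # match-features now include bm25(text_block) and closeness(text_embedding).
        print(hit["relevance"], hit.get("fields", {}).get("matchfeatures"))

Because the profile's first phase is attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)), raising or lowering input.query(bm25_weight) shifts a single request between lexical and embedding similarity, which is the point of declaring it as a query input rather than a constant in the profile.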
From b53e8a46ac0c12594b92de34bc62161a2def8581 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:40:22 +0000 Subject: [PATCH 15/26] Fix capitalisation --- app/repository/sql/download.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 15bbf8ac..68f77685 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -227,7 +227,7 @@ SELECT n3.event_type_names AS "Full timeline of events (types)", n3.event_dates AS "Full timeline of events (dates)", d.created::DATE AS "Date Added to System", - f.last_modified::DATE AS "Last ModIFied on System", + f.last_modified::DATE AS "Last Modified on System", d.import_id AS "Internal Document ID", f.import_id AS "Internal Family ID", n1.collection_import_ids AS "Internal Collection ID(s)", From a9bc84edeedbac69b224e6381a57f37a62eade9b Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:49:32 +0000 Subject: [PATCH 16/26] Fix capitalisation for MCF category --- app/repository/sql/download.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 68f77685..c5ab3ab4 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -237,6 +237,7 @@ SELECT type,0}') AS "Document Type", CASE WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + WHEN f.family_category = 'MCF' THEN 'MCF' ELSE INITCAP(f.family_category::TEXT) END AS "Category", ARRAY_TO_STRING( From 6ae9ee92f2fbdd477b44962898f5c19006bb529d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:50:14 +0000 Subject: [PATCH 17/26] Fix world map query counts --- app/repository/sql/world_map.sql | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/repository/sql/world_map.sql b/app/repository/sql/world_map.sql index 51307212..42db7a4d 100644 --- a/app/repository/sql/world_map.sql +++ b/app/repository/sql/world_map.sql @@ -5,11 +5,16 @@ WITH counts AS ( COUNT(*) AS records_count FROM family + INNER JOIN + family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id INNER JOIN family_geography ON family.import_id = family_geography.family_import_id WHERE - CASE + family_corpus.corpus_import_id = ANY(:allowed_corpora_ids) + AND CASE WHEN ( NOT ( EXISTS ( From 10b9dec1678c1dc4e9b63adadda511ceb00fa1be Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:02:52 +0000 Subject: [PATCH 18/26] Update world map stats repo function --- app/repository/geography.py | 110 ++++++++---------------------------- 1 file changed, 25 insertions(+), 85 deletions(-) diff --git a/app/repository/geography.py b/app/repository/geography.py index b035fd70..112257a5 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -1,22 +1,19 @@ """Functions to support the geographies endpoint.""" import logging +import os from typing import Optional, Sequence -from db_client.models.dfce.family import ( - Family, - FamilyDocument, - FamilyGeography, - FamilyStatus, -) +from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography from db_client.models.dfce.geography import Geography -from sqlalchemy import func -from sqlalchemy.dialects import postgresql 
+from sqlalchemy import bindparam, text from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session +from sqlalchemy.types import ARRAY, String -from app.errors import RepositoryError +from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO +from app.repository.helpers import get_query_template _LOGGER = logging.getLogger(__file__) @@ -66,85 +63,30 @@ def get_geo_subquery( def _db_count_fams_in_category_and_geo( db: Session, allowed_corpora: Optional[list[str]] -) -> Query: - """ - Query the database for the fam count per category per geo. - - NOTE: SqlAlchemy will make a complete hash of query generation if - columns are used in the query() call. Therefore, entire objects are - returned. +) -> list[GeographyStatsDTO]: + """Query the database for the family count per category per geo. :param Session db: DB Session to perform query on. :param Optional[list[str]] allowed_corpora: The list of allowed corpora IDs to filter on. - :return Query: A Query object containing the queries to perform. + :return list[GeographyStatsDTO]: A list of counts of families by + category per geography. """ - # Get the required Geography information and cross join each with all of the unique - # family_category values (so if some geographies have no documents for a particular - # family_category, we can set the count for that category to 0). - family_categories = db.query(Family.family_category).distinct().subquery() - geo_family_combinations = db.query( - Geography.id.label("geography_id"), - Geography.display_value, - Geography.slug, - Geography.value, - family_categories.c.family_category, - ).subquery("geo_family_combinations") - - # Get a count of documents in each present family_category for each geography. - counts = ( - db.query( - Family.family_category, - FamilyGeography.geography_id, - func.count().label("records_count"), - ) - .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) - # .join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) - # .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) - .filter(Family.family_status == FamilyStatus.PUBLISHED) - .group_by(Family.family_category, FamilyGeography.geography_id) - .subquery("counts") + if allowed_corpora in [None, []]: + raise ValidationError("No allowed corpora provided") + + query_template = text( + get_query_template(os.path.join("app", "repository", "sql", "world_map.sql")) ) - # if allowed_corpora is not None: - # counts = counts.where(Corpus.import_id.in_(allowed_corpora)) - # counts = counts.group_by( - # Family.family_category, FamilyGeography.geography_id - # ).subquery("counts") - # _LOGGER.info(counts) - - # Aggregate family_category counts per geography into a JSONB object, and if a - # family_category count is missing, set the count for that category to 0 so each - # geography will always have a count for all family_category values. 
- query = ( - db.query( - geo_family_combinations.c.display_value.label("display_value"), - geo_family_combinations.c.slug.label("slug"), - geo_family_combinations.c.value.label("value"), - func.jsonb_object_agg( - geo_family_combinations.c.family_category, - func.coalesce(counts.c.records_count, 0), - ).label("counts"), - ) - .select_from( - geo_family_combinations.join( - counts, - (geo_family_combinations.c.geography_id == counts.c.geography_id) - & ( - geo_family_combinations.c.family_category - == counts.c.family_category - ), - isouter=True, - ) - ) - .group_by( - geo_family_combinations.c.display_value, - geo_family_combinations.c.slug, - geo_family_combinations.c.value, - ) - .order_by(geo_family_combinations.c.display_value) + query_template = query_template.bindparams( + bindparam("allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)), ) - print(query.statement.compile(dialect=postgresql.dialect())) - return query + + family_geo_stats = db.execute( + query_template, {"allowed_corpora_ids": allowed_corpora} + ).all() + results = [_to_dto(fgs) for fgs in family_geo_stats] + return results def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: @@ -176,13 +118,11 @@ def get_world_map_stats( :return list[GeographyStatsDTO]: A list of Geography stats objects """ try: - family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora).all() + family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora) except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") if not family_geo_stats: return [] - - result = [_to_dto(fgs) for fgs in family_geo_stats] - return result + return family_geo_stats From b2030b40a93c8af47c82a10632733873b9fa6320 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:08:05 +0000 Subject: [PATCH 19/26] Move get_world_map_stats to service layer --- app/repository/geography.py | 33 +++++--------------------------- app/service/geography.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 28 deletions(-) create mode 100644 app/service/geography.py diff --git a/app/repository/geography.py b/app/repository/geography.py index 112257a5..3675b8b4 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -7,11 +7,10 @@ from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography from db_client.models.dfce.geography import Geography from sqlalchemy import bindparam, text -from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Query, Session from sqlalchemy.types import ARRAY, String -from app.errors import RepositoryError, ValidationError +from app.errors import ValidationError from app.models.geography import GeographyStatsDTO from app.repository.helpers import get_query_template @@ -61,14 +60,14 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _db_count_fams_in_category_and_geo( - db: Session, allowed_corpora: Optional[list[str]] +def _count_families_per_category_in_each_geo( + db: Session, allowed_corpora: list[str] ) -> list[GeographyStatsDTO]: """Query the database for the family count per category per geo. :param Session db: DB Session to perform query on. - :param Optional[list[str]] allowed_corpora: The list of allowed - corpora IDs to filter on. + :param list[str] allowed_corpora: The list of allowed corpora IDs to + filter on. 
:return list[GeographyStatsDTO]: A list of counts of families by category per geography. """ @@ -104,25 +103,3 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO: slug=family_doc_geo_stats.slug, family_counts=family_doc_geo_stats.counts, ) - - -def get_world_map_stats( - db: Session, allowed_corpora: Optional[list[str]] -) -> list[GeographyStatsDTO]: - """ - Get a count of fam per category per geography for all geographies. - - :param db Session: The database session. - :param Optional[list[str]] allowed_corpora: The list of allowed - corpora IDs to filter on. - :return list[GeographyStatsDTO]: A list of Geography stats objects - """ - try: - family_geo_stats = _db_count_fams_in_category_and_geo(db, allowed_corpora) - except OperationalError as e: - _LOGGER.error(e) - raise RepositoryError("Error querying the database for geography stats") - - if not family_geo_stats: - return [] - return family_geo_stats diff --git a/app/service/geography.py b/app/service/geography.py new file mode 100644 index 00000000..244bac0c --- /dev/null +++ b/app/service/geography.py @@ -0,0 +1,38 @@ +"""Functions to support the geographies endpoint.""" + +import logging +from typing import Optional + +from sqlalchemy.exc import OperationalError +from sqlalchemy.orm import Session + +from app.errors import RepositoryError, ValidationError +from app.models.geography import GeographyStatsDTO +from app.repository.geography import _count_families_per_category_in_each_geo + +_LOGGER = logging.getLogger(__file__) + + +def get_world_map_stats( + db: Session, allowed_corpora: Optional[list[str]] +) -> list[GeographyStatsDTO]: + """ + Get a count of fam per category per geography for all geographies. + + :param db Session: The database session. + :param Optional[list[str]] allowed_corpora: The list of allowed + corpora IDs to filter on. + :return list[GeographyStatsDTO]: A list of Geography stats objects + """ + if allowed_corpora is None or allowed_corpora == []: + raise ValidationError("No allowed corpora provided") + + try: + family_geo_stats = _count_families_per_category_in_each_geo(db, allowed_corpora) + except OperationalError as e: + _LOGGER.error(e) + raise RepositoryError("Error querying the database for geography stats") + + if not family_geo_stats: + return [] + return family_geo_stats From e473a433ad2a391f4149903f2fcfd8e347c0232f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:09:35 +0000 Subject: [PATCH 20/26] Rename function and add world map service --- app/repository/geography.py | 2 +- app/service/{geography.py => world_map.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename app/service/{geography.py => world_map.py} (86%) diff --git a/app/repository/geography.py b/app/repository/geography.py index 3675b8b4..5b8c10b6 100644 --- a/app/repository/geography.py +++ b/app/repository/geography.py @@ -60,7 +60,7 @@ def get_geo_subquery( return geo_subquery.subquery("geo_subquery") -def _count_families_per_category_in_each_geo( +def count_families_per_category_in_each_geo( db: Session, allowed_corpora: list[str] ) -> list[GeographyStatsDTO]: """Query the database for the family count per category per geo. 
diff --git a/app/service/geography.py b/app/service/world_map.py similarity index 86% rename from app/service/geography.py rename to app/service/world_map.py index 244bac0c..dc8735ce 100644 --- a/app/service/geography.py +++ b/app/service/world_map.py @@ -8,7 +8,7 @@ from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO -from app.repository.geography import _count_families_per_category_in_each_geo +from app.repository.geography import count_families_per_category_in_each_geo _LOGGER = logging.getLogger(__file__) @@ -28,7 +28,7 @@ def get_world_map_stats( raise ValidationError("No allowed corpora provided") try: - family_geo_stats = _count_families_per_category_in_each_geo(db, allowed_corpora) + family_geo_stats = count_families_per_category_in_each_geo(db, allowed_corpora) except OperationalError as e: _LOGGER.error(e) raise RepositoryError("Error querying the database for geography stats") From 83dc43e339f80493bde116a235cc89b3de6285a3 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:10:45 +0000 Subject: [PATCH 21/26] Import from world map service --- app/api/api_v1/routers/geographies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/geographies.py index b92084d3..1ec466df 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/geographies.py @@ -6,8 +6,8 @@ from app.clients.db.session import get_db from app.errors import RepositoryError from app.models.geography import GeographyStatsDTO -from app.repository.geography import get_world_map_stats from app.service.custom_app import AppTokenFactory +from app.service.world_map import get_world_map_stats _LOGGER = logging.getLogger(__file__) @@ -18,7 +18,7 @@ async def geographies( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): - """Get a summary of fam stats for all geographies for world map.""" + """Get a summary of family stats for all geographies for world map.""" _LOGGER.info( "Getting world map counts for all geographies", extra={ From ab8c7019814d180a3989a723136f5f4cbe37b67d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:46:46 +0000 Subject: [PATCH 22/26] Update counts --- .../routers/geographies/test_world_map_summary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index c60c6d48..be24ff3a 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -85,7 +85,7 @@ def test_endpoint_returns_ok_all_docs_per_family_published( @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 0, 0, 2), + ("India", 1, 1, 3), ("Afghanistan", 0, 0, 2), ], ) @@ -152,7 +152,7 @@ def test_endpoint_returns_503_when_error(data_client, valid_token): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 0, 0, 1), + ("India", 1, 1, 1), ("Afghanistan", 0, 0, 1), ], ) @@ -191,7 +191,7 @@ def test_endpoint_returns_different_results_with_alt_token( .filter(Geography.display_value == geo_display_value) .join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id) 
.join(FamilyCorpus, Family.import_id == FamilyCorpus.family_import_id) - .join(Corpus, Corpus.import_id == FamilyCorpus.family_import_id) + .join(Corpus, Corpus.import_id == FamilyCorpus.corpus_import_id) .join(Geography, Geography.id == FamilyGeography.geography_id) .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000") .all() From 9078770b61f123d5fc8b08404ef3686d23a7e81e Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:48:48 +0000 Subject: [PATCH 23/26] Fix counts --- tests/non_search/routers/geographies/test_world_map_summary.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index be24ff3a..a53e1bdc 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -152,7 +152,7 @@ def test_endpoint_returns_503_when_error(data_client, valid_token): @pytest.mark.parametrize( ("geo_display_value", "expected_exec", "expected_leg", "expected_unfccc"), [ - ("India", 1, 1, 1), + ("India", 0, 0, 1), ("Afghanistan", 0, 0, 1), ], ) @@ -175,7 +175,6 @@ def test_endpoint_returns_different_results_with_alt_token( .one() ) assert fam - print(fam) resp_json = _make_world_map_lookup_request(data_client, alternative_token) assert len(resp_json) > 1 From b2a3671d7e2bdc8d9c35da1cbade8e4312fa186d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:04:36 +0000 Subject: [PATCH 24/26] Rename router world map --- .../api_v1/routers/{geographies.py => world_map.py} | 10 +++++++--- app/main.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) rename app/api/api_v1/routers/{geographies.py => world_map.py} (80%) diff --git a/app/api/api_v1/routers/geographies.py b/app/api/api_v1/routers/world_map.py similarity index 80% rename from app/api/api_v1/routers/geographies.py rename to app/api/api_v1/routers/world_map.py index 1ec466df..219b14ff 100644 --- a/app/api/api_v1/routers/geographies.py +++ b/app/api/api_v1/routers/world_map.py @@ -4,17 +4,17 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from app.clients.db.session import get_db -from app.errors import RepositoryError +from app.errors import RepositoryError, ValidationError from app.models.geography import GeographyStatsDTO from app.service.custom_app import AppTokenFactory from app.service.world_map import get_world_map_stats _LOGGER = logging.getLogger(__file__) -geographies_router = APIRouter() +world_map_router = APIRouter() -@geographies_router.get("/geographies", response_model=list[GeographyStatsDTO]) +@world_map_router.get("/geographies", response_model=list[GeographyStatsDTO]) async def geographies( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): @@ -42,6 +42,10 @@ async def geographies( return world_map_stats except RepositoryError as e: + _LOGGER.error(e) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=e.message ) + except ValidationError as e: + _LOGGER.error(e) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=e.message) diff --git a/app/main.py b/app/main.py index e3e1296f..09438b37 100644 --- a/app/main.py +++ b/app/main.py @@ -16,11 +16,11 @@ from app.api.api_v1.routers.admin import admin_document_router from app.api.api_v1.routers.auth import auth_router from 
app.api.api_v1.routers.documents import documents_router -from app.api.api_v1.routers.geographies import geographies_router from app.api.api_v1.routers.lookups import lookups_router from app.api.api_v1.routers.pipeline_trigger import pipeline_trigger_router from app.api.api_v1.routers.search import search_router from app.api.api_v1.routers.summaries import summary_router +from app.api.api_v1.routers.world_map import world_map_router from app.clients.db.session import SessionLocal, engine from app.service.auth import get_superuser_details from app.service.health import is_database_online @@ -158,7 +158,7 @@ async def root(): summary_router, prefix="/api/v1", tags=["Summaries"], include_in_schema=False ) app.include_router( - geographies_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False + world_map_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False ) # add pagination support to all routes that ask for it From 1df2e9e08fc06717d275cc1da51250efd946026e Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:44:49 +0000 Subject: [PATCH 25/26] PR comment fixes --- .../non_search/routers/geographies/test_world_map_summary.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/non_search/routers/geographies/test_world_map_summary.py b/tests/non_search/routers/geographies/test_world_map_summary.py index a53e1bdc..3155a94b 100644 --- a/tests/non_search/routers/geographies/test_world_map_summary.py +++ b/tests/non_search/routers/geographies/test_world_map_summary.py @@ -50,7 +50,6 @@ def test_endpoint_returns_ok_all_docs_per_family_published( expected_unfccc, valid_token, ): - """Check endpoint returns 200 on success""" setup_all_docs_published_world_map(data_db) resp_json = _make_world_map_lookup_request(data_client, valid_token) @@ -165,7 +164,7 @@ def test_endpoint_returns_different_results_with_alt_token( expected_unfccc, alternative_token, ): - """Check endpoint returns 200 & only counts CCLW docs""" + """Check endpoint returns 200 & only counts UNFCCC docs""" setup_all_docs_published_world_map(data_db) fam = ( @@ -195,7 +194,6 @@ def test_endpoint_returns_different_results_with_alt_token( .filter(Corpus.import_id == "UNFCCC.corpus.i00000001.n0000") .all() ) - print(fams) assert len(resp["family_counts"]) == EXPECTED_NUM_FAM_CATEGORIES assert sum(resp["family_counts"].values()) == len(fams) From eeeb7e109d3ca26514f347ed9e1533eaa707d42f Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:45:10 +0000 Subject: [PATCH 26/26] Make representative of entity --- app/api/api_v1/routers/world_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/api/api_v1/routers/world_map.py b/app/api/api_v1/routers/world_map.py index 219b14ff..78e7fa4c 100644 --- a/app/api/api_v1/routers/world_map.py +++ b/app/api/api_v1/routers/world_map.py @@ -15,10 +15,10 @@ @world_map_router.get("/geographies", response_model=list[GeographyStatsDTO]) -async def geographies( +async def world_map_stats( request: Request, app_token: Annotated[str, Header()], db=Depends(get_db) ): - """Get a summary of family stats for all geographies for world map.""" + """Get a summary of family counts for all geographies for world map.""" _LOGGER.info( "Getting world map counts for all geographies", extra={