climatepolicyradar · annaCPR · Jan 6, 2025 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
@@ -4,7 +4,7 @@
 
 from app.api.api_v1.routers.lookups.router import lookups_router
 from app.clients.db.session import get_db
-from app.models.metadata import ApplicationConfig
+from app.models.config import ApplicationConfig
 from app.repository.lookups import get_config
 from app.service.custom_app import AppTokenFactory
 

@@ -25,10 +25,37 @@ class OrganisationConfig(BaseModel):
     count_by_category: Mapping[str, int]
 
 
+class CorpusConfig(BaseModel):
+    """
+    Contains the Corpus and Organisation info as well as stats used on homepage.
+
+    One instance describes a single corpus: its own descriptive fields, the
+    organisation that owns it, and the number of families it contains (in
+    total and broken down by family category).
+    """
+
+    # From corpus
+    corpus_import_id: str
+    title: str
+    description: str
+    # Required plain strings (not Optional): a corpus without an image or
+    # text must be given "" rather than None.
+    image_url: str
+    text: str
+    # From organisation
+    organisation_name: str
+    organisation_id: int
+    # No of families in corpus
+    total: int
+    # Family count per category name
+    count_by_category: Mapping[str, int]
+
+
+class CorpusTypeConfig(BaseModel):
+    """
+    Contains the CorpusType info as well as data of any corpora of that type.
+
+    ApplicationConfig exposes these through its `corpus_types` mapping.
+    """
+
+    # Descriptive fields of the corpus type itself
+    corpus_type_name: str
+    corpus_type_description: str
+    # Taxonomy (valid metadata) that applies to corpora of this type
+    taxonomy: TaxonomyData
+    # Config for each corpus of this type
+    corpora: Sequence[CorpusConfig]
+
+
 class ApplicationConfig(BaseModel):
     """Definition of the new Config which just includes taxonomy."""
 
     geographies: Sequence[dict]
     organisations: Mapping[str, OrganisationConfig]
     languages: Mapping[str, str]
     document_variants: Sequence[str]
+    corpus_types: Mapping[str, CorpusTypeConfig]
@@ -0,0 +1,108 @@
+from typing import Any, Mapping
+
+from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus
+from db_client.models.organisation import CorpusType, Organisation
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from app import config
+from app.models.config import CorpusConfig, CorpusTypeConfig
+
+
+def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]:
+    """Count the families linked to a corpus, overall and per category.
+
+    :param Session db: database session used to run the two count queries.
+    :param str corpus_import_id: import ID of the corpus to count families for.
+    :return dict[str, Any]: "total" holds the number of families linked to
+        the corpus; "count_by_category" maps every FamilyCategory value to
+        its family count, with 0 for categories that have no families.
+    """
+    family_total = (
+        db.query(Family)
+        .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id)
+        .filter(FamilyCorpus.corpus_import_id == corpus_import_id)
+        .count()
+    )
+
+    category_rows = (
+        db.query(Family.family_category, func.count())
+        .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id)
+        .filter(FamilyCorpus.corpus_import_id == corpus_import_id)
+        .group_by(Family.family_category)
+        .all()
+    )
+    counted = {category.value: number for category, number in category_rows}
+
+    # The grouped query only returns categories that have at least one
+    # family, so walk the full enum and supply zeros for the rest.
+    count_by_category = {
+        category.value: counted.get(category.value, 0) for category in FamilyCategory
+    }
+
+    return {"total": family_total, "count_by_category": count_by_category}
+
+
+def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeConfig]:
+    """Wrap one corpus query row and its family stats in a CorpusTypeConfig.
+
+    :param row: result row exposing the corpus, corpus type and organisation
+        attributes read below.
+    :param dict[str, Any] stats: must provide "total" and "count_by_category".
+    :return dict[str, CorpusTypeConfig]: a single-entry dict keyed by
+        row.corpus_type whose config lists exactly this one corpus.
+    """
+    # The stored image value is prefixed with the CDN domain; a missing or
+    # empty value yields an empty string instead of a URL.
+    if row.image_url is not None and len(row.image_url) > 0:
+        image_url = f"https://{config.CDN_DOMAIN}/{row.image_url}"
+    else:
+        image_url = ""
+
+    # CorpusConfig.text is a required string, so None becomes "".
+    corpus_text = "" if row.text is None else row.text
+
+    corpus = CorpusConfig(
+        title=row.title,
+        description=row.description,
+        corpus_import_id=row.corpus_import_id,
+        text=corpus_text,
+        image_url=image_url,
+        organisation_id=row.organisation_id,
+        organisation_name=row.organisation_name,
+        total=stats["total"],
+        count_by_category=stats["count_by_category"],
+    )
+    corpus_type_config = CorpusTypeConfig(
+        corpus_type_name=row.corpus_type_name,
+        corpus_type_description=row.corpus_type_description,
+        taxonomy={**row.taxonomy},
+        corpora=[corpus],
+    )
+    return {row.corpus_type: corpus_type_config}
+
+
+def _get_config_for_corpus(db: Session, row) -> dict[str, CorpusTypeConfig]:
+    """Look up the family stats for the row's corpus and build its config.
+
+    :param Session db: database session used for the stats queries.
+    :param row: corpus query row; its corpus_import_id selects the corpus.
+    :return dict[str, CorpusTypeConfig]: single-entry dict for this corpus.
+    """
+    return _to_corpus_type_config(
+        row, _get_family_stats_per_corpus(db, row.corpus_import_id)
+    )
+
+
+def get_config_for_allowed_corpora(
+    db: Session, allowed_corpora: list[str]
+) -> Mapping[str, CorpusTypeConfig]:
+    """Build the corpus type config for the corpora an app is allowed to see.
+
+    Each matching corpus is joined to its corpus type and organisation and
+    enriched with family counts. Corpora that share a corpus type are
+    collected under a single entry, so every matching corpus is present in
+    the result.
+
+    :param Session db: database session used for all queries.
+    :param list[str] allowed_corpora: corpus import IDs to include. When
+        empty (or None) no filter is applied and every corpus is returned.
+    :return Mapping[str, CorpusTypeConfig]: config keyed by corpus type name.
+        No ORDER BY is applied, so entries follow the order in which the
+        database returns the rows.
+    """
+    query = (
+        db.query(
+            Corpus.import_id.label("corpus_import_id"),
+            Corpus.title.label("title"),
+            Corpus.description.label("description"),
+            Corpus.corpus_image_url.label("image_url"),
+            Corpus.corpus_text.label("text"),
+            Corpus.corpus_type_name.label("corpus_type"),
+            CorpusType.name.label("corpus_type_name"),
+            CorpusType.description.label("corpus_type_description"),
+            CorpusType.valid_metadata.label("taxonomy"),
+            Organisation.id.label("organisation_id"),
+            Organisation.name.label("organisation_name"),
+        )
+        .join(
+            CorpusType,
+            Corpus.corpus_type_name == CorpusType.name,
+        )
+        .join(Organisation, Corpus.organisation_id == Organisation.id)
+    )
+    # An empty allow-list means "no restriction", not "match nothing".
+    if allowed_corpora:
+        query = query.filter(Corpus.import_id.in_(allowed_corpora))
+
+    config_for_allowed_corpora: dict[str, CorpusTypeConfig] = {}
+    for row in query.all():
+        for type_name, type_config in _get_config_for_corpus(db, row).items():
+            existing = config_for_allowed_corpora.get(type_name)
+            if existing is None:
+                config_for_allowed_corpora[type_name] = type_config
+            else:
+                # Several corpora can share one corpus type. Extend the list
+                # instead of replacing the entry, otherwise only the last
+                # corpus of each type would survive.
+                existing.corpora = [*existing.corpora, *type_config.corpora]
+
+    return config_for_allowed_corpora
@@ -7,7 +7,8 @@
 from sqlalchemy.exc import MultipleResultsFound
 from sqlalchemy.orm import Session
 
-from app.models.metadata import ApplicationConfig
+from app.models.config import ApplicationConfig
+from app.repository.corpus import get_config_for_allowed_corpora
 from app.repository.organisation import get_organisation_config, get_organisations
 from app.service.pipeline import IMPORT_ID_MATCHER
 from app.service.util import tree_table_to_json
@@ -28,6 +29,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig:
             variant.variant_name
             for variant in db.query(Variant).order_by(Variant.variant_name).all()
         ],
+        corpus_types=get_config_for_allowed_corpora(db, allowed_corpora),
     )
 
 

@@ -6,7 +6,7 @@
 from sqlalchemy.orm import Session
 
 from app import config
-from app.models.metadata import CorpusData, OrganisationConfig
+from app.models.config import CorpusData, OrganisationConfig
 
 
 def _to_corpus_data(row) -> CorpusData:

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "navigator_backend"
-version = "1.19.21"
+version = "1.20.0"
 description = ""
 authors = ["CPR-dev-team <[email protected]>"]
 packages = [{ include = "app" }, { include = "tests" }]

@@ -76,27 +76,27 @@ def _add_family(test_db, import_id: str, cat: FamilyCategory, corpus_import_id):
     )
 
 
-def test_config_endpoint_content(data_client, data_db, valid_token):
+def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_token):
     """Tests whether we get the expected content when the /config endpoint is called."""
     # TODO: this test is fragile, we should look into validation according to the
     #       supporting data, rather than counts & fixed lists
     url_under_test = "/api/v1/config"
+    app_token = app_token_factory(
+        "CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000"
+    )
 
-    response = data_client.get(url_under_test, headers={"app-token": valid_token})
+    response = data_client.get(url_under_test, headers={"app-token": app_token})
 
     response_json = response.json()
 
     assert response.status_code == OK
-    assert (
-        set(response_json.keys())
-        ^ {
-            "geographies",
-            "organisations",
-            "document_variants",
-            "languages",
-        }
-        == set()
-    )
+    assert set(response_json.keys()) == {
+        "geographies",
+        "organisations",
+        "document_variants",
+        "languages",
+        "corpus_types",
+    }
 
     assert "geographies" in response_json
     assert len(response_json["geographies"]) == 8
@@ -111,9 +111,51 @@ def test_config_endpoint_content(data_client, data_db, valid_token):
     assert len(response_json["document_variants"]) == 2
     assert "Original Language" in response_json["document_variants"]
 
-    # Now test organisations
-    assert "organisations" in response_json
+    corpus_types = response_json["corpus_types"]
+    assert list(corpus_types.keys()) == ["Laws and Policies", "Intl. agreements"]
+
+    laws_and_policies = corpus_types["Laws and Policies"]
+    assert laws_and_policies["corpus_type_name"] == "Laws and Policies"
+    assert laws_and_policies["corpus_type_description"] == "Laws and policies"
+
+    taxonomy = laws_and_policies["taxonomy"]
+    assert set(taxonomy) ^ EXPECTED_CCLW_TAXONOMY == set()
+    # Check document roles.
+    assert "role" in taxonomy["_document"].keys()
+    assert len(taxonomy["_document"]["role"]["allowed_values"]) == 10
+    assert "MAIN" in taxonomy["_document"]["role"]["allowed_values"]
+    # Check document types.
+    assert "type" in taxonomy["_document"].keys()
+    assert len(taxonomy["_document"]["type"]["allowed_values"]) == 76
+    assert "Adaptation Communication" in taxonomy["_document"]["type"]["allowed_values"]
+    # Check event types.
+    assert len(taxonomy["_event"]["event_type"]["allowed_values"]) == 17
+    assert "Passed/Approved" in taxonomy["_event"]["event_type"]["allowed_values"]
+
+    assert len(laws_and_policies["corpora"]) == 1
+    cclw_corpus = laws_and_policies["corpora"][0]
+
+    assert cclw_corpus["total"] == 0
+    assert cclw_corpus["count_by_category"] == {
+        "Executive": 0,
+        "Legislative": 0,
+        "UNFCCC": 0,
+        "MCF": 0,
+    }
+
+    assert cclw_corpus["corpus_import_id"] == "CCLW.corpus.i00000001.n0000"
+    assert cclw_corpus["organisation_name"] == "CCLW"
+    assert cclw_corpus["organisation_id"] == 1
+    assert (
+        cclw_corpus["image_url"]
+        == "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png"
+    )
+    assert "Grantham Research Institute" in cclw_corpus["text"]
+    assert cclw_corpus["description"] == "CCLW national policies"
+    assert cclw_corpus["title"] == "CCLW national policies"
 
+    # Below to be removed as part of PDCT-1759
+    # Now test organisations
     assert "CCLW" in response_json["organisations"]
     cclw_org = response_json["organisations"]["CCLW"]
     assert len(cclw_org) == LEN_ORG_CONFIG
@@ -198,6 +240,20 @@ def test_config_endpoint_cclw_stats(data_client, data_db, valid_token):
 
     response_json = response.json()
 
+    corpus_types = response_json["corpus_types"]
+    assert len(corpus_types) == 2
+
+    cclw_corpus_config = corpus_types["Laws and Policies"]["corpora"][0]
+    laws = cclw_corpus_config["count_by_category"]["Legislative"]
+    policies = cclw_corpus_config["count_by_category"]["Executive"]
+    unfccc = cclw_corpus_config["count_by_category"]["UNFCCC"]
+    assert laws == 2
+    assert policies == 3
+    assert unfccc == 1
+
+    assert cclw_corpus_config["total"] == laws + policies + unfccc
+
+    # Below to be removed as part of PDCT-1759
     org_config = response_json["organisations"]["CCLW"]
     assert len(org_config) == LEN_ORG_CONFIG
     assert org_config["total"] == 6
@@ -259,6 +315,18 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only(
 
     response_json = response.json()
 
+    assert len(response_json["corpus_types"]) == 1
+
+    corpus = response_json["corpus_types"][expected_corpus_type.name]["corpora"][0]
+    assert corpus["total"] == 1
+    assert corpus["count_by_category"] == {
+        "Executive": 0,
+        "Legislative": 1,
+        "MCF": 0,
+        "UNFCCC": 0,
+    }
+
+    #  Below to be removed as part of PDCT-1759
     org_config = response_json["organisations"]
     expected_org_config = {
         expected_organisation: {
@@ -285,7 +353,7 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only(
                 "MCF": 0,
                 "UNFCCC": 0,
             },
-        }
+        },
     }
     assert org_config == expected_org_config
 
@@ -323,20 +391,35 @@ def test_config_endpoint_returns_stats_for_all_orgs_if_no_allowed_corpora_in_app
     )
 
     _add_family(data_db, "T.0.0.1", FamilyCategory.EXECUTIVE, cclw_corpus.import_id)
-    _add_family(data_db, "T.0.0.2", FamilyCategory.LEGISLATIVE, unfccc_corpus.import_id)
+    _add_family(data_db, "T.0.0.2", FamilyCategory.EXECUTIVE, unfccc_corpus.import_id)
     data_db.flush()
 
     response = data_client.get(url_under_test, headers={"app-token": app_token})
 
     response_json = response.json()
+
+    assert len(response_json["corpus_types"]) == 2
+    corpus_types = response_json["corpus_types"]
+
+    for corpus_type in list(corpus_types.values()):
+        for corpus in corpus_type["corpora"]:
+            assert corpus["total"] == 1
+            assert corpus["count_by_category"] == {
+                "Executive": 1,
+                "Legislative": 0,
+                "MCF": 0,
+                "UNFCCC": 0,
+            }
+
+    #  Below to be removed as part of PDCT-1759
     org_config = response_json["organisations"]
 
     assert list(org_config.keys()) == ["CCLW", "UNFCCC"]
     assert org_config["CCLW"]["total"] == 1
     assert org_config["UNFCCC"]["total"] == 1
     assert org_config["UNFCCC"]["count_by_category"] == {
-        "Executive": 0,
-        "Legislative": 1,
+        "Executive": 1,
+        "Legislative": 0,
         "MCF": 0,
         "UNFCCC": 0,
     }