diff --git a/app/api/api_v1/routers/lookups/config.py b/app/api/api_v1/routers/lookups/config.py index 71981ca8..d2151839 100644 --- a/app/api/api_v1/routers/lookups/config.py +++ b/app/api/api_v1/routers/lookups/config.py @@ -4,7 +4,7 @@ from app.api.api_v1.routers.lookups.router import lookups_router from app.clients.db.session import get_db -from app.models.metadata import ApplicationConfig +from app.models.config import ApplicationConfig from app.repository.lookups import get_config from app.service.custom_app import AppTokenFactory diff --git a/app/models/metadata.py b/app/models/config.py similarity index 54% rename from app/models/metadata.py rename to app/models/config.py index ab278085..2f8c2210 100644 --- a/app/models/metadata.py +++ b/app/models/config.py @@ -25,6 +25,32 @@ class OrganisationConfig(BaseModel): count_by_category: Mapping[str, int] +class CorpusConfig(BaseModel): + """Contains the Corpus and Organisation info as well as stats used on homepage""" + + # From corpus + corpus_import_id: str + title: str + description: str + image_url: str + text: str + # From organisation + organisation_name: str + organisation_id: int + # No of families in corpus + total: int + count_by_category: Mapping[str, int] + + +class CorpusTypeConfig(BaseModel): + """Contains the CorpusType info as well as data of any corpora of that type""" + + corpus_type_name: str + corpus_type_description: str + taxonomy: TaxonomyData + corpora: Sequence[CorpusConfig] + + class ApplicationConfig(BaseModel): """Definition of the new Config which just includes taxonomy.""" @@ -32,3 +58,4 @@ class ApplicationConfig(BaseModel): organisations: Mapping[str, OrganisationConfig] languages: Mapping[str, str] document_variants: Sequence[str] + corpus_types: Mapping[str, CorpusTypeConfig] diff --git a/app/repository/corpus.py b/app/repository/corpus.py new file mode 100644 index 00000000..4de0061f --- /dev/null +++ b/app/repository/corpus.py @@ -0,0 +1,51 @@ +from 
db_client.models.dfce.family import Corpus, Family, FamilyCorpus +from sqlalchemy import func +from sqlalchemy.orm import Session + + +def get_total_families_per_corpus(db: Session, corpus_import_id: str) -> int: + """ + Get the total number of families per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: The total number of families per corpus + """ + return ( + db.query(Family) + .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) + .filter(FamilyCorpus.corpus_import_id == corpus_import_id) + .count() + ) + + +def get_family_count_by_category_per_corpus(db: Session, corpus_import_id: str): + """ + Get the count of families by category per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: A list of tuples where each tuple contains a family category and its count + """ + return ( + db.query(Family.family_category, func.count()) + .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) + .filter(FamilyCorpus.corpus_import_id == corpus_import_id) + .group_by(Family.family_category) + .all() + ) + + +def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[Corpus]: + """ + Get the allowed corpora. + + :param db: Database session + :param allowed_corpora: A list of allowed corpora + :return: A list of Corpus objects that are allowed + """ + query = db.query(Corpus) + if allowed_corpora != []: + query = query.filter(Corpus.import_id.in_(allowed_corpora)) + + return query.all() diff --git a/app/repository/corpus_type.py b/app/repository/corpus_type.py new file mode 100644 index 00000000..86ae648c --- /dev/null +++ b/app/repository/corpus_type.py @@ -0,0 +1,13 @@ +from db_client.models.organisation import CorpusType +from sqlalchemy.orm import Session + + +def get(db: Session, corpus_type_name: str) -> CorpusType: + """ + Get a CorpusType object based on its name. 
+ + :param db: Database session + :param corpus_type_name: The name of the corpus type + :return: A CorpusType object + """ + return db.query(CorpusType).filter(CorpusType.name == corpus_type_name).one() diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 38d0809f..5b33afe2 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -7,8 +7,9 @@ from sqlalchemy.exc import MultipleResultsFound from sqlalchemy.orm import Session -from app.models.metadata import ApplicationConfig +from app.models.config import ApplicationConfig from app.repository.organisation import get_organisation_config, get_organisations +from app.service.config import get_corpus_type_config_for_allowed_corpora from app.service.pipeline import IMPORT_ID_MATCHER from app.service.util import tree_table_to_json @@ -16,7 +17,6 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: - # First get the CCLW stats return ApplicationConfig( geographies=tree_table_to_json(table=Geography, db=db), organisations={ @@ -28,6 +28,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], + corpus_types=get_corpus_type_config_for_allowed_corpora(db, allowed_corpora), ) diff --git a/app/repository/organisation.py b/app/repository/organisation.py index f2947066..ba45c1a5 100644 --- a/app/repository/organisation.py +++ b/app/repository/organisation.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import Session from app import config -from app.models.metadata import CorpusData, OrganisationConfig +from app.models.config import CorpusData, OrganisationConfig def _to_corpus_data(row) -> CorpusData: @@ -94,3 +94,7 @@ def get_organisations(db: Session, allowed_corpora: list[str]) -> list[Organisat if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) return query.all() + + +def get(db: Session, org_id: int) -> 
Organisation: + return db.query(Organisation).filter(Organisation.id == org_id).one() diff --git a/app/service/config.py b/app/service/config.py new file mode 100644 index 00000000..f03e72be --- /dev/null +++ b/app/service/config.py @@ -0,0 +1,122 @@ +from typing import Any, Mapping, cast + +from db_client.models.dfce.family import FamilyCategory +from db_client.models.organisation import Corpus, CorpusType, Organisation +from sqlalchemy.orm import Session + +from app import config +from app.models.config import CorpusConfig, CorpusTypeConfig +from app.repository import corpus_type as corpus_type_repo +from app.repository import organisation as org_repo +from app.repository.corpus import ( + get_allowed_corpora, + get_family_count_by_category_per_corpus, + get_total_families_per_corpus, +) + + +def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: + """ + Get family statistics per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: A dictionary containing total families and count by category + """ + total = get_total_families_per_corpus(db, corpus_import_id) + + counts = get_family_count_by_category_per_corpus(db, corpus_import_id) + found_categories = {c[0].value: c[1] for c in counts} + count_by_category = {} + + # Supply zeros when there aren't any + for category in [e.value for e in FamilyCategory]: + if category in found_categories.keys(): + count_by_category[category] = found_categories[category] + else: + count_by_category[category] = 0 + + return {"total": total, "count_by_category": count_by_category} + + +def _to_corpus_type_config( + corpus: Corpus, + corpus_type: CorpusType, + organisation: Organisation, + stats: dict[str, Any], +) -> CorpusTypeConfig: + """ + Convert corpus, corpus type, organisation, and stats to CorpusTypeConfig. 
+ + :param corpus: Corpus object + :param corpus_type: CorpusType object + :param organisation: Organisation object + :param stats: A dictionary containing statistics + :return: A CorpusTypeConfig object + """ + image_url = ( + f"https://{config.CDN_DOMAIN}/{corpus.corpus_image_url}" + if corpus.corpus_image_url is not None and len(str(corpus.corpus_image_url)) > 0 + else "" + ) + corpus_text = corpus.corpus_text if corpus.corpus_text is not None else "" + return CorpusTypeConfig( + corpus_type_name=str(corpus_type.name), + corpus_type_description=str(corpus_type.description), + taxonomy={**cast(dict, corpus_type.valid_metadata)}, + corpora=[ + CorpusConfig( + title=str(corpus.title), + description=str(corpus.description), + corpus_import_id=str(corpus.import_id), + text=str(corpus_text), + image_url=image_url, + organisation_id=int(str(organisation.id)), + organisation_name=str(organisation.name), + total=stats["total"], + count_by_category=stats["count_by_category"], + ) + ], + ) + + +def _get_config_for_corpus_type( + db: Session, corpus: Corpus +) -> dict[str, CorpusTypeConfig]: + """ + Get configuration for a corpus type. + + :param db: Database session + :param corpus: Corpus object + :return: A dictionary containing CorpusTypeConfig + """ + stats = _get_family_stats_per_corpus(db, str(corpus.import_id)) + corpus_type = corpus_type_repo.get(db, str(corpus.corpus_type_name)) + organisation = org_repo.get(db, int(str(corpus.organisation_id))) + return { + str(corpus_type.name): _to_corpus_type_config( + corpus, corpus_type, organisation, stats + ) + } + + +def get_corpus_type_config_for_allowed_corpora( + db: Session, allowed_corpora: list[str] +) -> Mapping[str, CorpusTypeConfig]: + """ + Get CorpusTypeConfig for allowed corpora. 
+ + :param db: Database session + :param allowed_corpora: A list of allowed corpora + :return: A mapping of CorpusTypeConfig for allowed corpora + """ + corpora = get_allowed_corpora(db, allowed_corpora) + + configs_for_each_allowed_corpus = ( + _get_config_for_corpus_type(db, corpus) for corpus in corpora + ) + corpus_type_config_for_allowed_corpora = { + k: v for config in configs_for_each_allowed_corpus for k, v in config.items() + } + + return corpus_type_config_for_allowed_corpora diff --git a/pyproject.toml b/pyproject.toml index b72f5506..3351683c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.22" +version = "1.20.0" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 65fc56ce..5e3f84a1 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -76,27 +76,27 @@ def _add_family(test_db, import_id: str, cat: FamilyCategory, corpus_import_id): ) -def test_config_endpoint_content(data_client, data_db, valid_token): +def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_token): """Tests whether we get the expected content when the /config endpoint is called.""" # TODO: this test is fragile, we should look into validation according to the # supporting data, rather than counts & fixed lists url_under_test = "/api/v1/config" + app_token = app_token_factory( + "CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000" + ) - response = data_client.get(url_under_test, headers={"app-token": valid_token}) + response = data_client.get(url_under_test, headers={"app-token": app_token}) response_json = response.json() assert response.status_code == OK - assert ( - set(response_json.keys()) - ^ { - "geographies", - "organisations", - "document_variants", 
- "languages", - } - == set() - ) + assert set(response_json.keys()) == { + "geographies", + "organisations", + "document_variants", + "languages", + "corpus_types", + } assert "geographies" in response_json assert len(response_json["geographies"]) == 8 @@ -111,9 +111,51 @@ def test_config_endpoint_content(data_client, data_db, valid_token): assert len(response_json["document_variants"]) == 2 assert "Original Language" in response_json["document_variants"] - # Now test organisations - assert "organisations" in response_json + corpus_types = response_json["corpus_types"] + assert list(corpus_types.keys()) == ["Laws and Policies", "Intl. agreements"] + + laws_and_policies = corpus_types["Laws and Policies"] + assert laws_and_policies["corpus_type_name"] == "Laws and Policies" + assert laws_and_policies["corpus_type_description"] == "Laws and policies" + + taxonomy = laws_and_policies["taxonomy"] + assert set(taxonomy) ^ EXPECTED_CCLW_TAXONOMY == set() + # Check document roles. + assert "role" in taxonomy["_document"].keys() + assert len(taxonomy["_document"]["role"]["allowed_values"]) == 10 + assert "MAIN" in taxonomy["_document"]["role"]["allowed_values"] + # Check document types. + assert "type" in taxonomy["_document"].keys() + assert len(taxonomy["_document"]["type"]["allowed_values"]) == 76 + assert "Adaptation Communication" in taxonomy["_document"]["type"]["allowed_values"] + # Check event types. 
+ assert len(taxonomy["_event"]["event_type"]["allowed_values"]) == 17 + assert "Passed/Approved" in taxonomy["_event"]["event_type"]["allowed_values"] + + assert len(laws_and_policies["corpora"]) == 1 + cclw_corpus = laws_and_policies["corpora"][0] + + assert cclw_corpus["total"] == 0 + assert cclw_corpus["count_by_category"] == { + "Executive": 0, + "Legislative": 0, + "UNFCCC": 0, + "MCF": 0, + } + + assert cclw_corpus["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" + assert cclw_corpus["organisation_name"] == "CCLW" + assert cclw_corpus["organisation_id"] == 1 + assert ( + cclw_corpus["image_url"] + == "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png" + ) + assert "Grantham Research Institute" in cclw_corpus["text"] + assert cclw_corpus["description"] == "CCLW national policies" + assert cclw_corpus["title"] == "CCLW national policies" + # Below to be removed as part of PDCT-1759 + # Now test organisations assert "CCLW" in response_json["organisations"] cclw_org = response_json["organisations"]["CCLW"] assert len(cclw_org) == LEN_ORG_CONFIG @@ -198,6 +240,20 @@ def test_config_endpoint_cclw_stats(data_client, data_db, valid_token): response_json = response.json() + corpus_types = response_json["corpus_types"] + assert len(corpus_types) == 2 + + cclw_corpus_config = corpus_types["Laws and Policies"]["corpora"][0] + laws = cclw_corpus_config["count_by_category"]["Legislative"] + policies = cclw_corpus_config["count_by_category"]["Executive"] + unfccc = cclw_corpus_config["count_by_category"]["UNFCCC"] + assert laws == 2 + assert policies == 3 + assert unfccc == 1 + + assert cclw_corpus_config["total"] == laws + policies + unfccc + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"]["CCLW"] assert len(org_config) == LEN_ORG_CONFIG assert org_config["total"] == 6 @@ -259,6 +315,18 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only( response_json = response.json() + assert 
len(response_json["corpus_types"]) == 1 + + corpus = response_json["corpus_types"][expected_corpus_type.name]["corpora"][0] + assert corpus["total"] == 1 + assert corpus["count_by_category"] == { + "Executive": 0, + "Legislative": 1, + "MCF": 0, + "UNFCCC": 0, + } + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"] expected_org_config = { expected_organisation: { @@ -285,7 +353,7 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only( "MCF": 0, "UNFCCC": 0, }, - } + }, } assert org_config == expected_org_config @@ -323,20 +391,35 @@ def test_config_endpoint_returns_stats_for_all_orgs_if_no_allowed_corpora_in_app ) _add_family(data_db, "T.0.0.1", FamilyCategory.EXECUTIVE, cclw_corpus.import_id) - _add_family(data_db, "T.0.0.2", FamilyCategory.LEGISLATIVE, unfccc_corpus.import_id) + _add_family(data_db, "T.0.0.2", FamilyCategory.EXECUTIVE, unfccc_corpus.import_id) data_db.flush() response = data_client.get(url_under_test, headers={"app-token": app_token}) response_json = response.json() + + assert len(response_json["corpus_types"]) == 2 + corpus_types = response_json["corpus_types"] + + for corpus_type in list(corpus_types.values()): + for corpus in corpus_type["corpora"]: + assert corpus["total"] == 1 + assert corpus["count_by_category"] == { + "Executive": 1, + "Legislative": 0, + "MCF": 0, + "UNFCCC": 0, + } + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"] assert list(org_config.keys()) == ["CCLW", "UNFCCC"] assert org_config["CCLW"]["total"] == 1 assert org_config["UNFCCC"]["total"] == 1 assert org_config["UNFCCC"]["count_by_category"] == { - "Executive": 0, - "Legislative": 1, + "Executive": 1, + "Legislative": 0, "MCF": 0, "UNFCCC": 0, }